# Lexer/parser demo for the POSIX shell command language (work in progress).
#
# Origin: patch f6e55a026abf33867141896b1e227e791942c2a3
#   From:    P. J. McDermott
#   Date:    Fri, 19 Feb 2016 04:01:32 -0500
#   Subject: [WIP] Add lexer/parser demo
#
# The patch adds three files: parsing/lexer.sh, parsing/parse.sh and
# parsing/tokens.sh.  parse.sh defines its character constants and then
# sources tokens.sh and lexer.sh, so the three files are laid out below in
# exactly that load order, with each sourced file inlined where parse.sh
# would have sourced it.
#
# Data model:
#   * A token is either a bare type name (e.g. "T_AND_IF") or, for words,
#     "T_WORD<US><raw text>", where <US> is the ASCII unit separator.
#   * The accepted token stream is kept in ${tokens}; every token is followed
#     by <RS>, the ASCII record separator.
#   * Scanner functions that must run in a command substitution return three
#     <RS>-terminated fields: "<line offset><RS><lookahead char><RS><text><RS>".

# ---------------------------------------------------------------------------
# parsing/parse.sh (part 1): character constants
# ---------------------------------------------------------------------------

# The trailing '.' keeps command substitution from stripping the character
# when it is a newline; it is removed again right after the assignment.
HT="$(printf '\t.')"; HT="${HT%.}"
LF="$(printf '\n.')"; LF="${LF%.}"
RS="$(printf '\036.')"; RS="${RS%.}"
US="$(printf '\037.')"; US="${US%.}"

# ---------------------------------------------------------------------------
# parsing/tokens.sh
# ---------------------------------------------------------------------------

# Print the human-readable name of a token (no trailing newline).
# Arguments: $1 - token, optionally followed by "<US><value>"
# Outputs:   the name on stdout; 'unknown token' for unrecognised types
tokname()
{
  local t="${1}"
  shift 1
  local n=

  # Use the longest-suffix form so that only the type before the first <US>
  # is examined, matching what accept() does.
  case "${t%%${US}*}" in
  # Operators
  T_EOF)             n='end of file';;
  T_NEWLINE)         n='newline';;
  T_AND)             n='&';;
  T_SEMI)            n=';';;
  T_AND_IF)          n='&&';;
  T_OR_IF)           n='||';;
  T_DSEMI)           n=';;';;
  T_LESS)            n='<';;
  T_GREAT)           n='>';;
  T_DLESS)           n='<<';;
  T_DGREAT)          n='>>';;
  T_LESSAND)         n='<&';;
  T_GREATAND)        n='>&';;
  T_LESSGREAT)       n='<>';;
  T_DLESSDASH)       n='<<-';;
  T_CLOBBER)         n='>|';;
  T_PIPE)            n='|';;
  T_LPAREN)          n='(';;
  T_RPAREN)          n=')';;
  # Reserved words
  T_IF)              n='if';;
  T_THEN)            n='then';;
  T_ELSE)            n='else';;
  T_ELIF)            n='elif';;
  T_FI)              n='fi';;
  T_DO)              n='do';;
  T_DONE)            n='done';;
  T_CASE)            n='case';;
  T_ESAC)            n='esac';;
  T_WHILE)           n='while';;
  T_UNTIL)           n='until';;
  T_FOR)             n='for';;
  T_LBRACE)          n='{';;
  T_RBRACE)          n='}';;
  T_BANG)            n='!';;
  T_IN)              n='in';;
  # Special symbols
  T_NAME)            n='name';;
  T_IO_NUMBER)       n='I/O number';;
  T_WORD)            n='word';;
  T_ASSIGNMENT_WORD) n='assignment word';;
  # Unknown
  *)                 n='unknown token';;
  esac

  printf '%s' "${n}"
}

# ---------------------------------------------------------------------------
# parsing/lexer.sh
# ---------------------------------------------------------------------------

# Lexer state (globals).
fname=    # input name used in diagnostics ('-' means stdin)
lineno=   # current line number
c=        # lookahead character ('' at end of input)
tok=      # current token
tokens=   # accepted tokens, each followed by ${RS}

# Read one byte from stdin into ${c}; ${c} becomes '' at end of input.
# The appended '.' protects a newline from being stripped by the command
# substitution.  Because dd reads from the inherited stdin, characters
# consumed inside a command substitution are also gone for the caller.
pgetc()
{
  c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
  c="${c%.}"
}

# Scan the next token into ${tok}, starting at the lookahead ${c}.
# Skips blanks, comments and backslash-newline continuations.
next()
{
  while :; do
    printf '%s\n' "parsing char '$c'" >&2
    case "${c}" in
    '')
      pgetc
      tok=T_EOF
      return
      ;;
    "${LF}")
      pgetc
      lineno=$((lineno + 1))
      tok=T_NEWLINE
      return
      ;;
    ' '|"${HT}")
      pgetc
      continue
      ;;
    \\)
      pgetc
      case "${c}" in
      "${LF}")
        # Line continuation: drop both characters.
        lineno=$((lineno + 1))
        pgetc
        continue
        ;;
      esac
      # The backslash is already consumed; tell the word scanner
      # that ${c} is an escaped character so the escape is kept.
      next_word true
      return
      ;;
    '#')
      # Comment: skip up to (not including) the newline or EOF.
      pgetc
      while :; do
        case "${c}" in
        "${LF}"|'')
          break
          ;;
        esac
        pgetc
      done
      continue
      ;;
    '&')
      pgetc
      case "${c}" in
      '&')
        pgetc
        tok=T_AND_IF
        return
        ;;
      esac
      tok=T_AND
      return
      ;;
    '|')
      pgetc
      case "${c}" in
      '|')
        pgetc
        tok=T_OR_IF
        return
        ;;
      esac
      tok=T_PIPE
      return
      ;;
    ';')
      pgetc
      case "${c}" in
      ';')
        pgetc
        tok=T_DSEMI
        return
        ;;
      esac
      tok=T_SEMI
      return
      ;;
    '(')
      pgetc
      tok=T_LPAREN
      return
      ;;
    ')')
      pgetc
      tok=T_RPAREN
      return
      ;;
    '<'|'>')
      next_io
      return
      ;;
    *)
      next_word
      return
      ;;
    esac
  done
}

# Scan a redirection operator starting at ${c} ('<' or '>') into ${tok}.
# Each multi-character operator returns as soon as it is recognised, so it
# is not overwritten by the shorter operator it starts with.
next_io()
{
  case "${c}" in
  '<')
    pgetc
    case "${c}" in
    '<')
      pgetc
      case "${c}" in
      '-')
        pgetc
        tok=T_DLESSDASH
        return
        ;;
      esac
      tok=T_DLESS
      return
      ;;
    '&')
      pgetc
      tok=T_LESSAND
      return
      ;;
    '>')
      pgetc
      tok=T_LESSGREAT
      return
      ;;
    esac
    tok=T_LESS
    ;;
  '>')
    pgetc
    case "${c}" in
    '>')
      pgetc
      tok=T_DGREAT
      return
      ;;
    '&')
      pgetc
      tok=T_GREATAND
      return
      ;;
    '|')
      pgetc
      tok=T_CLOBBER
      return
      ;;
    esac
    tok=T_GREAT
    ;;
  esac
}

# Scan a word and store it in ${tok} as "T_WORD<US><raw text>".
# Arguments: $1 - optional; 'true' when the caller already consumed a
#                 backslash and ${c} is the escaped character
#                 (default 'false')
next_word()
{
  local res=
  local lineno_offset=
  local word=

  res="$(scan_word false "${1:-false}")"
  lineno_offset=${res%%${RS}*}
  res="${res#*${RS}}"
  c="${res%%${RS}*}"
  res="${res#*${RS}}"
  word="${res%%${RS}*}"

  # We must advance lineno because scan_word() was run in a subshell.
  lineno=$((lineno + lineno_offset))
  tok="T_WORD${US}${word}"
}

# Scan the raw text of one word, starting at ${c}.  Quotes and escapes are
# kept verbatim in the result.  Meant to be run in a command substitution.
# Arguments: $1 - 'true' when scanning the word of a ${param<op>word}
#                 expansion (ends at an unquoted '}'), else 'false'
#            $2 - optional; 'true' when ${c} is a character whose escaping
#                 backslash was already consumed (default 'false')
# Outputs:   "<lines consumed><RS><lookahead><RS><word><RS>" on stdout
scan_word()
{
  local in_param="${1}"
  local escaped="${2:-false}"
  local res=
  local word=
  local quoted=
  local lines=
  local lineno_offset=
  local wordexp=

  word=''
  quoted=false
  lines=0

  if ${escaped}; then
    # Re-attach the backslash the caller consumed.
    word="\\${c}"
    case "${c}" in
    '')
      ;;
    *)
      pgetc
      ;;
    esac
  fi

  while :; do
    printf '%s\n' "parsing word char '$c'" >&2
    case "${c}" in
    '')
      if ${quoted}; then
        synerr 'Unterminated quoted string'
      fi
      break
      ;;
    "${LF}")
      if ! ${in_param} && ! ${quoted}; then
        break
      fi
      lineno=$((lineno + 1))
      lines=$((lines + 1))
      word="${word}${c}"
      ;;
    ' '|"${HT}")
      if ! ${in_param} && ! ${quoted}; then
        break
      fi
      word="${word}${c}"
      ;;
    '&'|'|'|';'|'('|')'|'<'|'>')
      # An unquoted operator character delimits the word.
      if ! ${in_param} && ! ${quoted}; then
        break
      fi
      word="${word}${c}"
      ;;
    \\)
      pgetc
      case "${c}" in
      '')
        word="${word}\\"
        break
        ;;
      "${LF}")
        # Line continuation: drop both characters.
        lineno=$((lineno + 1))
        lines=$((lines + 1))
        ;;
      *)
        word="${word}\\${c}"
        ;;
      esac
      ;;
    '$')
      pgetc
      res=$(scan_wordexp)
      lineno_offset=${res%%${RS}*}
      res="${res#*${RS}}"
      c="${res%%${RS}*}"
      res="${res#*${RS}}"
      wordexp="${res%%${RS}*}"
      # We must advance lineno because scan_wordexp() was run in
      # a subshell, and report those lines to our own caller too.
      lineno=$((lineno + lineno_offset))
      lines=$((lines + lineno_offset))
      word="${word}${wordexp}"
      # scan_wordexp() leaves behind an unused character, so we
      # should skip the pgetc() call below.
      continue
      ;;
    \')
      word="${word}${c}"
      if ! ${quoted}; then
        # Copy verbatim up to and including the closing quote.
        while :; do
          pgetc
          case "${c}" in
          '')
            synerr 'Unterminated quoted string'
            break 2
            ;;
          "${LF}")
            lineno=$((lineno + 1))
            lines=$((lines + 1))
            ;;
          esac
          word="${word}${c}"
          case "${c}" in
          \')
            break
            ;;
          esac
        done
      fi
      ;;
    '"')
      word="${word}${c}"
      if ${quoted}; then
        quoted=false
      else
        quoted=true
      fi
      ;;
    '}')
      if ${in_param} && ! ${quoted}; then
        break
      fi
      word="${word}${c}"
      ;;
    *)
      word="${word}${c}"
      ;;
    esac
    pgetc
  done

  # The trailing ${RS} keeps command substitution from stripping a newline
  # at the end of the word; callers only read up to the third ${RS}.
  printf "%d${RS}%s${RS}%s${RS}" "${lines}" "${c}" "${word}"
}

# Scan a word expansion; ${c} is the character following the '$'.
# Handles ${...} parameter expansions and plain $name / $N / special
# parameters.  '$(' is not implemented yet.  Meant to be run in a command
# substitution.
# Outputs:   "<lines consumed><RS><lookahead><RS><expansion text><RS>"
scan_wordexp()
{
  local wordexp=
  local lineno_offset=
  local mod=
  local res=
  local word=
  local param=

  # A '$' that does not start a recognised expansion stays a literal '$'.
  wordexp='$'
  lineno_offset=0
  case "${c}" in
  '{')
    # Parameter expansion brace
    pgetc
    case "${c}" in
    '#')
      pgetc
      case "${c}" in
      [@*#?\$!A-Za-z0-9_-])
        # String length expansion
        next_param
        param="#${param}"
        ;;
      *)
        # Special parameter "#"
        param='#'
        ;;
      esac
      ;;
    *)
      next_param
      ;;
    esac
    wordexp="\${${param}"
    # Check for modifications
    mod=false
    case "${c}" in
    ':')
      mod=true
      wordexp="${wordexp}${c}"
      pgetc
      case "${c}" in
      '-'|'='|'?'|'+')
        wordexp="${wordexp}${c}"
        pgetc
        ;;
      esac
      ;;
    '-'|'='|'?'|'+')
      mod=true
      wordexp="${wordexp}${c}"
      pgetc
      ;;
    '%')
      mod=true
      wordexp="${wordexp}${c}"
      pgetc
      case "${c}" in
      '%')
        wordexp="${wordexp}${c}"
        pgetc
        ;;
      esac
      ;;
    '#')
      mod=true
      wordexp="${wordexp}${c}"
      pgetc
      case "${c}" in
      '#')
        wordexp="${wordexp}${c}"
        pgetc
        ;;
      esac
      ;;
    esac
    if ${mod}; then
      # Get word.
      res="$(scan_word true)"
      lineno_offset=${res%%${RS}*}
      res="${res#*${RS}}"
      c="${res%%${RS}*}"
      res="${res#*${RS}}"
      word="${res%%${RS}*}"
      # We must advance lineno because scan_word() was run in a
      # subshell.
      lineno=$((lineno + lineno_offset))
      wordexp="${wordexp}${word}"
      printf '%s\n' "param mod word: '$word'" >&2
    fi
    # Check for right brace.
    case "${c}" in
    '}')
      wordexp="${wordexp}${c}"
      pgetc
      ;;
    *)
      synerr 'Missing "}"'
      ;;
    esac
    ;;
  '(')
    # TODO: command substitution / arithmetic expansion.
    ;;
  [@*#?\$!A-Za-z0-9_-])
    next_param
    wordexp="\$${param}"
    ;;
  esac

  printf "%d${RS}%s${RS}%s${RS}" "${lineno_offset}" "${c}" "${wordexp}"
}

# Scan a parameter name starting at ${c} into the variable ${param}
# (the caller's local, or a global if the caller declares none).
# The '$' in the bracket pattern is escaped so that "$!" is not expanded as
# the special parameter before the pattern is matched.
next_param()
{
  param=''
  case "${c}" in
  [@*#?\$!0-])
    # Special parameter
    param="${c}"
    pgetc
    ;;
  [1-9])
    # Positional parameter
    param="${param}${c}"
    pgetc
    while :; do
      case "${c}" in
      [!0-9])
        break
        ;;
      esac
      param="${param}${c}"
      pgetc
    done
    ;;
  [A-Za-z_])
    # Parameter name
    param="${param}${c}"
    pgetc
    while :; do
      case "${c}" in
      [!A-Za-z0-9_])
        break
        ;;
      esac
      param="${param}${c}"
      pgetc
    done
    ;;
  esac
}

# Check the current token.  If its type matches, append it to ${tokens} and
# advance to the next token.
# Arguments: $1 - expected token type
# Returns:   0 if the token was accepted, 1 otherwise
accept()
{
  local t="${1}"

  if [ "x${tok%%${US}*}" = "x${t}" ]; then
    printf '%s\n' "accept $t" >&2
    tokens="${tokens}${tok}${RS}"
    next
    return 0
  fi
  return 1
}

# Print a diagnostic prefixed with the input name and line number to stderr.
# Arguments: $1 - printf format (trusted; it is spliced into the format
#                 string), remaining arguments are its operands
error()
{
  local fmt="${1}"
  shift 1

  case "${fname}" in
  '-')
    printf "stdin:%d: ${fmt}\n" "${lineno}" "${@}" >&2
    ;;
  *)
    printf "%s:%d: ${fmt}\n" "${fname}" "${lineno}" "${@}" >&2
    ;;
  esac
}

# Reset the lexer state and scan the first token from stdin.
# Arguments: $1 - input name for diagnostics ('-' for stdin)
init_lexer()
{
  local fn="${1}"
  shift 1

  fname="${fn}"
  lineno=1
  tokens=''
  pgetc
  next
}

# Print the accepted token stream (each token followed by ${RS}).
get_tokens()
{
  printf '%s' "${tokens}"
  return 0
}

# Report that the current token is unexpected.
# Arguments: $1 - expected token, or '' when there is no single expectation
synexp()
{
  local t="${1}"
  shift 1

  if [ "x${t}" = 'x' ]; then
    synerr '%s unexpected' "$(tokname "${tok}")"
  else
    synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
      "$(tokname "${t}")"
  fi
}

# Report a syntax error through error().
# Arguments: $1 - printf format, remaining arguments are its operands
synerr()
{
  local fmt="${1}"
  shift 1

  error "Syntax error: ${fmt}" "${@}"
}

# ---------------------------------------------------------------------------
# parsing/parse.sh (part 2): recursive-descent parser
#
# Each function corresponds to a grammar rule and returns 0 when the rule
# matched (consuming tokens through accept()) and 1 otherwise.
# ---------------------------------------------------------------------------

# complete_command : list separator
# Note: the status of separator is deliberately ignored.
complete_command()
{
  if list; then
    separator
    return 0
  fi
  return 1
}

# list : and_or (separator_op and_or)*
# NOTE(review): a separator_op that is not followed by an and_or (e.g. a
# trailing ';' before a newline) has already been consumed when this
# returns 1.
list()
{
  if and_or; then
    while separator_op; do
      if ! and_or; then
        return 1
      fi
    done
    return 0
  fi
  return 1
}

# and_or : pipeline (('&&' | '||') linebreak pipeline)*
and_or()
{
  if pipeline; then
    while accept T_AND_IF || accept T_OR_IF; do
      if ! linebreak || ! pipeline; then
        return 1
      fi
    done
    return 0
  fi
  return 1
}

# pipeline : ['!'] pipe_sequence
# The leading '!' is optional, so the status of accept is ignored.
pipeline()
{
  accept T_BANG
  if pipe_sequence; then
    return 0
  fi
  return 1
}

# pipe_sequence : command ('|' linebreak command)*
pipe_sequence()
{
  if command; then
    while accept T_PIPE; do
      if ! linebreak || ! command; then
        return 1
      fi
    done
    return 0
  fi
  return 1
}

# command : (currently) a single word
# Note: this function shadows the shell's `command` utility for the rest of
# the script.
command()
{
  # XXX: Unfinished
  accept T_WORD
}

# newline_list : NEWLINE+
newline_list()
{
  if accept T_NEWLINE; then
    while accept T_NEWLINE; do
      :
    done
    return 0
  fi
  return 1
}

# linebreak : newline_list | /* empty */   (always succeeds)
linebreak()
{
  newline_list
  return 0
}

# separator_op : '&' | ';'
separator_op()
{
  if accept T_AND || accept T_SEMI; then
    return 0
  fi
  return 1
}

# separator : separator_op linebreak | newline_list
separator()
{
  if separator_op && linebreak; then
    return 0
  elif newline_list; then
    return 0
  fi
  return 1
}

# Parse the script on stdin and print the accepted token stream.
# Arguments: $1 - input name for diagnostics ('-' for stdin)
# Returns:   0 (error detection is still a TODO)
parse()
{
  local fn="${1}"
  shift 1

  init_lexer "${fn}"
  while complete_command; do :; done
  if :; then  # TODO: Test for EOF or errors
    get_tokens
    return 0
  fi
  return 1
}

# ---------------------------------------------------------------------------
# parsing/parse.sh (part 3): demo driver
# ---------------------------------------------------------------------------

if tokens="$(printf '%s\n' '"foo bar" && $baz || qux' '${quux%uux } quuux' | \
    parse -)"; then
  IFS="${RS}"
  # ${tokens} is split on purpose; disable globbing so that words containing
  # pattern characters are not expanded against the file system.
  set -f
  for t in ${tokens}; do
    printf 'Token: %s\n' "$(tokname "${t}")"
    case "${t%%${US}*}" in
    T_WORD)
      printf ' "%s"\n' "${t#T_WORD${US}}"
      ;;
    esac
  done
  set +f
  unset IFS
else
  echo FAIL
fi