# Provenance: contents of parsing/lexer.sh as deleted by commit
# 8a9a6865954ade85d4a55f955829ae08941c31b8 ("Remove old demo parsing code",
# P. J. McDermott, Sun, 21 Feb 2016 21:13:33 -0500).
#
# A character-at-a-time lexer for shell input read from standard input.
# Names used here but defined outside this file: dbg, tokname, toktext, and
# the separator/character variables LF, HT, RS, US, SOH, STX and ETX
# (presumably the ASCII control characters of the same names -- confirm).

# Lexer state shared by all functions below.
fname=			# Input name for diagnostics; '-' is reported as "stdin"
lineno=			# Current line number
ln_off=			# Line offset reported back by a scanner run in a subshell
start=			# Name of the parser entry function to invoke
c=			# Lookahead character ('' at end of input)
wordexp=		# Text of the word expansion most recently scanned
here_queue=		# Pending here-documents; each entry is
			# <strip_tabs>US<delimiter word>US<escaped>RS
here_awaiting_end=	# true after "<<"/"<<-" until the delimiter word is read
here_awaiting_word=	# true when the next token is a here-document body
tok=			# Current token: TYPE or TYPE${US}text
tokens=			# Accepted tokens, each terminated by RS

#
# Error handling (used by scanning and interface functions)
#

# error FMT [ARG...]
# Print a "<file>:<line>: " prefixed message to stderr and exit with status 1.
# FMT is used as part of the printf format string, so it may contain
# conversions that consume ARG...
error()
{
	local fmt="${1}"
	shift 1

	case "${fname}" in
		'-')
			printf "stdin:%d: ${fmt}\n" ${lineno} "${@}" >&2
			;;
		*)
			printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}" >&2
			;;
	esac

	# The parser and lexer run in a subshell, so this just returns up to the
	# caller like an exception.
	exit 1
}

# synexp [TOKEN_TYPE]
# Report the current token as unexpected.  When TOKEN_TYPE is non-empty, the
# message also names it as the expected token.  Does not return.
synexp()
{
	local t="${1}"
	shift 1

	if [ "x${t}" = 'x' ]; then
		synerr '%s unexpected' "$(tokname "${tok}")"
	else
		synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
			"$(tokname "${t}")"
	fi
}

# synerr FMT [ARG...]
# Like error(), with "Syntax error: " prepended to the message.
synerr()
{
	local fmt="${1}"
	shift 1

	error "Syntax error: ${fmt}" "${@}"
}

#
# Input reading
#

# Read one byte from standard input into ${c}.  ${c} becomes empty at end of
# input.
lgetc()
{
	# Command substitution strips trailing newlines, so a sentinel "." is
	# appended and removed again to keep a newline character intact.
	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
	c="${c%.}"
}

#
# Token recognition
#

# Recognize the next token starting at ${c} and store it in ${tok}.  On
# return, ${c} holds the first character after the token.
next()
{
	if ${here_awaiting_word}; then
		next_here
		return
	fi
	while :; do
		dbg "parsing char '$c' at lineno $lineno"
		case "${c}" in
			'')
				lgetc
				tok=T_EOF
				return
				;;
			"${LF}")
				if ${here_awaiting_end}; then
					# "<<" must be followed by a delimiter
					# word before the end of the line.
					synexp ''
				else
					# A complete here-document entry is
					# queued, so its body starts on the
					# next line.
					case "${here_queue}" in *"${RS}"*)
						here_awaiting_end=false
						here_awaiting_word=true
						;;
					esac
				fi
				lgetc
				lineno=$((${lineno} + 1))
				tok=T_NEWLINE
				return
				;;
			' '|"${HT}")
				lgetc
				continue
				;;
			\\)
				lgetc
				# Backslash-newline is a line continuation.
				case "${c}" in "${LF}")
					lineno=$((${lineno} + 1))
					lgetc
					continue
					;;
				esac
				next_word \\
				return
				;;
			'#')
				# Skip a comment up to (not including) the
				# end of the line.
				lgetc
				while :; do
					case "${c}" in "${LF}"|'')
						break
						;;
					esac
					lgetc
				done
				continue
				;;
			'&')
				lgetc
				case "${c}" in '&')
					lgetc
					tok=T_AND_IF
					return
					;;
				esac
				tok=T_AND
				return
				;;
			'|')
				lgetc
				case "${c}" in '|')
					lgetc
					tok=T_OR_IF
					return
					;;
				esac
				tok=T_PIPE
				return
				;;
			';')
				lgetc
				case "${c}" in ';')
					lgetc
					tok=T_DSEMI
					return
					;;
				esac
				dbg T_SEMI
				tok=T_SEMI
				return
				;;
			'(')
				lgetc
				tok=T_LPAREN
				return
				;;
			')')
				lgetc
				tok=T_RPAREN
				return
				;;
			'<'|'>')
				next_io
				return
				;;
			*)
				next_word ''
				return
				;;
		esac
		lgetc
	done
}

# Read the body of the first queued here-document and store it, including
# the terminating delimiter line, in ${tok} as a T_WORD.  On return ${c} is
# the newline that ends the delimiter line.
# NOTE(review): lineno is not advanced for the newlines consumed here.
next_here()
{
	local here=
	local here_strip_tabs=
	local here_end=
	local here_escaped=
	local line=
	local word=
	local res=
	local wordexp=

	# Dequeue the here-document.
	here="${here_queue%%${RS}*}"
	here_strip_tabs="${here%%${US}*}"
	here_end="${here%${US}*}"
	# Remove quoting characters from the delimiter word.
	here_end="$(printf '%s' "${here_end#*${US}}" | \
		sed 's/\\//g; s/"//g; s/'\''//g;')" # Stupid Vim: ')"
	here_escaped="${here##*${US}}"
	here_queue="${here_queue#*${RS}}"
	here_awaiting_word=false

	line=''
	word=''
	while :; do
		case "${c}" in
			'')
				# Bash throws a warning when EOF occurs in a
				# here document.  mksh throws an error.  dash,
				# BusyBox ash, ksh93, and zsh accept EOF as a
				# delimiter.  We aim for the lowest common
				# denominator, so throw an error like mksh does.
				synerr 'Here-document "%s" unclosed' \
					"${here_end}"
				;;
			"${LF}")
				word="${word}${line}"
				case "${line}" in "${here_end}")
					tok="T_WORD${US}${word}"
					return
					;;
				esac
				word="${word}${c}"
				line=''
				;;
			"${HT}")
				if ${here_strip_tabs}; then
					# "<<-": drop tabs at the start of a
					# line only.
					case "${line}" in
						'')
							;;
						*)
							line="${line}${c}"
							;;
					esac
				else
					line="${line}${c}"
				fi
				;;
			'$')
				if ! ${here_escaped}; then
					lgetc
					if ! res="$(scan_wordexp)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					wordexp="${res%%${RS}*}"
					lineno=$((${lineno} + ${ln_off}))
					line="${line}${wordexp}"
					# ${c} already holds the lookahead
					# character.
					continue
				else
					line="${line}${c}"
				fi
				;;
			*)
				line="${line}${c}"
				;;
		esac
		lgetc
	done
}

# Recognize a redirection operator starting at ${c} ('<' or '>') and store
# its type in ${tok}.  "<<" and "<<-" also start a new here_queue entry with
# the strip-tabs flag ("false" and "true" respectively).
next_io()
{
	# Each operator leaves with "return": "break" outside of a loop is
	# unspecified by POSIX and does not leave the function in bash.
	case "${c}" in
		'<')
			lgetc
			case "${c}" in
				'<')
					lgetc
					case "${c}" in '-')
						lgetc
						tok=T_DLESSDASH
						here_queue="${here_queue}true"
						here_awaiting_end=true
						here_awaiting_word=false
						return 0
						;;
					esac
					tok=T_DLESS
					here_queue="${here_queue}false"
					here_awaiting_end=true
					here_awaiting_word=false
					return 0
					;;
				'&')
					lgetc
					tok=T_LESSAND
					return 0
					;;
				'>')
					lgetc
					tok=T_LESSGREAT
					return 0
					;;
			esac
			tok=T_LESS
			return 0
			;;
		'>')
			lgetc
			case "${c}" in
				'>')
					lgetc
					tok=T_DGREAT
					return 0
					;;
				'&')
					lgetc
					tok=T_GREATAND
					return 0
					;;
				'|')
					lgetc
					tok=T_CLOBBER
					return 0
					;;
			esac
			tok=T_GREAT
			return 0
			;;
	esac
}

# next_word PREFIX
# Scan a word starting at ${c} and store PREFIX followed by the word in
# ${tok} as a T_WORD.  If a here-document operator is waiting for its
# delimiter, the word completes that here_queue entry.
next_word()
{
	local prev_c="${1}"
	shift 1
	local res=
	local word=

	if ! res="$(scan_word false)"; then
		exit 1
	fi
	# The result is <line count>RS<lookahead character>RS<word>.
	ln_off=${res%%${RS}*}
	res="${res#*${RS}}"
	c="${res%%${RS}*}"
	res="${res#*${RS}}"
	word="${prev_c}${res%%${RS}*}"

	# We must advance lineno because scan_word() was run in a subshell.
	lineno=$((${lineno} + ${ln_off}))
	tok="T_WORD${US}${word}"

	if ${here_awaiting_end}; then
		here_queue="${here_queue}${US}${word}"
		# Record whether any part of the delimiter was quoted; a
		# quoted delimiter disables expansion in next_here().
		case "${word}" in
			*\\*|*'"'*|*"'"*)
				here_queue="${here_queue}${US}true"
				;;
			*)
				here_queue="${here_queue}${US}false"
				;;
		esac
		here_queue="${here_queue}${RS}"
		here_awaiting_end=false
	fi
}

#
# Token scanning
#

# scan_word IN_PARAM
# Scan a word starting at ${c}.  IN_PARAM is "true" when the word is the
# argument of a parameter expansion modifier, in which case only an unquoted
# "}" (or end of input) ends the word.  Prints
# <line count>RS<lookahead character>RS<word> on stdout; it is meant to be
# run in a command substitution.
scan_word()
{
	local in_param="${1}"
	local res=
	local word=
	local quoted=
	local lines=
	local wordexp=

	word=''
	quoted=false
	lines=0
	while :; do
		dbg "parsing word char '$c' at lineno $lineno"
		case "${c}" in
			'')
				break
				;;
			"${LF}")
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				lineno=$((${lineno} + 1))
				lines=$((${lines} + 1))
				word="${word}${c}"
				;;
			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			'$')
				case "${here_queue}" in *"${RS}"*)
					if ${here_awaiting_end}; then
						synerr '%s %s %s %s' \
							'Word expansions' \
							'not supported in' \
							'here-document' \
							'delimiters'
					fi
				esac
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				res="${res#*${RS}}"
				wordexp="${res%%${RS}*}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				word="${word}${wordexp}"
				# scan_wordexp() leaves behind an unused
				# character, so we should skip the lgetc() call
				# below.
				continue
				;;
			'`')
				synerr 'Backquoted (old-style) %s' \
					'command substitution not supported'
				break
				;;
			\\)
				word="${word}${c}"
				lgetc
				case "${c}" in '')
					# Bash, ksh93, mksh, and zsh ignore a
					# backslash at the end of a file, but
					# dash and BusyBox ash include it in the
					# word.  To help with script
					# portability, we'll throw an error
					# (which is a reasonable thing to do
					# anyway).
					synerr 'Unexpected end of file %s' \
						'after "\"'
					;;
				esac
				word="${word}${c}"
				;;
			\')
				# Copy everything up to and including the
				# closing single quote.
				word="${word}${c}"
				while :; do
					lgetc
					word="${word}${c}"
					case "${c}" in
						'')
							synerr '%s %s' \
								'Unterminated' \
								'quoted string'
							;;
						\')
							break
							;;
					esac
				done
				;;
			'"')
				word="${word}${c}"
				if ${quoted}; then
					quoted=false
				else
					quoted=true
				fi
				;;
			'}')
				if ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			*)
				word="${word}${c}"
				;;
		esac
		lgetc
	done

	if ${quoted}; then
		synerr 'Unterminated quoted string'
	fi

	printf "%d${RS}%c${RS}%s" ${lines} "${c}" "${word}"
}

# Scan a word expansion; ${c} is the character following the "$".  Prints
# <ln_off>RS<lookahead character>RS<expansion text> on stdout; it is meant to
# be run in a command substitution.  If ${c} starts no known expansion, the
# expansion text is empty and ${c} is left unchanged.
scan_wordexp()
{
	local res=
	local toks=
	local param=

	wordexp=''
	ln_off=0
	case "${c}" in
		'{')
			# Parameter expansion brace
			scan_wordexp_param_brace
			;;
		'(')
			# Arithmetic expansion or command substitution
			lgetc
			case "${c}" in
				'(')
					# Arithmetic expansion
					scan_wordexp_arith
					;;
				*)
					# Command substitution
					if ! res="$(run_sublexer "sub${fname}" \
							${lineno} "${start}" \
							"${c}")"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					# NOTE(review): run_sublexer() prints
					# its tokens RS-terminated, so this
					# keeps only the text before the first
					# RS -- confirm that is intended.
					toks="${res%%${RS}*}"
					# run_sublexer() reports an absolute
					# line number, not an offset.
					lineno=${ln_off}
					wordexp="\$(${SOH}C${STX}${toks}"
					wordexp="${wordexp}${ETX})"
					# ")" is recognized in run_sublexer().
					;;
			esac
			;;
		# "$" is escaped so that "$!" is not expanded as a special
		# parameter inside the pattern.
		[@*#?\$!A-Za-z0-9_-])
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			wordexp="\$${param}"
			;;
	esac

	printf "%d${RS}%c${RS}%s" ${ln_off} "${c}" "${wordexp}"
	return 0
}

# Scan a braced parameter expansion; ${c} is the "{".  Stores the text of the
# expansion, from "${" through "}", in ${wordexp} and leaves the character
# after the "}" in ${c}.
scan_wordexp_param_brace()
{
	local mod=
	local res=
	local param=
	local word=

	mod=true

	lgetc
	case "${c}" in
		'#')
			lgetc
			case "${c}" in
				# "$" is escaped so that "$!" is not expanded
				# inside the pattern.
				[@*#?\$!A-Za-z0-9_-])
					# String length expansion
					if ! res="$(scan_param)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					param="${res%%${RS}*}"
					lineno=$((${lineno} + ${ln_off}))
					# Disable modifications.
					mod=false
					;;
				*)
					# Special parameter "#"
					param='#'
					;;
			esac
			;;
		*)
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			;;
	esac
	wordexp="\${${param}"

	# If modifications are allowed
	if ${mod}; then
		# Check for modifications.
		mod=false
		case "${c}" in
			':')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '-'|'='|'?'|'+')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'-'|'='|'?'|'+')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				;;
			'%')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '%')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'#')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '#')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
		esac
	fi

	# If a modification was found
	if ${mod}; then
		# Get word.
		if ! res="$(scan_word true)"; then
			exit 1
		fi
		ln_off=${res%%${RS}*}
		res="${res#*${RS}}"
		c="${res%%${RS}*}"
		res="${res#*${RS}}"
		word="${res%%${RS}*}"
		# We must advance lineno because scan_word() was run in a
		# subshell.
		lineno=$((${lineno} + ${ln_off}))
		wordexp="${wordexp}${word}"
		dbg "param mod word: '$word'"
	fi

	# Check for right brace.
	case "${c}" in
		'}')
			wordexp="${wordexp}${c}"
			lgetc
			;;
		*)
			synerr 'Missing "}"'
			;;
	esac

	return 0
}

# Scan a parameter name starting at ${c}: a special parameter, a positional
# parameter, or a name.  Prints 0RS<lookahead character>RS<parameter> on
# stdout; it is meant to be run in a command substitution.
scan_param()
{
	local param=

	param=''
	case "${c}" in
		# "$" is escaped so that "$!" is not expanded inside the
		# pattern.
		[@*#?\$!0-])
			# Special parameter
			param="${c}"
			lgetc
			;;
		[1-9])
			# Positional parameter
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!0-9])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		[A-Za-z_])
			# Parameter name
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!A-Za-z0-9_])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		*)
			synerr 'Bad parameter name'
			;;
	esac

	printf "%d${RS}%c${RS}%s" 0 "${c}" "${param}"
	return 0
}

# Scan an arithmetic expansion; ${c} is the second "(" of "$((".  Stores the
# text of the expansion, from "$((" through "))", in ${wordexp} and leaves
# the character after the "))" in ${c}.
scan_wordexp_arith()
{
	local arith=
	local paren_lvl=
	local res=
	local sub_wordexp=

	arith=''
	paren_lvl=0
	lgetc
	while :; do
		case "${c}" in
			'')
				synerr 'end of file unexpected (%s)' \
					'expecting "))"'
				;;
			'(')
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} + 1))
				;;
			')')
				if [ ${paren_lvl} -eq 0 ]; then
					lgetc
					case "${c}" in ')')
						wordexp="\$((${arith}))"
						lgetc
						return 0
						;;
					esac
					synerr 'Arithmetic expansion: ")" %s' \
						'unexpected'
				fi
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} - 1))
				;;
			'$')
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				res="${res#*${RS}}"
				sub_wordexp="${res%%${RS}*}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				arith="${arith}${sub_wordexp}"
				# scan_wordexp() leaves behind an unused
				# character, so we must not read past it.
				continue
				;;
			*)
				arith="${arith}${c}"
				;;
		esac
		lgetc
	done
}

#
# Interface
#

# Check the current token.  If it matches, add it to the syntax array.
# accept TOKEN_TYPE
# Returns 0 and advances to the next token if the current token matches
# TOKEN_TYPE; returns 1 and leaves the input position unchanged otherwise.
# Reserved words, T_NAME, T_FNAME and T_CMDNAME are matched against literal
# T_WORD tokens, and ${tok} is rewritten to the more specific type where
# noted below.
accept()
{
	local t="${1}"
	local rw=

	dbg "looking for $t, current tok ${tok%%${US}*}"
	case "${t}" in
		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|\
		T_DO|T_DONE|T_CASE|T_ESAC|T_WHILE|T_UNTIL|\
		T_FOR|T_LBRACE|T_RBRACE|T_BANG|T_IN)
			dbg "looking for reserved word $t, have '$tok'"
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				# Reserved words are recognized as literal
				# T_WORDs.
				if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
					return 1
				fi
				# T_WORD data unit must match reserved word
				# exactly.
				if ! [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${t}")" ]; then
					return 1
				fi
				# If the token matches the reserved word,
				# replace it with the reserved word token.
				tok="${t}"
			fi
			;;
		T_NAME)
			# Names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name.  The word text (not the token type)
			# must be non-empty, must not start with a digit and
			# must contain only letters, digits and underscores.
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			tok="T_NAME${US}${tok#T_WORD${US}}"
			;;
		T_FNAME)
			# Function names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name (same rule as for T_NAME).
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			# Verify that the function name doesn't match any
			# reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
					T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			tok="T_FNAME${US}${tok#T_WORD${US}}"
			;;
		T_CMDNAME)
			# The first word of a simple command is to be checked
			# for reserved words.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Verify that the word doesn't match any reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
					T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			;;
		*)
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				return 1
			fi
			;;
	esac

	dbg "accept $t"
	tokens="${tokens}${tok}${RS}"
	next
	return 0
}

# expect TOKEN_TYPE
# Like accept(), but a mismatch is reported as a syntax error, which exits.
expect()
{
	local t="${1}"

	if accept "${t}"; then
		return 0
	else
		synexp "${t}"
	fi
}

# Called by the lexer, not the parser
#
# run_sublexer FNAME LINENO START CHAR
# Reset the lexer state, continue scanning from CHAR as the lookahead
# character, and run the parser entry function START.  The token left over
# when START returns must be T_RPAREN.  Prints
# <line number>RS<lookahead character>RS<tokens> on stdout; it is meant to be
# run in a command substitution.
run_sublexer()
{
	local fn="${1}"
	local ln="${2}"
	local st="${3}"
	local ch="${4}"
	shift 4

	# Initialize global variables.
	fname="${fn}"
	lineno=${ln}
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	c="${ch}"
	next

	#dbg=true
	# If this returns (does not exit), there are no errors.
	${start}
	case "${tok%${US}*}" in
		T_RPAREN)
			;;
		*)
			synerr 'Missing ")"'
			;;
	esac

	printf "%d${RS}%c${RS}%s" ${lineno} "${c}" "${tokens}"
	return 0
}

# run_lexer FNAME START
# Reset the lexer state, read standard input from line 1, and run the parser
# entry function START.  The input must end with T_EOF.  Prints the accepted
# tokens, each terminated by RS, on stdout.
run_lexer()
{
	local fn="${1}"
	local st="${2}"
	shift 2

	# Initialize global variables.
	fname="${fn}"
	lineno=1
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	# Read the first character and recognize the first token.
	lgetc
	next

	# If this returns (does not exit), there are no errors.
	${start}
	if ! accept T_EOF; then
		synexp ''
	fi

	# Return the tokens.
	printf '%s' "${tokens}"

	return 0
}