diff options
Diffstat (limited to 'eshtrans/frontend/lexer.esh')
-rw-r--r-- | eshtrans/frontend/lexer.esh | 990 |
1 files changed, 990 insertions, 0 deletions
# Eggshell lexer
#
# Copyright (C) 2016  Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler.  If not, see
# <http://www.gnu.org/licenses/>.

# Overview
#
# The lexer reads its input one byte at a time from standard input (see
# lgetc()) and keeps a single character of look-ahead in ${c}.  The
# current token is kept in ${tok}; tokens that carry text are stored as
# "TYPE${US}text".  Accepted tokens are appended to ${tokens}, each
# followed by ${RS}.
#
# Scanning functions that are run in command substitutions (subshells)
# report their results on standard output as
# "line-offset${RS}look-ahead-char${RS}text", because variable
# assignments made in a subshell are not visible to the caller.
#
# The following names are used here but are not defined in this file and
# must be provided by whatever includes it: the separator characters
# LF, HT, RS, US, SOH, STX, and ETX; the functions tokname() and
# toktext(); and the parser entry point named by ${start}.

dbg=false

fname=                  # Name of the input ("-" is reported as "stdin")
lineno=                 # Current line number
ln_off=                 # Line offset reported by the last subshell scan
start=                  # Name of the parser function to run
c=                      # Look-ahead character ('' at end of file)
wordexp=                # Text of the word expansion being scanned
here_queue=             # Pending here-documents (see next_io/next_word)
here_awaiting_end=      # true while a here-document delimiter is expected
here_awaiting_word=     # true when a here-document body must be read next
tok=                    # Current token
tokens=                 # Accepted tokens, each terminated by ${RS}

# Print each argument as a "DEBUG: ..." line on standard error when
# ${dbg} is "true".
dbg()
{
	if ${dbg}; then
		printf 'DEBUG: %s\n' "${@}" >&2
	fi
}

#
# Error handling (used by scanning and interface functions)
#

# Print a message prefixed with the input name and line number, then
# exit with status 1.
#   ${1}   printf format string
#   ${@}   arguments for the format string
error()
{
	local fmt="${1}"
	shift 1

	case "${fname}" in
		'-')
			printf "stdin:%d: ${fmt}\n" "${lineno}" "${@}" >&2
			;;
		*)
			printf "%s:%d: ${fmt}\n" "${fname}" "${lineno}" \
				"${@}" >&2
			;;
	esac

	# The parser and lexer run in a subshell, so this just returns up to the
	# caller like an exception.
	exit 1
}

# Report that the current token is unexpected.  If ${1} is non-empty it
# names the token type that was expected instead.  Does not return.
synexp()
{
	local t="${1}"
	shift 1

	if [ "x${t}" = 'x' ]; then
		synerr '%s unexpected' "$(tokname "${tok}")"
	else
		synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
			"$(tokname "${t}")"
	fi
}

# Report a syntax error (printf format in ${1}, arguments following).
# Does not return.
synerr()
{
	local fmt="${1}"
	shift 1

	error "Syntax error: ${fmt}" "${@}"
}

#
# Input reading
#

# Read one byte from standard input into ${c}.  ${c} is empty at end of
# file.  The "." sentinel keeps the command substitution from stripping
# a newline that was read.
lgetc()
{
	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
	c="${c%.}"
}

#
# Token recognition
#

# Recognize the next token, starting at the look-ahead character ${c},
# and store it in ${tok}.  If a here-document body is pending, it is
# read instead (see next_here()).
next()
{
	if ${here_awaiting_word}; then
		next_here
		return
	fi
	# Every arm below either returns a token or restarts the loop.
	while :; do
		dbg "parsing char '$c' at lineno $lineno"
		case "${c}" in
			'')
				lgetc
				tok=T_EOF
				return
				;;
			"${LF}")
				if ${here_awaiting_end}; then
					# "<<" was not followed by a delimiter.
					synexp ''
				else
					# A complete here-document record is
					# queued, so its body starts on the
					# next line.
					case "${here_queue}" in *"${RS}"*)
						here_awaiting_end=false
						here_awaiting_word=true
						;;
					esac
				fi
				lgetc
				lineno=$((${lineno} + 1))
				tok=T_NEWLINE
				return
				;;
			' '|"${HT}")
				lgetc
				continue
				;;
			\\)
				lgetc
				case "${c}" in "${LF}")
					# Line continuation
					lineno=$((${lineno} + 1))
					lgetc
					continue
					;;
				esac
				next_word \\
				return
				;;
			'#')
				# Comment: skip to end of line or file.
				lgetc
				while :; do
					case "${c}" in "${LF}"|'')
						break
						;;
					esac
					lgetc
				done
				continue
				;;
			'&')
				lgetc
				case "${c}" in '&')
					lgetc
					tok=T_AND_IF
					return
					;;
				esac
				tok=T_AND
				return
				;;
			'|')
				lgetc
				case "${c}" in '|')
					lgetc
					tok=T_OR_IF
					return
					;;
				esac
				tok=T_PIPE
				return
				;;
			';')
				lgetc
				case "${c}" in ';')
					lgetc
					tok=T_DSEMI
					return
					;;
				esac
				dbg T_SEMI
				tok=T_SEMI
				return
				;;
			'(')
				lgetc
				tok=T_LPAREN
				return
				;;
			')')
				lgetc
				tok=T_RPAREN
				return
				;;
			'<'|'>')
				next_io
				return
				;;
			*)
				next_word ''
				return
				;;
		esac
	done
}

# Read the body of the first queued here-document and store it, together
# with its terminating delimiter line, in ${tok} as a T_WORD.
#
# Each record in ${here_queue} has the form
# "strip-tabs${US}delimiter-word${US}escaped${RS}", where strip-tabs is
# "true" for "<<-" and escaped is "true" if the delimiter word contained
# quoting characters.
next_here()
{
	local here=
	local here_strip_tabs=
	local here_end=
	local here_escaped=
	local line=
	local word=
	local res=
	local wordexp=

	# Dequeue the here-document.
	here="${here_queue%%${RS}*}"
	here_strip_tabs="${here%%${US}*}"
	here_end="${here%${US}*}"
	# Remove quoting characters from the delimiter word.
	here_end="$(printf '%s' "${here_end#*${US}}" | \
		sed 's/\\//g; s/"//g; s/'\''//g;')"
	here_escaped="${here##*${US}}"
	here_queue="${here_queue#*${RS}}"
	here_awaiting_word=false

	line=''
	word=''
	while :; do
		case "${c}" in
			'')
				# Bash throws a warning when EOF occurs in a
				# here document.  mksh throws an error.  dash,
				# BusyBox ash, ksh93, and zsh accept EOF as a
				# delimiter.  We aim for the lowest common
				# denominator, so throw an error like mksh does.
				synerr 'Here-document "%s" unclosed' \
					"${here_end}"
				;;
			"${LF}")
				word="${word}${line}"
				case "${line}" in "${here_end}")
					# The newline after the delimiter is
					# left in ${c}; next() counts it.
					tok="T_WORD${US}${word}"
					return
					;;
				esac
				word="${word}${c}"
				line=''
				# This function runs in the lexer's own shell,
				# so body lines must be counted here.
				lineno=$((${lineno} + 1))
				;;
			"${HT}")
				if ${here_strip_tabs}; then
					# Only leading tabs are stripped.
					case "${line}" in
						'')
							;;
						*)
							line="${line}${c}"
							;;
					esac
				else
					line="${line}${c}"
				fi
				;;
			'$')
				if ! ${here_escaped}; then
					lgetc
					if ! res="$(scan_wordexp)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					wordexp="${res%%${RS}*}"
					lineno=$((${lineno} + ${ln_off}))
					line="${line}${wordexp}"
					# scan_wordexp() leaves behind an unused
					# character; skip the lgetc() below.
					continue
				else
					line="${line}${c}"
				fi
				;;
			*)
				line="${line}${c}"
				;;
		esac
		lgetc
	done
}

# Recognize a redirection operator starting at ${c} ("<" or ">") and
# store its type in ${tok}.  For "<<" and "<<-" a new here-document
# record is started in ${here_queue} and ${here_awaiting_end} is set.
#
# Each branch leaves the function with "return".  ("break" must not be
# used for that: there is no enclosing loop in this function, so it
# would be a no-op and control would fall through to the following
# assignments.)
next_io()
{
	case "${c}" in
		'<')
			lgetc
			case "${c}" in
				'<')
					lgetc
					case "${c}" in '-')
						lgetc
						tok=T_DLESSDASH
						here_queue="${here_queue}true"
						here_awaiting_end=true
						here_awaiting_word=false
						return 0
						;;
					esac
					tok=T_DLESS
					here_queue="${here_queue}false"
					here_awaiting_end=true
					here_awaiting_word=false
					return 0
					;;
				'&')
					lgetc
					tok=T_LESSAND
					return 0
					;;
				'>')
					lgetc
					tok=T_LESSGREAT
					return 0
					;;
			esac
			tok=T_LESS
			return 0
			;;
		'>')
			lgetc
			case "${c}" in
				'>')
					lgetc
					tok=T_DGREAT
					return 0
					;;
				'&')
					lgetc
					tok=T_GREATAND
					return 0
					;;
				'|')
					lgetc
					tok=T_CLOBBER
					return 0
					;;
			esac
			tok=T_GREAT
			return 0
			;;
	esac
}

# Scan a word starting at ${c} and store it in ${tok} as a T_WORD.
#   ${1}   text already consumed by the caller, prepended to the word
# If a here-document delimiter is awaited, the word completes the
# pending record in ${here_queue}.
next_word()
{
	local prev_c="${1}"
	shift 1
	local res=
	local word=

	if ! res="$(scan_word false)"; then
		exit 1
	fi
	ln_off=${res%%${RS}*}
	res="${res#*${RS}}"
	c="${res%%${RS}*}"
	res="${res#*${RS}}"
	word="${prev_c}${res%%${RS}*}"

	# We must advance lineno because scan_word() was run in a subshell.
	lineno=$((${lineno} + ${ln_off}))
	tok="T_WORD${US}${word}"

	if ${here_awaiting_end}; then
		here_queue="${here_queue}${US}${word}"
		# A delimiter containing any quoting disables expansion in
		# the here-document body.
		case "${word}" in
			*\\*|*'"'*|*"'"*)
				here_queue="${here_queue}${US}true"
				;;
			*)
				here_queue="${here_queue}${US}false"
				;;
		esac
		here_queue="${here_queue}${RS}"
		here_awaiting_end=false
	fi
}

#
# Token scanning
#

# Scan a word starting at ${c}.  Intended to be run in a command
# substitution.
#   ${1}   "true" when scanning the word of a "${param<op>word}"
#          expansion: the word then ends only at an unquoted "}"
# Prints "lines${RS}look-ahead${RS}word", where lines is the number of
# lines consumed.
scan_word()
{
	local in_param="${1}"
	shift 1
	local res=
	local word=
	local quoted=
	local lines=
	local wordexp=

	word=''
	quoted=false
	lines=0
	while :; do
		dbg "parsing word char '$c' at lineno $lineno"
		case "${c}" in
			'')
				break
				;;
			"${LF}")
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				lineno=$((${lineno} + 1))
				lines=$((${lines} + 1))
				word="${word}${c}"
				;;
			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			'$')
				case "${here_queue}" in *"${RS}"*)
					if ${here_awaiting_end}; then
						synerr '%s %s %s %s' \
							'Word expansions' \
							'not supported in' \
							'here-document' \
							'delimiters'
					fi
					;;
				esac
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				res="${res#*${RS}}"
				wordexp="${res%%${RS}*}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.  The offset is also
				# added to the count reported to our own caller.
				lineno=$((${lineno} + ${ln_off}))
				lines=$((${lines} + ${ln_off}))
				word="${word}${wordexp}"
				# scan_wordexp() leaves behind an unused
				# character, so we should skip the lgetc() call
				# below.
				continue
				;;
			'`')
				synerr 'Backquoted (old-style) %s' \
					'command substitution not supported'
				break
				;;
			\\)
				word="${word}${c}"
				lgetc
				case "${c}" in '')
					# Bash, ksh93, mksh, and zsh ignore a
					# backslash at the end of a file, but
					# dash and BusyBox ash include it in the
					# word.  To help with script
					# portability, we'll throw an error
					# (which is a reasonable thing to do
					# anyway).
					synerr 'Unexpected end of file %s' \
						'after "\"'
					;;
				esac
				word="${word}${c}"
				;;
			\')
				# Single-quoted string: copy up to the closing
				# quote.
				word="${word}${c}"
				while :; do
					lgetc
					word="${word}${c}"
					case "${c}" in
						'')
							synerr '%s %s' \
								'Unterminated' \
								'quoted string'
							;;
						\')
							break
							;;
					esac
				done
				;;
			'"')
				word="${word}${c}"
				if ${quoted}; then
					quoted=false
				else
					quoted=true
				fi
				;;
			'}')
				if ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			*)
				word="${word}${c}"
				;;
		esac
		lgetc
	done

	if ${quoted}; then
		synerr 'Unterminated quoted string'
	fi

	# %s rather than %c for the look-ahead: it holds at most one byte,
	# and %c would emit a NUL byte when it is empty (end of file).
	printf "%d${RS}%s${RS}%s" "${lines}" "${c}" "${word}"
}

# Scan a word expansion.  On entry the "$" has been consumed and ${c}
# holds the character after it.  Intended to be run in a command
# substitution.
# Prints "line-offset${RS}look-ahead${RS}expansion-text".
scan_wordexp()
{
	local res=
	local toks=
	local param=
	local sub_lineno=

	wordexp=''
	ln_off=0
	case "${c}" in
		'{')
			# Parameter expansion brace
			scan_wordexp_param_brace
			;;
		'(')
			# Arithmetic expansion or command substitution
			lgetc
			case "${c}" in
				'(')
					# Arithmetic expansion
					scan_wordexp_arith
					;;
				*)
					# Command substitution
					if ! res="$(run_sublexer "sub${fname}" \
							"${lineno}" "${start}" \
							"${c}")"; then
						exit 1
					fi
					# run_sublexer() reports the absolute
					# line number it reached; convert it
					# to an offset for our caller.
					sub_lineno=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					toks="${res%%${RS}*}"
					ln_off=$((${sub_lineno} - ${lineno}))
					lineno=${sub_lineno}
					wordexp="\$(${SOH}C${STX}${toks}"
					wordexp="${wordexp}${ETX})"
					# ")" is recognized in run_sublexer().
					;;
			esac
			;;
		# The "$" is escaped so that "$!" is not expanded as a
		# parameter inside the pattern.
		[@*#?\$!A-Za-z0-9_-])
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			wordexp="\$${param}"
			;;
		*)
			# "$" not followed by anything that starts an
			# expansion: keep it as a literal character.  ${c} is
			# left for the caller.
			wordexp='$'
			;;
	esac

	printf "%d${RS}%s${RS}%s" "${ln_off}" "${c}" "${wordexp}"
	return 0
}

# Scan a "${...}" parameter expansion.  On entry ${c} is "{".  The
# expansion text is left in ${wordexp} and the character after the
# closing "}" in ${c}.
scan_wordexp_param_brace()
{
	local mod=
	local res=
	local param=
	local word=

	mod=true

	lgetc
	case "${c}" in
		'#')
			lgetc
			case "${c}" in
				[@*#?\$!A-Za-z0-9_-])
					# String length expansion
					if ! res="$(scan_param)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					param="${res%%${RS}*}"
					lineno=$((${lineno} + ${ln_off}))
					# Disable modifications.
					mod=false
					;;
				*)
					# Special parameter "#"
					param='#'
					;;
			esac
			;;
		*)
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			;;
	esac
	wordexp="\${${param}"

	# If modifications are allowed
	if ${mod}; then
		# Check for modifications.
		mod=false
		case "${c}" in
			':')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '-'|'='|'?'|'+')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'-'|'='|'?'|'+')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				;;
			'%')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '%')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'#')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '#')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
		esac
	fi

	# If a modification was found
	if ${mod}; then
		# Get word.
		if ! res="$(scan_word true)"; then
			exit 1
		fi
		ln_off=${res%%${RS}*}
		res="${res#*${RS}}"
		c="${res%%${RS}*}"
		res="${res#*${RS}}"
		word="${res%%${RS}*}"
		# We must advance lineno because scan_word() was run in a
		# subshell.
		lineno=$((${lineno} + ${ln_off}))
		wordexp="${wordexp}${word}"
		dbg "param mod word: '$word'"
	fi

	# Check for right brace.
	case "${c}" in
		'}')
			wordexp="${wordexp}${c}"
			lgetc
			;;
		*)
			synerr 'Missing "}"'
			;;
	esac

	return 0
}

# Scan a parameter name starting at ${c}: a single special parameter
# character, a run of digits starting with 1-9, or a name.  Intended to
# be run in a command substitution.
# Prints "0${RS}look-ahead${RS}parameter".
scan_param()
{
	local param=

	param=''
	case "${c}" in
		# The "$" is escaped so that "$!" is not expanded as a
		# parameter inside the pattern.
		[@*#?\$!0-])
			# Special parameter
			param="${c}"
			lgetc
			;;
		[1-9])
			# Positional parameter
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!0-9])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		[A-Za-z_])
			# Parameter name
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!A-Za-z0-9_])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		*)
			synerr 'Bad parameter name'
			;;
	esac

	printf "%d${RS}%s${RS}%s" 0 "${c}" "${param}"
	return 0
}

# Scan a "$((...))" arithmetic expansion.  On entry ${c} is the second
# "(".  The expansion text is left in ${wordexp} and the character after
# the closing "))" in ${c}.
scan_wordexp_arith()
{
	local arith=
	local paren_lvl=
	local res=
	local sub_wordexp=

	arith=''
	paren_lvl=0
	# Step over the second "(".
	lgetc
	while :; do
		case "${c}" in
			'')
				synerr 'end of file unexpected (%s)' \
					'expecting "))"'
				;;
			'(')
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} + 1))
				;;
			')')
				if [ "${paren_lvl}" -eq 0 ]; then
					lgetc
					case "${c}" in ')')
						wordexp="\$((${arith}))"
						lgetc
						return 0
						;;
					esac
					synerr 'Arithmetic expansion: ")" %s' \
						'unexpected'
				fi
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} - 1))
				;;
			'$')
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				res="${res#*${RS}}"
				sub_wordexp="${res%%${RS}*}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				arith="${arith}${sub_wordexp}"
				# scan_wordexp() leaves behind an unused
				# character that still has to be examined, so
				# skip the lgetc() call below.
				continue
				;;
			*)
				arith="${arith}${c}"
				;;
		esac
		lgetc
	done
}

# Lex and parse the body of a command substitution.  Intended to be run
# in a command substitution, since it resets the lexer's global state.
#   ${1}   input name to use in error messages
#   ${2}   line number to start counting from
#   ${3}   name of the parser function to run
#   ${4}   look-ahead character to start with
# Prints "lineno${RS}look-ahead${RS}tokens", where lineno is the
# absolute line number reached.
run_sublexer()
{
	local fn="${1}"
	local ln="${2}"
	local st="${3}"
	local ch="${4}"
	shift 4

	# Initialize global variables.
	fname="${fn}"
	lineno=${ln}
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	c="${ch}"
	next

	#dbg=true
	# If this returns (does not exit), there are no errors.
	${start}
	case "${tok%${US}*}" in
		T_RPAREN)
			;;
		*)
			synerr 'Missing ")"'
			;;
	esac

	printf "%d${RS}%s${RS}%s" "${lineno}" "${c}" "${tokens}"
	return 0
}

#
# Interface
#

# Lex and parse standard input.
#   ${1}   input name to use in error messages ("-" for stdin)
#   ${2}   name of the parser function to run
# Prints the accepted tokens on success; exits with an error message
# otherwise.
run_lexer()
{
	local fn="${1}"
	local st="${2}"
	shift 2

	# Initialize global variables.
	fname="${fn}"
	lineno=1
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	# Read the first character and recognize the first token.
	lgetc
	next

	if ! ${start}; then
		# Unexpected EOF
		synexp ''
	fi
	if ! accept T_EOF; then
		synexp ''
	fi

	# Return the tokens.
	printf '%s' "${tokens}"

	return 0
}

# Try to accept the current token as one of type ${1}.
#
# On success the token is appended to ${tokens}, the next token is
# recognized, and 0 is returned.  Otherwise 1 is returned and no input
# is consumed.
#
# Reserved words, names, and function names are all lexed as T_WORD and
# are only converted to their specific types here, when the parser asks
# for them.  T_CMDNAME accepts a T_WORD that is not a reserved word.
# For T_FNAME and T_CMDNAME, a word that turns out to be a reserved word
# is rejected and ${tok} is changed to the reserved word's type.
accept()
{
	local t="${1}"
	shift 1
	local rw=

	dbg "looking for $t, current tok ${tok%%${US}*}"
	case "${t}" in
		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|\
		T_DO|T_DONE|T_CASE|T_ESAC|T_WHILE|T_UNTIL|\
		T_FOR|T_LBRACE|T_RBRACE|T_BANG|T_IN)
			dbg "looking for reserved word $t, have '$tok'"
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				# Reserved words are recognized as literal
				# T_WORDs.
				if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
					return 1
				fi
				# T_WORD data unit must match reserved word
				# exactly.
				if ! [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${t}")" ]; then
					return 1
				fi
				# If the token matches the reserved word,
				# replace it with the reserved word token.
				tok="${t}"
			fi
			;;
		T_NAME)
			# Names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name: the word's text (not the token type)
			# must be non-empty, start with a letter or "_", and
			# contain only letters, digits, and "_".
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			tok="T_NAME${US}${tok#T_WORD${US}}"
			;;
		T_FNAME)
			# Function names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name (same rule as for T_NAME).
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			# Verify that the function name doesn't match any
			# reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
					T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			tok="T_FNAME${US}${tok#T_WORD${US}}"
			;;
		T_CMDNAME)
			# The first word of a simple command is to be checked
			# for reserved words.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Verify that the word doesn't match any reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
					T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			;;
		*)
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				return 1
			fi
			;;
	esac

	dbg "accept $t"
	tokens="${tokens}${tok}${RS}"
	next
	return 0
}

# Accept a token of type ${1}, or report a syntax error naming it as the
# expected token (which does not return).
expect()
{
	local t="${1}"
	shift 1

	if accept "${t}"; then
		return 0
	else
		synexp "${t}"
	fi
}