# Eggshell lexer
#
# Copyright (C) 2016  Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler.  If not, see
# <http://www.gnu.org/licenses/>.

# Lexer state.  The input buffer itself lives in lbufv_<N> (one variable per
# character), with lbufi as the read index and lbufc as the character count;
# those are initialized in run_lexer().
#dbg=false
fname=
lineno=
ln_off=
start=
c=
wordexp=
here_queue=
here_awaiting_end=
here_awaiting_word=
tok=

#dbg()
#{
#	if ${dbg}; then
#		printf 'DEBUG: %s\n' "${@}" >&2
#	fi
#}

#
# Error handling (used by scanning and interface functions)
#

# error FMT [ARG...]
# Print a printf-style message to stderr, prefixed with the file name (or
# "stdin" when fname is "-") and the current line number, then exit 1.
error()
{
	local fmt="${1}"
	shift 1

	case "${fname}" in
		'-')
			printf "stdin:%d: ${fmt}\n" ${lineno} "${@}" >&2
			;;
		*)
			printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}" >&2
			;;
	esac

	# The parser and lexer run in a subshell, so this just returns up to the
	# caller like an exception.
	exit 1
}

# synexp [TOKEN]
# Report the current token as unexpected.  If TOKEN is non-empty, it is named
# as the token that was expected instead.
synexp()
{
	local t="${1}"
	shift 1

	if [ "x${t}" = 'x' ]; then
		synerr '%s unexpected' "$(tokname "${tok}")"
	else
		synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
			"$(tokname "${t}")"
	fi
}

# synerr FMT [ARG...]
# Report a syntax error through error().
synerr()
{
	local fmt="${1}"
	shift 1

	error "Syntax error: ${fmt}" "${@}"
}

#
# Input reading
#

# Read the next character into c and advance lbufi.  At end of input, c is
# set to the empty string and lbufi is left unchanged.
lgetc()
{
	if [ ${lbufi} -ge ${lbufc} ]; then
		c=''
	else
		eval "c=\${lbufv_${lbufi}}"
		lbufi=$((${lbufi} + 1))
	fi
}

# Undo one lgetc(): step back so that c is again the character that was
# current before the most recent lgetc() call.  Must only be called after a
# matching lgetc().
lungetc()
{
	lbufi=$((${lbufi} - 2))
	eval "c=\${lbufv_${lbufi}}"
	lbufi=$((${lbufi} + 1))
}

# Reload c from the buffer for the current value of lbufi.  Used after lbufi
# has been set from the output of a scanner that ran in a subshell.
lsetc()
{
	if [ ${lbufi} -ge ${lbufc} ]; then
		c=''
	else
		lbufi=$((${lbufi} - 1))
		eval "c=\${lbufv_${lbufi}}"
		lbufi=$((${lbufi} + 1))
	fi
}

#
# Token recognition
#

# Recognize the next token, starting at the current character c, and store it
# in tok.  Blanks, comments, and escaped newlines are skipped.
next()
{
	if ${here_awaiting_word}; then
		next_here
		return
	fi

	while :; do
		#dbg "parsing char '$c' at lineno $lineno"
		case "${c}" in
			'')
				lgetc
				tok=T_EOF
				return
				;;
			"${LF}")
				if ${here_awaiting_end}; then
					# A here-document operator was not
					# followed by its delimiter word.
					synexp ''
				else
					# A queued here-document is read as
					# the next token.
					case "${here_queue}" in *"${RS}"*)
						here_awaiting_end=false
						here_awaiting_word=true
						;;
					esac
				fi
				lgetc
				lineno=$((${lineno} + 1))
				tok=T_NEWLINE
				return
				;;
			' '|"${HT}")
				lgetc
				continue
				;;
			\\)
				lgetc
				case "${c}" in "${LF}")
					# Line continuation
					lineno=$((${lineno} + 1))
					lgetc
					continue
					;;
				esac
				# Not a line continuation: restore the
				# backslash and scan it as part of a word.
				lungetc
				next_word
				return
				;;
			'#')
				lgetc
				while :; do
					case "${c}" in "${LF}"|'')
						break
						;;
					esac
					lgetc
				done
				continue
				;;
			'&')
				lgetc
				case "${c}" in '&')
					lgetc
					tok=T_AND_IF
					return
					;;
				esac
				tok=T_AND
				return
				;;
			'|')
				lgetc
				case "${c}" in '|')
					lgetc
					tok=T_OR_IF
					return
					;;
				esac
				tok=T_PIPE
				return
				;;
			';')
				lgetc
				case "${c}" in ';')
					lgetc
					tok=T_DSEMI
					return
					;;
				esac
				#dbg T_SEMI
				tok=T_SEMI
				return
				;;
			'(')
				lgetc
				tok=T_LPAREN
				return
				;;
			')')
				lgetc
				tok=T_RPAREN
				return
				;;
			'<'|'>')
				next_io
				return
				;;
			*)
				# c is already the first character of the
				# word.  No lgetc() preceded this branch, so
				# there is nothing to unget; stepping back here
				# would rescan the previous character (or read
				# before the start of the buffer).
				next_word
				return
				;;
		esac
		lgetc
	done
}

# Read the body of the here-document at the head of here_queue and store it
# in tok as a T_WORD.  Each queue entry has the form
# <strip_tabs> US <delimiter word> US <escaped> RS, as built by next_io() and
# next_word().
next_here()
{
	local here=
	local here_strip_tabs=
	local here_end=
	local here_escaped=
	local line=
	local word=
	local res=

	# Dequeue the here-document.
	here="${here_queue%%${RS}*}"
	here_strip_tabs="${here%%${US}*}"
	here_end="${here%${US}*}"
	here_end="$(printf '%s' "${here_end#*${US}}" | \
		sed 's/\\//g; s/"//g; s/'\''//g;')"  # Stupid Vim: ')"
	here_escaped="${here##*${US}}"
	here_queue="${here_queue#*${RS}}"
	here_awaiting_word=false

	line=''
	word=''
	while :; do
		case "${c}" in
			'')
				# Bash throws a warning when EOF occurs in a
				# here document.  mksh throws an error.  dash,
				# BusyBox ash, ksh93, and zsh accept EOF as a
				# delimiter.  We aim for the lowest common
				# denominator, so throw an error like mksh does.
				synerr 'Here-document "%s" unclosed' \
					"${here_end}"
				;;
			"${LF}")
				word="${word}${line}"
				case "${line}" in "${here_end}")
					tok="T_WORD${US}${word}"
					return
					;;
				esac
				word="${word}${c}"
				line=''
				;;
			"${HT}")
				if ${here_strip_tabs}; then
					# Drop tabs only at the start of a line.
					case "${line}" in
						'')
							;;
						*)
							line="${line}${c}"
							;;
					esac
				else
					line="${line}${c}"
				fi
				;;
			'$')
				if ! ${here_escaped}; then
					lgetc
					if ! res="$(scan_wordexp)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					lbufi="${res%%${RS}*}"
					lsetc
					res="${res#*${RS}}"
					lineno=$((${lineno} + ${ln_off}))
					line="${line}${res}"
					# scan_wordexp() leaves behind an unused
					# character, so skip the lgetc() below.
					continue
				else
					line="${line}${c}"
				fi
				;;
			*)
				line="${line}${c}"
				;;
		esac
		lgetc
	done
}

# Recognize a redirection operator starting at c ("<" or ">") and store it in
# tok.  Here-document operators also start a new here_queue entry with the
# strip-tabs flag and make the lexer expect the delimiter word next.
#
# Each alternative ends with an explicit return: no loop lexically encloses
# this code, so "break" cannot be relied upon to leave the function.
next_io()
{
	case "${c}" in
		'<')
			lgetc
			case "${c}" in
				'<')
					lgetc
					case "${c}" in '-')
						lgetc
						tok=T_DLESSDASH
						here_queue="${here_queue}true"
						here_awaiting_end=true
						here_awaiting_word=false
						return 0
						;;
					esac
					tok=T_DLESS
					here_queue="${here_queue}false"
					here_awaiting_end=true
					here_awaiting_word=false
					return 0
					;;
				'&')
					lgetc
					tok=T_LESSAND
					return 0
					;;
				'>')
					lgetc
					tok=T_LESSGREAT
					return 0
					;;
			esac
			tok=T_LESS
			return 0
			;;
		'>')
			lgetc
			case "${c}" in
				'>')
					lgetc
					tok=T_DGREAT
					return 0
					;;
				'&')
					lgetc
					tok=T_GREATAND
					return 0
					;;
				'|')
					lgetc
					tok=T_CLOBBER
					return 0
					;;
			esac
			tok=T_GREAT
			return 0
			;;
	esac
}

# Scan a word starting at c and store it in tok as a T_WORD.  If a
# here-document operator is waiting for its delimiter, the word also completes
# the pending here_queue entry.
next_word()
{
	local res=

	if ! res="$(scan_word false)"; then
		exit 1
	fi
	ln_off=${res%%${RS}*}
	res="${res#*${RS}}"
	lbufi="${res%%${RS}*}"
	lsetc
	res="${res#*${RS}}"

	# We must advance lineno because scan_word() was run in a subshell.
	lineno=$((${lineno} + ${ln_off}))

	tok="T_WORD${US}${res}"

	if ${here_awaiting_end}; then
		here_queue="${here_queue}${US}${res}"
		# Any quoting in the delimiter disables expansion in the body.
		case "${res}" in
			*\\*|*'"'*|*"'"*)
				here_queue="${here_queue}${US}true"
				;;
			*)
				here_queue="${here_queue}${US}false"
				;;
		esac
		here_queue="${here_queue}${RS}"
		here_awaiting_end=false
	fi
}

#
# Token scanning
#

# scan_word IN_PARAM
# Scan a word starting at c.  IN_PARAM is "true" when scanning the word of a
# parameter expansion modification, in which case the word ends at an unquoted
# "}" rather than at a blank or operator.  Meant to be run in a command
# substitution; prints "<lines consumed> RS <lbufi> RS <word>".
scan_word()
{
	local in_param="${1}"
	shift 1
	local lines=
	local word=
	local quoted=
	local tmp_c=
	local res=

	lines=0
	word=''
	quoted=false

	while :; do
		#dbg "parsing word char '$c' at lineno $lineno"
		case "${c}" in
			'')
				break
				;;
			"${LF}")
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				lineno=$((${lineno} + 1))
				lines=$((${lines} + 1))
				word="${word}${c}"
				;;
			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			'$')
				case "${here_queue}" in *"${RS}"*)
					if ${here_awaiting_end}; then
						synerr '%s %s %s %s' \
							'Word expansions' \
							'not supported in' \
							'here-document' \
							'delimiters'
					fi
					;;
				esac
				lgetc
				if ! res="$(scan_wordexp)"; then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				lbufi="${res%%${RS}*}"
				lsetc
				res="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				lines=$((${lines} + ${ln_off}))
				word="${word}${res}"
				# scan_wordexp() leaves behind an unused
				# character, so we should skip the lgetc() call
				# below.
				continue
				;;
			'`')
				synerr 'Backquoted (old-style) %s' \
					'command substitution not supported'
				break
				;;
			\\)
				#dbg 'first backslash in word'
				word="${word}${c}"
				lgetc
				#dbg "next char: '$c'"
				case "${c}" in '')
					# Bash, ksh93, mksh, and zsh ignore a
					# backslash at the end of a file, but
					# dash and BusyBox ash include it in the
					# word.  To help with script
					# portability, we'll throw an error
					# (which is a reasonable thing to do
					# anyway).
					synerr 'Unexpected end of file %s' \
						'after "\"'
					;;
				esac
				word="${word}${c}"
				;;
			\')
				word="${word}${c}"
				if ${quoted}; then
					# A single quote is literal inside
					# double quotes.
					lgetc
					continue
				fi
				while :; do
					lgetc
					word="${word}${c}"
					case "${c}" in
						'')
							synerr '%s %s' \
								'Unterminated' \
								'quoted string'
							;;
						"${LF}")
							lineno=$((${lineno} + 1))
							lines=$((${lines} + 1))
							;;
						\')
							break
							;;
					esac
				done
				;;
			'"')
				word="${word}${c}"
				if ${quoted}; then
					quoted=false
				else
					quoted=true
				fi
				;;
			'}')
				if ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			*)
				word="${word}${c}"
				;;
		esac
		lgetc
	done

	if ${quoted}; then
		synerr 'Unterminated quoted string'
	fi

	printf "%d${RS}%d${RS}%s" ${lines} ${lbufi} "${word}"
}

# Scan a word expansion; c is the character following the "$".  Meant to be
# run in a command substitution; prints
# "<lines consumed> RS <lbufi> RS <expansion text>".
scan_wordexp()
{
	local res=

	wordexp=''
	ln_off=0

	case "${c}" in
		'{')
			# Parameter expansion brace
			scan_wordexp_param_brace
			;;
		'(')
			# Arithmetic expansion or command substitution
			lgetc
			case "${c}" in
				'(')
					# Arithmetic expansion
					scan_wordexp_arith
					;;
				*)
					# Command substitution
					if ! res="$(run_sublexer "sub${fname}" \
						${lineno} "${start}" \
						${lbufi})"; then
						exit 1
					fi
					# run_sublexer() output ends with
					# RS <lineno> RS <lbufi>.
					lbufi="${res##*${RS}}"
					lsetc
					res="${res%${RS}*}"
					ln_off=${res##*${RS}}
					res="${res%${RS}*}"
					ln_off=$((${ln_off} - ${lineno}))
					lineno=$((${lineno} + ${ln_off}))
					wordexp="\$(${SOH}C${STX}${res}"
					wordexp="${wordexp}${ETX})"
					# ")" is recognized in run_sublexer().
					;;
			esac
			;;
		[@*#?\$!A-Za-z0-9_-])
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			lbufi="${res%%${RS}*}"
			lsetc
			res="${res#*${RS}}"
			lineno=$((${lineno} + ${ln_off}))
			wordexp="\$${res}"
			;;
	esac

	printf "%d${RS}%d${RS}%s" ${ln_off} ${lbufi} "${wordexp}"
	return 0
}

# Scan a braced parameter expansion; c is the opening "{".  The expansion
# text, including the closing "}", is stored in wordexp.
scan_wordexp_param_brace()
{
	local mod=
	local res=
	local param=

	mod=true

	lgetc
	case "${c}" in
		'#')
			lgetc
			case "${c}" in
				[@*#?\$!A-Za-z0-9_-])
					# String length expansion
					if ! res="$(scan_param)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					lbufi="${res%%${RS}*}"
					lsetc
					res="${res#*${RS}}"
					param="#${res}"
					lineno=$((${lineno} + ${ln_off}))
					# Disable modifications.
					mod=false
					;;
				*)
					# Special parameter "#"
					param='#'
					;;
			esac
			;;
		*)
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			lbufi="${res%%${RS}*}"
			lsetc
			res="${res#*${RS}}"
			param="${res}"
			lineno=$((${lineno} + ${ln_off}))
			;;
	esac
	wordexp="\${${param}"

	# If modifications are allowed
	if ${mod}; then
		# Check for modifications.
		mod=false
		case "${c}" in
			':')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '-'|'='|'?'|'+')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'-'|'='|'?'|'+')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				;;
			'%')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '%')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'#')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '#')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
		esac
	fi

	# If a modification was found
	if ${mod}; then
		# Get word.
		if ! res="$(scan_word true '')"; then
			exit 1
		fi
		ln_off=${res%%${RS}*}
		res="${res#*${RS}}"
		lbufi="${res%%${RS}*}"
		lsetc
		res="${res#*${RS}}"
		# We must advance lineno because scan_word() was run in a
		# subshell.
		lineno=$((${lineno} + ${ln_off}))
		wordexp="${wordexp}${res}"
		#dbg "param mod word: '$res'"
	fi

	# Check for right brace.
	case "${c}" in
		'}')
			wordexp="${wordexp}${c}"
			lgetc
			;;
		*)
			synerr 'Missing "}"'
			;;
	esac

	return 0
}

# Scan a parameter (special, positional, or named) starting at c.  Meant to
# be run in a command substitution; prints "0 RS <lbufi> RS <parameter>".
scan_param()
{
	local param=

	param=''

	case "${c}" in
		[@*#?\$!0-])
			# Special parameter
			param="${c}"
			lgetc
			;;
		[1-9])
			# Positional parameter
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!0-9])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		[A-Za-z_])
			# Parameter name
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!A-Za-z0-9_])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		*)
			synerr 'Bad parameter name'
			;;
	esac

	printf "%d${RS}%d${RS}%s" 0 ${lbufi} "${param}"
	return 0
}

# Scan an arithmetic expansion; c is the second "(" of "$((".  The expansion
# text, including the closing "))", is stored in wordexp.
scan_wordexp_arith()
{
	local arith=
	local paren_lvl=
	local res=

	arith=''
	paren_lvl=0

	lgetc
	while :; do
		case "${c}" in
			'')
				synerr 'end of file unexpected (%s)' \
					'expecting "))"'
				;;
			'(')
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} + 1))
				;;
			')')
				if [ ${paren_lvl} -eq 0 ]; then
					lgetc
					case "${c}" in ')')
						wordexp="\$((${arith}))"
						lgetc
						return 0
						;;
					esac
					synerr 'Arithmetic expansion: ")" %s' \
						'unexpected'
				fi
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} - 1))
				;;
			'$')
				lgetc
				if ! res="$(scan_wordexp)"; then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				lbufi="${res%%${RS}*}"
				lsetc
				res="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				arith="${arith}${res}"
				# scan_wordexp() leaves behind an unused
				# character, so skip the lgetc() below.
				continue
				;;
			*)
				arith="${arith}${c}"
				;;
		esac
		lgetc
	done
}

# run_sublexer FNAME LINENO START LBUFI
# Lex and parse a command substitution body in place, starting at buffer index
# LBUFI, by running the start function START.  Meant to be run in a command
# substitution; after whatever START prints, prints "RS <lineno> RS <lbufi>".
run_sublexer()
{
	local fn="${1}"
	local ln="${2}"
	local st="${3}"
	local i="${4}"
	shift 4

	# Initialize global variables.
	fname="${fn}"
	lineno=${ln}
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false

	lbufi="${i}"
	lsetc
	next

	#dbg=true
	# If this returns (does not exit), there are no errors.
	${start}

	case "${tok%${US}*}" in
		T_RPAREN)
			;;
		*)
			synerr 'Missing ")"'
			;;
	esac

	printf "${RS}%d${RS}%d" ${lineno} ${lbufi}

	return 0
}

#
# Interface
#

# run_lexer FNAME BUF START
# Load the text BUF (read from FNAME, used only in messages) into the
# character buffer and run the start function START over it.  A syntax error
# is reported if START fails or if input remains after it returns.
run_lexer()
{
	local fn="${1}"
	local buf="${2}"
	local st="${3}"
	shift 3

	# Initialize global variables.
	fname="${fn}"
	lineno=1
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false

	# Read file into array
	eval "$(printf '%s' "${buf}" | awk -v FS='' -v j=0 \
		-v squote="'" -v esc_squote="'\\\\''" '
		{
			for (i = 1; i <= NF; ++i) {
				sub(squote, esc_squote, $i);
				printf("lbufv_%d='\''%s'\''\n", j++, $i);
			};
			printf("lbufv_%d='\''\n'\''\n", j++);
		}
		')"
	lbufi=0
	lbufc=${#buf}

	# Read the first character and recognize the first token.
	lgetc
	next

	if ! ${start}; then
		# Unexpected EOF
		synexp ''
	fi
	if ! accept T_EOF; then
		synexp ''
	fi

	return 0
}

# accept TOKEN
# If the current token matches TOKEN, print it followed by RS, advance to the
# next token, and return 0; otherwise return 1.  Reserved words, names,
# function names, command names, and I/O numbers are all lexed as T_WORD and
# are reclassified here on request.
accept()
{
	local t="${1}"
	shift 1
	local rw=

	#dbg "looking for $t, current tok ${tok%%${US}*}"
	case "${t}" in
		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|T_DO|T_DONE|\
		T_CASE|T_ESAC|T_WHILE|T_UNTIL|T_FOR|\
		T_LBRACE|T_RBRACE|T_BANG|T_IN|\
		T_STATIC|T_LOCAL|T_RETURN)
			#dbg "looking for reserved word $t, have '$tok'"
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				# Reserved words are recognized as literal
				# T_WORDs.
				if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
					return 1
				fi
				# T_WORD data unit must match reserved word
				# exactly.
				if ! [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${t}")" ]; then
					return 1
				fi
				# If the token matches the reserved word,
				# replace it with the reserved word token.
				tok="${t}"
			fi
			;;
		T_NAME)
			# Names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name.
			case "${tok#*${US}}" in
				[!A-Za-z_]*)
					return 1
					;;
				*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			tok="T_NAME${US}${tok#T_WORD${US}}"
			;;
		T_FNAME)
			# Function names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name.
			case "${tok#*${US}}" in
				[!A-Za-z_]*)
					return 1
					;;
				*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			# Verify that the function name doesn't match any
			# reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
				T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
				T_LBRACE T_RBRACE T_BANG T_IN \
				T_STATIC T_LOCAL T_RETURN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			tok="T_FNAME${US}${tok#T_WORD${US}}"
			;;
		T_CMDNAME)
			# The first word of a simple command is to be checked
			# for reserved words.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Verify that the word doesn't match any reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
				T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
				T_LBRACE T_RBRACE T_BANG T_IN \
				T_STATIC T_LOCAL T_RETURN; do
				if [ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			tok="T_CMDNAME${US}${tok#T_WORD${US}}"
			;;
		T_IO_NUMBER)
			# I/O numbers are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate number.
			case "${tok#*${US}}" in *[!0-9]*)
				return 1
				;;
			esac
			tok="T_IO_NUMBER${US}${tok#T_WORD${US}}"
			;;
		*)
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				return 1
			fi
			;;
	esac

	#dbg "accept $t"
	printf '%s' "${tok}${RS}"

	next

	return 0
}

# expect TOKEN
# Like accept(), but a mismatch is a syntax error instead of a non-zero
# return.
expect()
{
	local t="${1}"
	shift 1

	if accept "${t}"; then
		return 0
	else
		synexp "${t}"
	fi
}

# inject TOKEN
# Print TOKEN followed by RS without consuming any input.
inject()
{
	local t="${1}"
	shift 1

	printf '%s' "${t}${RS}"

	return 0
}