diff options
author | P. J. McDermott <pj@pehjota.net> | 2016-02-21 04:39:39 (EST) |
---|---|---|
committer | P. J. McDermott <pj@pehjota.net> | 2016-02-21 04:39:39 (EST) |
commit | c9f95bf852092d8b1640b92f1c31e84420bb51dd (patch) | |
tree | b47ff3e2a91fa39f234df6ddfd9559b6d4714de2 | |
parent | 4e6bfd6fe0d48ddf49cd61bb8cb31881a1e5e369 (diff) | |
download | eggshell-c9f95bf852092d8b1640b92f1c31e84420bb51dd.zip eggshell-c9f95bf852092d8b1640b92f1c31e84420bb51dd.tar.gz eggshell-c9f95bf852092d8b1640b92f1c31e84420bb51dd.tar.bz2 |
Copy everything into a new eshtrans/ directory
Split out and rename functions and variables where appropriate.
Also add license headers. (The old scripts under parsing/ can be used
under the same license.)
-rw-r--r-- | eshtrans/backend/codegen.esh | 94 | ||||
-rw-r--r-- | eshtrans/backend/main.esh | 31 | ||||
-rw-r--r-- | eshtrans/common.esh | 27 | ||||
-rw-r--r-- | eshtrans/frontend/lexer.esh | 990 | ||||
-rw-r--r-- | eshtrans/frontend/main.esh | 30 | ||||
-rw-r--r-- | eshtrans/frontend/parser.esh | 591 | ||||
-rw-r--r-- | eshtrans/main.esh | 101 | ||||
-rw-r--r-- | eshtrans/tokens.esh | 139 |
8 files changed, 2003 insertions, 0 deletions
# diff --git a/eshtrans/backend/codegen.esh b/eshtrans/backend/codegen.esh new file mode 100644 index 0000000..dea169b --- /dev/null +++ b/eshtrans/backend/codegen.esh @@ -0,0 +1,94 @@
# Shell command language code generator
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Character most recently read by sgetc() (empty at end of input).
sc=

# Read a single byte from standard input into ${sc}.
sgetc()
{
	# A '.' is printed after the byte and stripped again so that a
	# newline byte survives command substitution.
	sc="$(
		dd bs=1 count=1 2>/dev/null
		printf '.'
	)"
	sc="${sc%.}"
}

# Print the text of every token in an RS-separated token list.
#   $1  token list
# Each token is written with toktext(); every token except T_NEWLINE is
# followed by one space.
codegen_sub()
{
	local array="${1}"
	shift 1

	IFS="${RS}"
	for t in ${array}; do
		toktext "${t}"
		if [ "x${t%${US}*}" != 'xT_NEWLINE' ]; then
			printf ' '
		fi
	done
	unset IFS
}

# The token stack is encoded in a string in the following grammar:
# Terminal symbols:
#   TOKEN
# Production rules:
#   stack = tokens [ '<SOH>' type '<STX>' stack '<ETX>' [ tokens ] ] ;
#   tokens = TOKEN { '<RS>' TOKEN } ;
#   type = 'C' ;
# We need to recurse through this stack to get to all the tokens.
# Each element in the stack (an array of tokens) gets run through the codegen to
# become text that is inserted into the array below.
# Read an encoded token stack from standard input and print the generated
# code for it.  Nested stack elements are handled by recursion: the text
# generated for the nested element is spliced into the enclosing token list.
sh_parse_stack()
{
	local array=

	array=''
	while :; do
		sgetc
		case "${sc}" in
			''|"${ETX}")
				# EOF, or end of the current stack element
				break
				;;
			"${SOH}")
				# New stack element; the next character is its type.
				sgetc
				if [ "x${sc}" = 'xC' ]; then
					# Command substitution
					sgetc  # STX
					array="${array}$(sh_parse_stack)"
				fi
				;;
			*)
				# Token character
				array="${array}${sc}"
				;;
		esac
	done
	codegen_sub "${array}"
}
# diff --git a/eshtrans/backend/main.esh b/eshtrans/backend/main.esh new file mode 100644 index 0000000..884a2c3 --- /dev/null +++ b/eshtrans/backend/main.esh @@ -0,0 +1,31 @@
# Shell command language backend interface
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Generate shell code for an encoded token stack.
#   $1  token stack
# Returns 0 when the pipeline into sh_parse_stack() succeeds, 1 otherwise.
sh_codegen()
{
	local toks="${1}"
	shift 1

	printf '%s' "${toks}" | sh_parse_stack || return 1
	return 0
}
# diff --git a/eshtrans/common.esh b/eshtrans/common.esh new file mode 100644 index 0000000..a0b73be --- /dev/null +++ b/eshtrans/common.esh @@ -0,0 +1,27 @@
# Common constants
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Single-character constants.  Each one is printed with a trailing '.'
# that is stripped afterwards, so that command substitution cannot eat
# the character itself (it would otherwise drop a newline).

# Start of heading (octal 001)
SOH="$(printf '\001.')"
SOH="${SOH%.}"
# Start of text (octal 002)
STX="$(printf '\002.')"
STX="${STX%.}"
# End of text (octal 003)
ETX="$(printf '\003.')"
ETX="${ETX%.}"
# Horizontal tab
HT="$(printf '\011.')"
HT="${HT%.}"
# Line feed
LF="$(printf '\012.')"
LF="${LF%.}"
# Record separator (octal 036)
RS="$(printf '\036.')"
RS="${RS%.}"
# Unit separator (octal 037)
US="$(printf '\037.')"
US="${US%.}"
# diff --git a/eshtrans/frontend/lexer.esh b/eshtrans/frontend/lexer.esh new file mode 100644 index 0000000..0991239 --- /dev/null +++ b/eshtrans/frontend/lexer.esh @@ -0,0 +1,990 @@
# Eggshell lexer
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Set to "true" to make dbg() print its messages.
dbg=false

# Lexer state.  These are globals; functions that are run inside command
# substitutions hand their results back on standard output instead (see the
# "lines RS char RS text" replies below).
fname=
lineno=
ln_off=
start=
c=
wordexp=
here_queue=
here_awaiting_end=
here_awaiting_word=
tok=
tokens=

# Print each argument as a "DEBUG:" line on standard error when ${dbg} is
# true.
dbg()
{
	if ${dbg}; then
		printf 'DEBUG: %s\n' "${@}" >&2
	fi
}

#
# Error handling (used by scanning and interface functions)
#

# Print a printf-style message prefixed with the input name and line number
# on standard error, then exit with status 1.
#   $1   format string
#   $2+  format arguments
error()
{
	local fmt="${1}"
	shift 1

	case "${fname}" in
		'-')
			printf "stdin:%d: ${fmt}\n" ${lineno} "${@}" >&2
			;;
		*)
			printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}" >&2
			;;
	esac

	# The parser and lexer run in a subshell, so this just returns up to the
	# caller like an exception.
	exit 1
}

# Report the current token as unexpected.
#   $1  expected token, or '' when nothing specific was expected
synexp()
{
	local t="${1}"
	shift 1

	if [ "x${t}" = 'x' ]; then
		synerr '%s unexpected' "$(tokname "${tok}")"
	else
		synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
			"$(tokname "${t}")"
	fi
}

# Report a syntax error (printf-style arguments) through error().
synerr()
{
	local fmt="${1}"
	shift 1

	error "Syntax error: ${fmt}" "${@}"
}

#
# Input reading
#

# Read a single byte from standard input into ${c} (empty at end of input).
lgetc()
{
	# The '.' is appended and stripped so that a newline byte survives
	# command substitution.
	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
	c="${c%.}"
}

#
# Token recognition
#

# Recognize the next token, starting at the current character ${c}, and
# store it in ${tok}.
next()
{
	if ${here_awaiting_word}; then
		next_here
		return
	fi
	while :; do
		dbg "parsing char '$c' at lineno $lineno"
		case "${c}" in
			'')
				lgetc
				tok=T_EOF
				return
				;;
			"${LF}")
				if ${here_awaiting_end}; then
					synexp ''
				else
					# A complete here-document request is queued:
					# its body starts after this newline.
					case "${here_queue}" in *"${RS}"*)
						here_awaiting_end=false
						here_awaiting_word=true
						;;
					esac
				fi
				lgetc
				lineno=$((${lineno} + 1))
				tok=T_NEWLINE
				return
				;;
			' '|"${HT}")
				lgetc
				continue
				;;
			\\)
				lgetc
				# Backslash-newline is a line continuation.
				case "${c}" in "${LF}")
					lineno=$((${lineno} + 1))
					lgetc
					continue
					;;
				esac
				next_word \\
				return
				;;
			'#')
				# Comment: skip to the end of the line.
				lgetc
				while :; do
					case "${c}" in "${LF}"|'')
						break
						;;
					esac
					lgetc
				done
				continue
				;;
			'&')
				lgetc
				case "${c}" in '&')
					lgetc
					tok=T_AND_IF
					return
					;;
				esac
				tok=T_AND
				return
				;;
			'|')
				lgetc
				case "${c}" in '|')
					lgetc
					tok=T_OR_IF
					return
					;;
				esac
				tok=T_PIPE
				return
				;;
			';')
				lgetc
				case "${c}" in ';')
					lgetc
					tok=T_DSEMI
					return
					;;
				esac
				dbg T_SEMI
				tok=T_SEMI
				return
				;;
			'(')
				lgetc
				tok=T_LPAREN
				return
				;;
			')')
				lgetc
				tok=T_RPAREN
				return
				;;
			'<'|'>')
				next_io
				return
				;;
			*)
				next_word ''
				return
				;;
		esac
		lgetc
	done
}

# Read the body of the first queued here-document and store it, including
# the line that matches the delimiter, in ${tok} as a T_WORD.
next_here()
{
	local here=
	local here_strip_tabs=
	local here_end=
	local here_escaped=
	local line=
	local word=
	local res=
	local wordexp=

	# Dequeue the here-document.
	# Queue entry layout: strip_tabs US delimiter_word US escaped
	here="${here_queue%%${RS}*}"
	here_strip_tabs="${here%%${US}*}"
	here_end="${here%${US}*}"
	here_end="$(printf '%s' "${here_end#*${US}}" | \
		sed 's/\\//g; s/"//g; s/'\''//g;')"  # Stupid Vim: ')"
	here_escaped="${here##*${US}}"
	here_queue="${here_queue#*${RS}}"
	here_awaiting_word=false

	line=''
	word=''
	while :; do
		case "${c}" in
			'')
				# Bash throws a warning when EOF occurs in a
				# here document.  mksh throws an error.  dash,
				# BusyBox ash, ksh93, and zsh accept EOF as a
				# delimiter.  We aim for the lowest common
				# denominator, so throw an error like mksh does.
				synerr 'Here-document "%s" unclosed' \
					"${here_end}"
				;;
			"${LF}")
				word="${word}${line}"
				case "${line}" in "${here_end}")
					tok="T_WORD${US}${word}"
					return
					;;
				esac
				word="${word}${c}"
				line=''
				;;
			"${HT}")
				if ${here_strip_tabs}; then
					# Drop tabs only at the start of a line.
					case "${line}" in
						'')
							;;
						*)
							line="${line}${c}"
							;;
					esac
				else
					line="${line}${c}"
				fi
				;;
			'$')
				if ! ${here_escaped}; then
					lgetc
					if ! res="$(scan_wordexp)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					# The expansion text may itself contain RS
					# (token list of a command substitution), so
					# take everything after the second field.
					wordexp="${res#*${RS}}"
					lineno=$((${lineno} + ${ln_off}))
					line="${line}${wordexp}"
					# scan_wordexp() leaves behind an unused
					# character; skip the lgetc() call below.
					continue
				else
					line="${line}${c}"
				fi
				;;
			*)
				line="${line}${c}"
				;;
		esac
		lgetc
	done
}

# Recognize a redirection operator starting at ${c} ('<' or '>') and store
# it in ${tok}.  Here-document operators also start a new here_queue entry.
#
# Every branch leaves with "return": there is no loop in this function, so
# "break" would not reliably end it and later assignments to ${tok} could
# overwrite the operator that was just recognized.
next_io()
{
	case "${c}" in
		'<')
			lgetc
			case "${c}" in
				'<')
					lgetc
					case "${c}" in '-')
						lgetc
						tok=T_DLESSDASH
						here_queue="${here_queue}true"
						here_awaiting_end=true
						here_awaiting_word=false
						return
						;;
					esac
					tok=T_DLESS
					here_queue="${here_queue}false"
					here_awaiting_end=true
					here_awaiting_word=false
					return
					;;
				'&')
					lgetc
					tok=T_LESSAND
					return
					;;
				'>')
					lgetc
					tok=T_LESSGREAT
					return
					;;
			esac
			tok=T_LESS
			return
			;;
		'>')
			lgetc
			case "${c}" in
				'>')
					lgetc
					tok=T_DGREAT
					return
					;;
				'&')
					lgetc
					tok=T_GREATAND
					return
					;;
				'|')
					lgetc
					tok=T_CLOBBER
					return
					;;
			esac
			tok=T_GREAT
			return
			;;
	esac
}

# Scan a word and store it in ${tok} as a T_WORD.
#   $1  text already consumed that belongs in front of the word
# When a here-document operator is waiting for its delimiter, the word also
# completes the pending here_queue entry.
next_word()
{
	local prev_c="${1}"
	shift 1
	local res=
	local word=

	if ! res="$(scan_word false)"; then
		exit 1
	fi
	ln_off=${res%%${RS}*}
	res="${res#*${RS}}"
	c="${res%%${RS}*}"
	# The word may contain RS inside an embedded command substitution, so
	# take everything after the second field.
	word="${prev_c}${res#*${RS}}"

	# We must advance lineno because scan_word() was run in a subshell.
	lineno=$((${lineno} + ${ln_off}))
	tok="T_WORD${US}${word}"

	if ${here_awaiting_end}; then
		here_queue="${here_queue}${US}${word}"
		# A delimiter with any quoting disables expansions in the body.
		case "${word}" in
			*\\*|*'"'*|*"'"*)
				here_queue="${here_queue}${US}true"
				;;
			*)
				here_queue="${here_queue}${US}false"
				;;
		esac
		here_queue="${here_queue}${RS}"
		here_awaiting_end=false
	fi
}

#
# Token scanning
#

# Scan a word starting at ${c}.
#   $1  "true" when scanning the word of a parameter expansion (which ends
#       at an unquoted "}"), "false" for an ordinary word
# Prints "lines RS char RS word": the number of newlines consumed, the first
# character after the word, and the word text.
scan_word()
{
	local in_param="${1}"
	shift 1
	local res=
	local word=
	local quoted=
	local lines=
	local wordexp=

	word=''
	quoted=false
	lines=0
	while :; do
		dbg "parsing word char '$c' at lineno $lineno"
		case "${c}" in
			'')
				break
				;;
			"${LF}")
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				lineno=$((${lineno} + 1))
				lines=$((${lines} + 1))
				word="${word}${c}"
				;;
			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			'$')
				case "${here_queue}" in *"${RS}"*)
					if ${here_awaiting_end}; then
						synerr '%s %s %s %s' \
							'Word expansions' \
							'not supported in' \
							'here-document' \
							'delimiters'
					fi
					;;
				esac
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				# The expansion text may contain RS; take the
				# whole remainder.
				wordexp="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.  The offset is also
				# added to the count reported to our own caller.
				lineno=$((${lineno} + ${ln_off}))
				lines=$((${lines} + ${ln_off}))
				word="${word}${wordexp}"
				# scan_wordexp() leaves behind an unused
				# character, so we should skip the lgetc() call
				# below.
				continue
				;;
			'`')
				synerr 'Backquoted (old-style) %s' \
					'command substitution not supported'
				break
				;;
			\\)
				word="${word}${c}"
				lgetc
				case "${c}" in '')
					# Bash, ksh93, mksh, and zsh ignore a
					# backslash at the end of a file, but
					# dash and BusyBox ash include it in the
					# word.  To help with script
					# portability, we'll throw an error
					# (which is a reasonable thing to do
					# anyway).
					synerr 'Unexpected end of file %s' \
						'after "\"'
					;;
				esac
				word="${word}${c}"
				;;
			\')
				word="${word}${c}"
				while :; do
					lgetc
					word="${word}${c}"
					case "${c}" in
						'')
							synerr '%s %s' \
								'Unterminated' \
								'quoted string'
							;;
						\')
							break
							;;
					esac
				done
				;;
			'"')
				word="${word}${c}"
				if ${quoted}; then
					quoted=false
				else
					quoted=true
				fi
				;;
			'}')
				if ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			*)
				word="${word}${c}"
				;;
		esac
		lgetc
	done

	if ${quoted}; then
		synerr 'Unterminated quoted string'
	fi

	printf "%d${RS}%c${RS}%s" ${lines} "${c}" "${word}"
}

# Scan a word expansion; ${c} is the character that follows the "$".
# Prints "lines RS char RS text": the number of lines consumed, the first
# character after the expansion, and the expansion text.  For a command
# substitution the text is "$(" SOH "C" STX token-list ETX ")".
scan_wordexp()
{
	local res=
	local toks=
	local param=
	local sub_ln=

	wordexp=''
	ln_off=0
	case "${c}" in
		'{')
			# Parameter expansion brace
			scan_wordexp_param_brace
			;;
		'(')
			# Arithmetic expansion or command substitution
			lgetc
			case "${c}" in
				'(')
					# Arithmetic expansion
					scan_wordexp_arith
					;;
				*)
					# Command substitution
					if ! res="$(run_sublexer "sub${fname}" \
						${lineno} "${start}" \
						"${c}")"; then
						exit 1
					fi
					# run_sublexer() reports the absolute line
					# number it ended on.
					sub_ln=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					# The token list is RS-separated, so it is
					# the whole remainder, not just the text up
					# to the next RS.
					toks="${res#*${RS}}"
					# Callers add ln_off to their own lineno, so
					# it must be relative.
					ln_off=$((${sub_ln} - ${lineno}))
					lineno=${sub_ln}
					wordexp="\$(${SOH}C${STX}${toks}"
					wordexp="${wordexp}${ETX})"
					# ")" is recognized in run_sublexer().
					;;
			esac
			;;
		[@*#?$!A-Za-z0-9_-])
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			wordexp="\$${param}"
			;;
	esac

	printf "%d${RS}%c${RS}%s" ${ln_off} "${c}" "${wordexp}"
	return 0
}

# Scan a braced parameter expansion; ${c} is the "{".  The expansion text
# is stored in ${wordexp} and ${c} is left on the character after the "}".
scan_wordexp_param_brace()
{
	local mod=
	local res=
	local param=
	local word=

	mod=true

	lgetc
	case "${c}" in
		'#')
			lgetc
			case "${c}" in
				[@*#?$!A-Za-z0-9_-])
					# String length expansion
					if ! res="$(scan_param)"; then
						exit 1
					fi
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					c="${res%%${RS}*}"
					res="${res#*${RS}}"
					param="${res%%${RS}*}"
					lineno=$((${lineno} + ${ln_off}))
					# Disable modifications.
					mod=false
					;;
				*)
					# Special parameter "#"
					param='#'
					;;
			esac
			;;
		*)
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			c="${res%%${RS}*}"
			res="${res#*${RS}}"
			param="${res%%${RS}*}"
			lineno=$((${lineno} + ${ln_off}))
			;;
	esac
	wordexp="\${${param}"

	# If modifications are allowed
	if ${mod}; then
		# Check for modifications.
		mod=false
		case "${c}" in
			':')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '-'|'='|'?'|'+')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'-'|'='|'?'|'+')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				;;
			'%')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '%')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'#')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				case "${c}" in '#')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
		esac
	fi

	# If a modification was found
	if ${mod}; then
		# Get word.
		if ! res="$(scan_word true)"; then
			exit 1
		fi
		ln_off=${res%%${RS}*}
		res="${res#*${RS}}"
		c="${res%%${RS}*}"
		# The word may contain RS; take the whole remainder.
		word="${res#*${RS}}"
		# We must advance lineno because scan_word() was run in a
		# subshell.
		lineno=$((${lineno} + ${ln_off}))
		wordexp="${wordexp}${word}"
		dbg "param mod word: '$word'"
	fi

	# Check for right brace.
	case "${c}" in
		'}')
			wordexp="${wordexp}${c}"
			lgetc
			;;
		*)
			synerr 'Missing "}"'
			;;
	esac

	return 0
}

# Scan a parameter name starting at ${c}: a special parameter, a positional
# parameter, or a name.  Prints "0 RS char RS name".
scan_param()
{
	local param=

	param=''
	case "${c}" in
		[@*#?$!0-])
			# Special parameter
			param="${c}"
			lgetc
			;;
		[1-9])
			# Positional parameter
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!0-9])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		[A-Za-z_])
			# Parameter name
			param="${param}${c}"
			lgetc
			while :; do
				case "${c}" in [!A-Za-z0-9_])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		*)
			synerr 'Bad parameter name'
			;;
	esac

	printf "%d${RS}%c${RS}%s" 0 "${c}" "${param}"
	return 0
}

# Scan an arithmetic expansion; ${c} is the second "(" of "$((".  The
# expansion text is stored in ${wordexp} and ${c} is left on the character
# after the closing "))".
scan_wordexp_arith()
{
	local arith=
	local paren_lvl=
	local res=
	local sub_wordexp=

	arith=''
	paren_lvl=0
	# The next character is read at the END of each iteration so that the
	# look-ahead character left behind by a nested scan_wordexp() is
	# processed instead of being discarded.
	lgetc
	while :; do
		case "${c}" in
			'')
				synerr 'end of file unexpected (%s)' \
					'expecting "))"'
				;;
			'(')
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} + 1))
				;;
			')')
				if [ ${paren_lvl} -eq 0 ]; then
					lgetc
					case "${c}" in ')')
						wordexp="\$((${arith}))"
						lgetc
						return 0
						;;
					esac
					synerr 'Arithmetic expansion: ")" %s' \
						'unexpected'
				fi
				arith="${arith}${c}"
				paren_lvl=$((${paren_lvl} - 1))
				;;
			'$')
				lgetc
				if ! res=$(scan_wordexp); then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				c="${res%%${RS}*}"
				# The expansion text may contain RS; take the
				# whole remainder.
				sub_wordexp="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				arith="${arith}${sub_wordexp}"
				# ${c} already holds the next character.
				continue
				;;
			*)
				arith="${arith}${c}"
				;;
		esac
		lgetc
	done
}

# Lex and parse the inside of a command substitution.
#   $1  input name used in error messages
#   $2  line number to start counting from
#   $3  name of the start-symbol function to run
#   $4  first character of the substitution (already read)
# Prints "lineno RS char RS tokens": the line number reached (absolute), the
# first character after the closing ")", and the RS-terminated token list.
run_sublexer()
{
	local fn="${1}"
	local ln="${2}"
	local st="${3}"
	local ch="${4}"
	shift 4

	# Initialize global variables.
	fname="${fn}"
	lineno=${ln}
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	c="${ch}"
	next

	#dbg=true
	# If this returns (does not exit), there are no errors.
	${start}
	case "${tok%${US}*}" in
		T_RPAREN)
			;;
		*)
			synerr 'Missing ")"'
			;;
	esac

	printf "%d${RS}%c${RS}%s" ${lineno} "${c}" "${tokens}"
	return 0
}

#
# Interface
#

# Lex and parse standard input and print the accepted tokens.
#   $1  input name used in error messages ("-" is reported as "stdin")
#   $2  name of the start-symbol function to run
run_lexer()
{
	local fn="${1}"
	local st="${2}"
	shift 2

	# Initialize global variables.
	fname="${fn}"
	lineno=1
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false
	tokens=''

	# Read the first character and recognize the first token.
	lgetc
	next

	if ! ${start}; then
		# Unexpected EOF
		synexp ''
	fi
	if ! accept T_EOF; then
		synexp ''
	fi

	# Return the tokens.
	printf '%s' "${tokens}"

	return 0
}

# Accept the current token if it is of the requested type.
#   $1  token type
# On success the token is appended to ${tokens}, the next token is read and
# 0 is returned; otherwise 1 is returned.  Reserved words, names, function
# names and command names all arrive as T_WORD and are reclassified here.
accept()
{
	local t="${1}"
	shift 1
	local rw=

	dbg "looking for $t, current tok ${tok%%${US}*}"
	case "${t}" in
		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|\
		T_DO|T_DONE|T_CASE|T_ESAC|T_WHILE|T_UNTIL|\
		T_FOR|T_LBRACE|T_RBRACE|T_BANG|T_IN)
			dbg "looking for reserved word $t, have '$tok'"
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				# Reserved words are recognized as literal
				# T_WORDs.
				if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
					return 1
				fi
				# T_WORD data unit must match reserved word
				# exactly.
				if ! [ "x${tok#T_WORD${US}}" = \
					"x$(toktext "${t}")" ]; then
					return 1
				fi
				# If the token matches the reserved word,
				# replace it with the reserved word token.
				tok="${t}"
			fi
			;;
		T_NAME)
			# Names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name.  The check is made on the word text
			# (not on the token type): reject an empty word, a
			# bad first character, or any bad character.
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			tok="T_NAME${US}${tok#T_WORD${US}}"
			;;
		T_FNAME)
			# Function names are recognized as literal T_WORDs.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Validate name (same rule as T_NAME above).
			case "${tok#T_WORD${US}}" in
				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
					return 1
					;;
			esac
			# Verify that the function name doesn't match any
			# reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
				T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
				T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
					"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			tok="T_FNAME${US}${tok#T_WORD${US}}"
			;;
		T_CMDNAME)
			# The first word of a simple command is to be checked
			# for reserved words.
			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
				return 1
			fi
			# Verify that the word doesn't match any reserved words.
			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
				T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
				T_LBRACE T_RBRACE T_BANG T_IN; do
				if [ "x${tok#T_WORD${US}}" = \
					"x$(toktext "${rw}")" ]; then
					tok="${rw}"
					return 1
				fi
			done
			;;
		*)
			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
				return 1
			fi
			;;
	esac

	dbg "accept $t"
	tokens="${tokens}${tok}${RS}"
	next
	return 0
}

# Like accept(), but a token of any other type is a syntax error.
#   $1  token type
expect()
{
	local t="${1}"
	shift 1

	if accept "${t}"; then
		return 0
	else
		synexp "${t}"
	fi
}
# diff --git a/eshtrans/frontend/main.esh b/eshtrans/frontend/main.esh new file mode 100644 index 0000000..b9f93a6 --- /dev/null +++ b/eshtrans/frontend/main.esh @@ -0,0 +1,30 @@
# Eggshell frontend interface
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Parse the script on standard input and print its tokens.
#   $1  input name used in error messages
# Returns 0 when run_lexer() succeeds, 1 otherwise.
esh_parse()
{
	local fn="${1}"
	shift 1

	if run_lexer "${fn}" complete_command; then
		return 0
	fi
	return 1
}
# diff --git a/eshtrans/frontend/parser.esh b/eshtrans/frontend/parser.esh new file mode 100644 index 0000000..d49fa77 --- /dev/null +++ b/eshtrans/frontend/parser.esh @@ -0,0 +1,591 @@
# Eggshell parser
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# Set to "true" to make the ptrace_*() functions print their messages.
ptrace=false

#
# Function tracing
#

# Print a "BEGN" trace line for parser function $1 when ${ptrace} is true.
ptrace_begn()
{
	local fn="${1}"
	shift 1

	if ${ptrace}; then
		printf 'TRACE: BEGN %s()\n' "${fn}" >&2
	fi
}

# Print a "PASS" trace line for parser function $1 when ${ptrace} is true.
ptrace_pass()
{
	local fn="${1}"
	shift 1

	if ${ptrace}; then
		printf 'TRACE: PASS %s()\n' "${fn}" >&2
	fi
}

# Print a "FAIL" trace line for parser function $1 when ${ptrace} is true.
ptrace_fail()
{
	local fn="${1}"
	shift 1

	if ${ptrace}; then
		printf 'TRACE: FAIL %s()\n' "${fn}" >&2
	fi
}

#
# Parser
#
# Each function below recognizes one grammar symbol by calling accept(),
# expect() and the other symbol functions.  It returns 0 when the symbol
# was recognized and 1 otherwise.
#

# complete_command: list [separator]
complete_command()
{
	if list; then
		separator
		return 0
	fi
	return 1
}

# list: and_or {separator and_or}
list()
{
	ptrace_begn list
	if and_or; then
		while separator && and_or; do
			:
		done
		ptrace_pass list
		return 0
	fi
	ptrace_fail list
	return 1
}

# and_or: pipeline {("&&" | "||") linebreak pipeline}
and_or()
{
	ptrace_begn and_or
	if pipeline; then
		while accept T_AND_IF || accept T_OR_IF; do
			if ! linebreak || ! pipeline; then
				ptrace_fail and_or
				return 1
			fi
		done
		ptrace_pass and_or
		return 0
	fi
	ptrace_fail and_or
	return 1
}

# pipeline: ["!"] pipe_sequence
pipeline()
{
	ptrace_begn pipeline
	accept T_BANG
	if pipe_sequence; then
		ptrace_pass pipeline
		return 0
	fi
	ptrace_fail pipeline
	return 1
}

# pipe_sequence: command {"|" linebreak command}
pipe_sequence()
{
	ptrace_begn pipe_sequence
	if command; then
		while accept T_PIPE; do
			if ! linebreak || ! command; then
				ptrace_fail pipe_sequence
				return 1
			fi
		done
		ptrace_pass pipe_sequence
		return 0
	fi
	ptrace_fail pipe_sequence
	return 1
}

# command: simple_command | compound_command [redirect_list]
command()
{
	ptrace_begn command
	if simple_command; then
		ptrace_pass command
		return 0
	elif compound_command; then
		redirect_list
		ptrace_pass command
		return 0
	fi
	ptrace_fail command
	return 1
}

# compound_command: the first of brace_group, subshell, for_clause,
# case_clause, if_clause, while_clause, until_clause that is recognized
compound_command()
{
	ptrace_begn compound_command
	if brace_group; then
		ptrace_pass compound_command
		return 0
	elif subshell; then
		ptrace_pass compound_command
		return 0
	elif for_clause; then
		ptrace_pass compound_command
		return 0
	elif case_clause; then
		ptrace_pass compound_command
		return 0
	elif if_clause; then
		ptrace_pass compound_command
		return 0
	elif while_clause; then
		ptrace_pass compound_command
		return 0
	elif until_clause; then
		ptrace_pass compound_command
		return 0
	fi
	ptrace_fail compound_command
	return 1
}

# subshell: "(" compound_list ")"
subshell()
{
	ptrace_begn subshell
	if accept T_LPAREN && compound_list && expect T_RPAREN; then
		ptrace_pass subshell
		return 0
	fi
	ptrace_fail subshell
	return 1
}

# compound_list: [newline_list] term [separator]
compound_list()
{
	ptrace_begn compound_list
	newline_list
	if term; then
		separator
		ptrace_pass compound_list
		return 0
	fi
	ptrace_fail compound_list
	return 1
}

# term: and_or {separator [and_or]}
term()
{
	ptrace_begn term
	if and_or; then
		while separator; do
			and_or
		done
		ptrace_pass term
		return 0
	fi
	ptrace_fail term
	return 1
}

# for_clause: "for" name linebreak ["in" [wordlist] sequential_sep] do_group
for_clause()
{
	ptrace_begn for_clause
	if accept T_FOR; then
		if expect T_NAME && linebreak; then
			if accept T_IN; then
				wordlist
				if ! sequential_sep; then
					ptrace_fail for_clause
					return 1
				fi
			fi
			if do_group; then
				ptrace_pass for_clause
				return 0
			fi
		fi
	fi
	ptrace_fail for_clause
	return 1
}

# wordlist: word {word}
wordlist()
{
	ptrace_begn wordlist
	if accept T_WORD; then
		while accept T_WORD; do :; done
		ptrace_pass wordlist
		return 0
	fi
	ptrace_fail wordlist
	return 1
}

# case_clause: "case" word linebreak "in" linebreak
#              [case_list | case_list_ns] "esac"
case_clause()
{
	if accept T_CASE; then
		if expect T_WORD && linebreak && expect T_IN && linebreak; then
			case_list || case_list_ns
			expect T_ESAC
			return 0
		fi
	fi
	return 1
}

# case_list_ns: [case_list] case_item_ns
case_list_ns()
{
	if case_list && case_item_ns; then
		return 0
	elif case_item_ns; then
		return 0
	fi
	return 1
}

# case_list: case_item {case_item}
case_list()
{
	if case_item; then
		while case_item; do
			:
		done
		return 0
	fi
	return 1
}

# case_item_ns: ["("] pattern ")" [compound_list] linebreak
# (a case item that is not terminated by ";;")
case_item_ns()
{
	accept T_LPAREN
	# The token type is T_RPAREN; a bare "RPAREN" matches no token, so
	# expect() would always raise a syntax error here.
	if pattern && expect T_RPAREN; then
		compound_list
		if linebreak; then
			return 0
		fi
	fi
	return 1
}

# case_item: ["("] pattern ")" (compound_list | linebreak) ";;" linebreak
case_item()
{
	accept T_LPAREN
	if pattern && expect T_RPAREN; then
		if compound_list || linebreak; then
			if expect T_DSEMI && linebreak; then
				return 0
			fi
		fi
	fi
	return 1
}

# pattern: word {"|" word}
# The first word is accepted as T_CMDNAME so that reserved words (such as
# "esac") are not taken as a pattern.
pattern()
{
	if accept T_CMDNAME; then
		while accept T_PIPE; do
			expect T_WORD
		done
		return 0
	fi
	return 1
}

# if_clause: "if" compound_list "then" compound_list [else_part] "fi"
if_clause()
{
	if accept T_IF; then
		if compound_list && expect T_THEN && compound_list; then
			else_part
			expect T_FI
			return 0
		fi
	fi
	return 1
}

# else_part: {"elif" compound_list "then" compound_list}
#            ["else" compound_list]
# Returns 0 only when an "else" branch was recognized.
else_part()
{
	while accept T_ELIF; do
		if compound_list && expect T_THEN && compound_list; then
			continue
		fi
		return 1
	done
	if accept T_ELSE; then
		if compound_list; then
			return 0
		fi
	fi
	return 1
}

# while_clause: "while" compound_list do_group
while_clause()
{
	if accept T_WHILE; then
		if compound_list && do_group; then
			return 0
		fi
	fi
	return 1
}

# until_clause: "until" compound_list do_group
until_clause()
{
	if accept T_UNTIL; then
		if compound_list && do_group; then
			return 0
		fi
	fi
	return 1
}

# function_body: compound_command [redirect_list]
function_body()
{
	ptrace_begn function_body
	if compound_command; then
		redirect_list
		ptrace_pass function_body
		return 0
	fi
	ptrace_fail function_body
	return 1
}

# brace_group: "{" compound_list "}"
brace_group()
{
	ptrace_begn brace_group
	if accept T_LBRACE && compound_list && expect T_RBRACE; then
		ptrace_pass brace_group
		return 0
	fi
	ptrace_fail brace_group
	return 1
}

# do_group: "do" compound_list "done"
do_group()
{
	ptrace_begn do_group
	if accept T_DO && compound_list && expect T_DONE; then
		ptrace_pass do_group
		return 0
	fi
	ptrace_fail do_group
	return 1
}

# simple_command: cmd_prefix [cmd_word [cmd_suffix]]
#               | fname "(" ")" linebreak function_body
#               | fname [cmd_suffix]
#               | cmd_name [cmd_suffix]
simple_command()
{
	ptrace_begn simple_command
	if cmd_prefix; then
		if cmd_word; then
			cmd_suffix
		fi
		ptrace_pass simple_command
		return 0
	elif accept T_FNAME; then
		if accept T_LPAREN; then
			# Function definition
			expect T_RPAREN
			if linebreak && function_body; then
				ptrace_pass simple_command
				return 0
			fi
		else
			cmd_suffix
			ptrace_pass simple_command
			return 0
		fi
	elif cmd_name; then
		cmd_suffix
		ptrace_pass simple_command
		return 0
	fi
	ptrace_fail simple_command
	return 1
}

# cmd_name: a word that is not a reserved word
cmd_name()
{
	ptrace_begn cmd_name
	# TODO: Assignment
	if accept T_CMDNAME; then
		ptrace_pass cmd_name
		return 0
	fi
	ptrace_fail cmd_name
	return 1
}

# cmd_word: word
cmd_word()
{
	ptrace_begn cmd_word
	# TODO: Assignment
	if accept T_WORD; then
		ptrace_pass cmd_word
		return 0
	fi
	ptrace_fail cmd_word
	return 1
}

# cmd_prefix: (io_redirect | assignment_word) {io_redirect | assignment_word}
cmd_prefix()
{
	ptrace_begn cmd_prefix
	if io_redirect || accept T_ASSIGNMENT_WORD; then
		while io_redirect || accept T_ASSIGNMENT_WORD; do
			:
		done
		ptrace_pass cmd_prefix
		return 0
	fi
	ptrace_fail cmd_prefix
	return 1
}

# cmd_suffix: (io_redirect | word) {io_redirect | word}
cmd_suffix()
{
	ptrace_begn cmd_suffix
	if io_redirect || accept T_WORD; then
		while io_redirect || accept T_WORD; do
			:
		done
		ptrace_pass cmd_suffix
		return 0
	fi
	ptrace_fail cmd_suffix
	return 1
}

# redirect_list: io_redirect {io_redirect}
redirect_list()
{
	ptrace_begn redirect_list
	if io_redirect; then
		while io_redirect; do
			:
		done
		ptrace_pass redirect_list
		return 0
	fi
	ptrace_fail redirect_list
	return 1
}

# io_redirect: io_file | io_here
io_redirect()
{
	ptrace_begn io_redirect
	if io_file || io_here; then
		ptrace_pass io_redirect
		return 0
	fi
	ptrace_fail io_redirect
	return 1
}

# io_file: ("<" | "<&" | ">" | ">&" | ">>" | "<>" | ">|") filename
io_file()
{
	if accept T_LESS || accept T_LESSAND || accept T_GREAT || \
		accept T_GREATAND || accept T_DGREAT || \
		accept T_LESSGREAT || accept T_CLOBBER; then
		if filename; then
			return 0
		fi
	fi
	return 1
}

# filename: word
filename()
{
	if accept T_WORD; then
		return 0
	fi
	return 1
}

# io_here: ("<<" | "<<-") here_end
io_here()
{
	if accept T_DLESS || accept T_DLESSDASH; then
		if here_end; then
			return 0
		fi
	fi
	return 1
}

# here_end: word
here_end()
{
	if accept T_WORD; then
		return 0
	fi
	return 1
}

# newline_list: newline {newline}
newline_list()
{
	if accept T_NEWLINE; then
		while accept T_NEWLINE; do
			:
		done
		return 0
	fi
	return 1
}

# linebreak: [newline_list]  (always succeeds)
linebreak()
{
	newline_list
	return 0
}

# separator_op: "&" | ";"
separator_op()
{
	if accept T_AND || accept T_SEMI; then
		return 0
	fi
	return 1
}

# separator: separator_op linebreak | newline_list
separator()
{
	if separator_op && linebreak; then
		return 0
	elif newline_list; then
		return 0
	fi
	return 1
}

# sequential_sep: ";" linebreak | newline_list
sequential_sep()
{
	ptrace_begn sequential_sep
	if accept T_SEMI; then
		if linebreak; then
			ptrace_pass sequential_sep
			return 0
		fi
	elif newline_list; then
		ptrace_pass sequential_sep
		return 0
	fi
	ptrace_fail sequential_sep
	return 1
}
# diff --git a/eshtrans/main.esh b/eshtrans/main.esh new file mode 100644 index 0000000..2a80504 --- /dev/null +++ b/eshtrans/main.esh @@ -0,0 +1,101 @@
# Eggshell Compiler entry point
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.

# try LINE...
#
# Debugging aid: feeds the given script lines (one argument per line) to
# esh_parse on standard input and prints
#   - the script itself,
#   - the raw token string with the SOH/STX/ETX/RS/US separator characters
#     replaced by readable <SOH>, <STX>, ... markers,
#   - one "Token:" line per RS-separated token (plus the token text for
#     T_NAME, T_FNAME, T_CMDNAME and T_WORD), and
#   - the output of sh_codegen for the tokens, one tab-indented line each.
# Prints "FAIL" instead when esh_parse returns non-zero.
#
# esh_parse, sh_codegen, tokname and the SOH/STX/ETX/RS/US/LF variables are
# defined elsewhere.
try()
{
	local tokens=
	local t=
	local try_code=
	local try_noglob=false

	# Remember whether pathname expansion was already disabled by the
	# caller so that it is left in the same state afterwards.
	case "${-}" in
		*f*) try_noglob=true;;
	esac

	printf 'Trying script:\n'
	printf '\t%s\n' "${@}"
	if tokens="$(printf '%s\n' "${@}" | esh_parse -)"; then
		printf 'Tokens: %s\n' "${tokens}" | sed "
			s/${SOH}/<SOH>/g; s/${STX}/<STX>/g; s/${ETX}/<ETX>/g;
			s/${RS}/<RS>/g; s/${US}/<US>/g;
			"
		# The unquoted expansions below are meant to undergo field
		# splitting only.  Pathname expansion is switched off around
		# them so that text such as "*" is not replaced by file names.
		IFS="${RS}"
		set -f
		for t in ${tokens}; do
			printf 'Token: %s\n' "$(tokname "${t}")"
			case "${t%${US}*}" in T_NAME|T_FNAME|T_CMDNAME|T_WORD)
				printf ' "%s"\n' "${t#*${US}}"
				;;
			esac
		done
		"${try_noglob}" || set +f
		printf 'Generated code:\n'
		IFS="${LF}"
		# Capture first so that sh_codegen itself runs with the
		# caller's globbing state; its exit status is ignored.
		try_code="$(sh_codegen "${tokens}")" || :
		set -f
		printf '\t%s\n' ${try_code}
		"${try_noglob}" || set +f
		unset IFS
	else
		printf 'FAIL\n'
	fi
	printf '\n\n'
}

# main
#
# Runs the currently enabled try() sample; the commented-out calls are further
# sample scripts that can be enabled by hand.
main()
{
	#try '"foo bar" && $baz || qux' '${quux%uux quuux'
	#try '"foo bar" && $baz || qux' '${quux%uux } quuux'
	#try 'foo ${bar}'
	#try 'foo ${#bar}'
	#try 'foo ${bar#baz}'
	#try 'foo ${#bar#}'
	#try 'foo ${^}'
	#try 'foo `bar`'
	#try 'foo &&'
	#try '{ foo; }'
	#try '( foo )'
	#try 'for i in 1 2 3; do stuff; done'
	#try 'if foo; then bar; fi'
	#try 'if foo; then bar; elif baz; then qux; else quux; fi'
	#try 'if ; then ; fi'
	#try 'while foo; do bar; done'
	#try 'while ; do ; done'
	#try 'foo(){ bar; }'
	#try 'case foo in bar) baz;; (qux) quux;; quux);; esac'
	#try 'foo bar ( baz )'
	#try 'foo $(bar)'
	#try 'foo $(bar); baz'
	#try 'foo $(bar)' 'baz'
	#try 'foo $(bar) baz'
	#try 'foo$(bar$(baz))qux'
	#try 'foo $((1 + 1))'
	#try '$((1 + 1))'
	#try '$((1 + (1 + 1)))'
	#try '$((1 + $(foo) + 1))'
	#try '$((1'
	#try 'foo <<EOF' 'bar' 'EOF'
	#try 'foo <<-EOF' "${HT}bar" "${HT}EOF"
	#try 'foo <<EOF' '$(bar)' 'EOF'
	#try 'foo <<E"O"F' '$(bar)' 'EOF'
	#try 'foo <<"EOF"' '$(bar)' 'EOF'
	#try 'foo <<E\OF' '$(bar)' 'EOF'
	#try 'foo <<\EOF' '$(bar)' 'EOF'
	#try 'foo <<EOF1; bar <<EOF2' 'baz' 'EOF1' 'qux' 'EOF2'
	#try '\foo'
	#try '"foo bar" baz'
	#try '"foo'
	#try 'foo\" bar'
	#try 'foo\'
	#try "foo'"
	#try 'foo\' 'bar'
	try 'v=foo'
	#try 'if &&'
	#try 'if true; do'
}

# diff --git a/eshtrans/tokens.esh b/eshtrans/tokens.esh
# new file mode 100644
# index 0000000..9da8bb8
# --- /dev/null
# +++ b/eshtrans/tokens.esh
# @@ -0,0 +1,139 @@
# Tokens query functions
#
# Copyright (C) 2016 Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler. If not, see
# <http://www.gnu.org/licenses/>.
# A token is passed around as a string made of its type name (T_...),
# optionally followed by the ${US} separator and the token's text.  US and LF
# are defined elsewhere.

# tokname TOKEN
#
# Prints (without a trailing newline) a human-readable description of TOKEN's
# type, e.g. '"&&"' for T_AND_IF or 'word' for T_WORD.  Types that are not
# listed are described as 'unknown token'.
tokname()
{
	local t="${1}"
	shift 1
	local n=

	# Only the type, i.e. the part before the ${US} separator, is examined.
	case "${t%${US}*}" in
		# Operators
		T_EOF)             n='end of file';;
		T_NEWLINE)         n='newline';;
		T_AND)             n='"&"';;
		T_SEMI)            n='";"';;
		T_AND_IF)          n='"&&"';;
		T_OR_IF)           n='"||"';;
		T_DSEMI)           n='";;"';;
		T_LESS)            n='"<"';;
		T_GREAT)           n='">"';;
		T_DLESS)           n='"<<"';;
		T_DGREAT)          n='">>"';;
		T_LESSAND)         n='"<&"';;
		T_GREATAND)        n='">&"';;
		T_LESSGREAT)       n='"<>"';;
		T_DLESSDASH)       n='"<<-"';;
		T_CLOBBER)         n='">|"';;
		T_PIPE)            n='"|"';;
		T_LPAREN)          n='"("';;
		T_RPAREN)          n='")"';;
		# Reserved words
		T_IF)              n='"if"';;
		T_THEN)            n='"then"';;
		T_ELSE)            n='"else"';;
		T_ELIF)            n='"elif"';;
		T_FI)              n='"fi"';;
		T_DO)              n='"do"';;
		T_DONE)            n='"done"';;
		T_CASE)            n='"case"';;
		T_ESAC)            n='"esac"';;
		T_WHILE)           n='"while"';;
		T_UNTIL)           n='"until"';;
		T_FOR)             n='"for"';;
		T_LBRACE)          n='"{"';;
		T_RBRACE)          n='"}"';;
		T_BANG)            n='"!"';;
		T_IN)              n='"in"';;
		# Special symbols
		T_NAME)            n='parameter name';;
		T_FNAME)           n='function name';;
		T_CMDNAME)         n='command name';;
		T_IO_NUMBER)       n='I/O number';;
		T_WORD)            n='word';;
		T_ASSIGNMENT_WORD) n='assignment word';;
		# Unknown
		*)                 n='unknown token';;
	esac

	printf '%s' "${n}"
}

# toktext TOKEN
#
# Prints (without a trailing newline) the source text that TOKEN stands for:
# the fixed spelling for operators and reserved words, ${LF} for T_NEWLINE, and
# the text stored after the ${US} separator for the value-carrying types.
# T_EOF and unlisted types print nothing.
toktext()
{
	local t="${1}"
	shift 1
	local n=

	# Only the type, i.e. the part before the ${US} separator, is examined.
	case "${t%${US}*}" in
		# Operators
		T_EOF)       n='';;
		T_NEWLINE)   n="${LF}";;
		T_AND)       n='&';;
		T_SEMI)      n=';';;
		T_AND_IF)    n='&&';;
		T_OR_IF)     n='||';;
		T_DSEMI)     n=';;';;
		T_LESS)      n='<';;
		T_GREAT)     n='>';;
		T_DLESS)     n='<<';;
		T_DGREAT)    n='>>';;
		T_LESSAND)   n='<&';;
		T_GREATAND)  n='>&';;
		T_LESSGREAT) n='<>';;
		T_DLESSDASH) n='<<-';;
		T_CLOBBER)   n='>|';;
		T_PIPE)      n='|';;
		T_LPAREN)    n='(';;
		T_RPAREN)    n=')';;
		# Reserved words
		T_IF)        n='if';;
		T_THEN)      n='then';;
		T_ELSE)      n='else';;
		T_ELIF)      n='elif';;
		T_FI)        n='fi';;
		T_DO)        n='do';;
		T_DONE)      n='done';;
		T_CASE)      n='case';;
		T_ESAC)      n='esac';;
		T_WHILE)     n='while';;
		T_UNTIL)     n='until';;
		T_FOR)       n='for';;
		T_LBRACE)    n='{';;
		T_RBRACE)    n='}';;
		T_BANG)      n='!';;
		T_IN)        n='in';;
		# Special symbols: the text is whatever follows the first ${US}.
		T_NAME|T_FNAME|T_CMDNAME|T_IO_NUMBER|T_WORD|T_ASSIGNMENT_WORD)
			n="${t#*${US}}"
			;;
		# Unknown
		*)           n='';;
	esac

	printf '%s' "${n}"
}