summaryrefslogtreecommitdiffstats
path: root/parsing/lexer.sh
diff options
context:
space:
mode:
authorP. J. McDermott <pj@pehjota.net>2016-02-21 21:13:33 (EST)
committer P. J. McDermott <pj@pehjota.net>2016-02-21 21:13:33 (EST)
commit8a9a6865954ade85d4a55f955829ae08941c31b8 (patch)
tree966d1c1a8bc4c6f7ed0671b8c53f1272be77dc4f /parsing/lexer.sh
parent5649a9aa1ce56c0cfdcab088983f2d3a4cb32a4c (diff)
downloadeggshell-8a9a6865954ade85d4a55f955829ae08941c31b8.zip
eggshell-8a9a6865954ade85d4a55f955829ae08941c31b8.tar.gz
eggshell-8a9a6865954ade85d4a55f955829ae08941c31b8.tar.bz2
Remove old demo parsing code
Diffstat (limited to 'parsing/lexer.sh')
-rw-r--r--parsing/lexer.sh958
1 files changed, 0 insertions, 958 deletions
diff --git a/parsing/lexer.sh b/parsing/lexer.sh
deleted file mode 100644
index 886e7f8..0000000
--- a/parsing/lexer.sh
+++ /dev/null
@@ -1,958 +0,0 @@
-fname=
-lineno=
-ln_off=
-start=
-c=
-wordexp=
-here_queue=
-here_awaiting_end=
-here_awaiting_word=
-tok=
-tokens=
-
-#
-# Error handling (used by scanning and interface functions)
-#
-
-# Print a printf-style diagnostic to stderr, prefixed with the input name and
-# the current line number, then exit with status 1.
-#   $1   - printf format string for the message
-#   $2.. - arguments consumed by that format
-# Reads the globals fname and lineno.  An fname of '-' is reported as "stdin".
-error()
-{
-	local fmt="${1}"
-	local src=
-	shift 1
-
-	case "${fname}" in
-		'-')
-			src='stdin'
-			;;
-		*)
-			src="${fname}"
-			;;
-	esac
-
-	# lineno is quoted and defaulted so that an empty value cannot vanish
-	# from the argument list and let %d swallow the first message argument.
-	printf "%s:%d: ${fmt}\n" "${src}" "${lineno:-0}" "${@}" >&2
-
-	# The parser and lexer run in a subshell, so this just returns up to the
-	# caller like an exception.
-	exit 1
-}
-
-synexp()
-{
- local t="${1}"
- shift 1
-
- if [ "x${t}" = 'x' ]; then
- synerr '%s unexpected' "$(tokname "${tok}")"
- else
- synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
- "$(tokname "${t}")"
- fi
-}
-
-synerr()
-{
- local fmt="${1}"
- shift 1
-
- error "Syntax error: ${fmt}" "${@}"
-}
-
-#
-# Input reading
-#
-
-# Read a single byte from standard input into the global c.  c becomes empty
-# when dd delivers nothing.
-lgetc()
-{
-	local raw=
-
-	# Command substitution strips trailing newlines, so a sentinel "." is
-	# appended inside the substitution and removed again afterwards.
-	raw="$(dd bs=1 count=1 2>/dev/null; printf '.')"
-	c="${raw%.}"
-}
-
-#
-# Token recognition
-#
-
-# Recognize the next token and store it in the global tok.
-# On entry c holds the first unconsumed input character; each branch reads
-# ahead with lgetc() so that c again holds the first character after the
-# token.  Word and redirection tokens are delegated to next_word() and
-# next_io(); here-document bodies are delegated to next_here().
-next()
-{
- # A newline was seen while a complete here-document entry was queued, so
- # the input that follows is the here-document body.
- if ${here_awaiting_word}; then
- next_here
- return
- fi
- while :; do
- dbg "parsing char '$c' at lineno $lineno"
- case "${c}" in
- '')
- # An empty c is treated as end of input.
- lgetc
- tok=T_EOF
- return
- ;;
- "${LF}")
- if ${here_awaiting_end}; then
- # A here-document operator has not been followed by
- # its delimiter word yet.
- synexp ''
- else
- # A record separator in the queue means at least one
- # complete here-document entry is waiting for its body.
- case "${here_queue}" in *"${RS}"*)
- here_awaiting_end=false
- here_awaiting_word=true
- ;;
- esac
- fi
- lgetc
- lineno=$((${lineno} + 1))
- tok=T_NEWLINE
- return
- ;;
- ' '|"${HT}")
- # Skip blanks between tokens.
- lgetc
- continue
- ;;
- \\)
- lgetc
- # Backslash-newline is a line continuation: drop both
- # characters and keep scanning.
- case "${c}" in "${LF}")
- lineno=$((${lineno} + 1))
- lgetc
- continue
- ;;
- esac
- # Otherwise the backslash starts a word; pass it along so that
- # next_word() can prepend it.
- next_word \\
- return
- ;;
- '#')
- # Comment: discard everything up to (not including) the
- # newline or end of input.
- lgetc
- while :; do
- case "${c}" in "${LF}"|'')
- break
- ;;
- esac
- lgetc
- done
- continue
- ;;
- '&')
- # "&&" or "&"
- lgetc
- case "${c}" in '&')
- lgetc
- tok=T_AND_IF
- return
- ;;
- esac
- tok=T_AND
- return
- ;;
- '|')
- # "||" or "|"
- lgetc
- case "${c}" in '|')
- lgetc
- tok=T_OR_IF
- return
- ;;
- esac
- tok=T_PIPE
- return
- ;;
- ';')
- # ";;" or ";"
- lgetc
- case "${c}" in ';')
- lgetc
- tok=T_DSEMI
- return
- ;;
- esac
- dbg T_SEMI
- tok=T_SEMI
- return
- ;;
- '(')
- lgetc
- tok=T_LPAREN
- return
- ;;
- ')')
- lgetc
- tok=T_RPAREN
- return
- ;;
- '<'|'>')
- # Redirection operators
- next_io
- return
- ;;
- *)
- # Anything else starts a word.
- next_word ''
- return
- ;;
- esac
- # Not reached: every branch above either returns or continues.
- lgetc
- done
-}
-
-# Read the body of the here-document at the head of here_queue and return it
-# in tok as a T_WORD token.
-# A queue entry has the form "<strip_tabs>US<delimiter>US<escaped>RS":
-# next_io() appends the first field and next_word() appends the rest.
-# On entry c holds the first character of the body.
-next_here()
-{
- local here=
- local here_strip_tabs=
- local here_end=
- local here_escaped=
- local line=
- local word=
- local res=
- local wordexp=
-
- # Dequeue the here-document.
- here="${here_queue%%${RS}*}"
- # First field: "true" for "<<-", "false" for "<<".
- here_strip_tabs="${here%%${US}*}"
- # Middle field: the delimiter word, with backslashes and quote characters
- # removed by sed.
- here_end="${here%${US}*}"
- here_end="$(printf '%s' "${here_end#*${US}}" | \
- sed 's/\\//g; s/"//g; s/'\''//g;')" # Stupid Vim: ')"
- # Last field: "true" if the delimiter word contained a backslash or quote,
- # which disables "$" expansion scanning in the body below.
- here_escaped="${here##*${US}}"
- here_queue="${here_queue#*${RS}}"
- here_awaiting_word=false
-
- # line collects the current input line; word collects the whole body.
- line=''
- word=''
- while :; do
- case "${c}" in
- '')
- # Bash throws a warning when EOF occurs in a
- # here document. mksh throws an error. dash,
- # BusyBox ash, ksh93, and zsh accept EOF as a
- # delimiter. We aim for the lowest common
- # denominator, so throw an error like mksh does.
- synerr 'Here-document "%s" unclosed' \
- "${here_end}"
- ;;
- "${LF}")
- # NOTE(review): the line is appended to word before it is
- # compared with the delimiter, so the delimiter text itself
- # ends up at the end of the token -- confirm this is
- # intended.
- word="${word}${line}"
- case "${line}" in "${here_end}")
- # c is left on the newline that ended the delimiter
- # line; no lgetc() happens before returning.
- tok="T_WORD${US}${word}"
- return
- ;;
- esac
- word="${word}${c}"
- line=''
- ;;
- "${HT}")
- if ${here_strip_tabs}; then
- # Drop tabs only while the line is still empty, i.e.
- # leading tabs.
- case "${line}" in
- '')
- ;;
- *)
- line="${line}${c}"
- ;;
- esac
- else
- line="${line}${c}"
- fi
- ;;
- '$')
- if ! ${here_escaped}; then
- lgetc
- # scan_wordexp() runs in a subshell and prints
- # "<line offset>RS<next char>RS<expansion text>".
- if ! res="$(scan_wordexp)"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- wordexp="${res%%${RS}*}"
- lineno=$((${lineno} + ${ln_off}))
- line="${line}${wordexp}"
- # c already holds the next unread character, so skip
- # the lgetc() at the bottom of the loop.
- continue
- else
- line="${line}${c}"
- fi
- ;;
- *)
- line="${line}${c}"
- ;;
- esac
- lgetc
- done
-}
-
-next_io()
-{
- case "${c}" in
- '<')
- lgetc
- case "${c}" in
- '<')
- lgetc
- case "${c}" in '-')
- lgetc
- tok=T_DLESSDASH
- here_queue="${here_queue}true"
- here_awaiting_end=true
- here_awaiting_word=false
- break
- ;;
- esac
- tok=T_DLESS
- here_queue="${here_queue}false"
- here_awaiting_end=true
- here_awaiting_word=false
- break
- ;;
- '&')
- lgetc
- tok=T_LESSAND
- break
- ;;
- '>')
- lgetc
- tok=T_LESSGREAT
- break
- ;;
- esac
- tok=T_LESS
- break
- ;;
- '>')
- lgetc
- case "${c}" in
- '>')
- lgetc
- tok=T_DGREAT
- break
- ;;
- '&')
- lgetc
- tok=T_GREATAND
- break
- ;;
- '|')
- lgetc
- tok=T_CLOBBER
- break
- ;;
- esac
- tok=T_GREAT
- break
- ;;
- esac
-}
-
-next_word()
-{
- local prev_c="${1}"
- shift 1
- local res=
- local word=
-
- if ! res="$(scan_word false)"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- word="${prev_c}${res%%${RS}*}"
-
- # We must advance lineno because scan_word() was run in a subshell.
- lineno=$((${lineno} + ${ln_off}))
- tok="T_WORD${US}${word}"
-
- if ${here_awaiting_end}; then
- here_queue="${here_queue}${US}${word}"
- case "${word}" in
- *\\*|*'"'*|*"'"*)
- here_queue="${here_queue}${US}true"
- ;;
- *)
- here_queue="${here_queue}${US}false"
- ;;
- esac
- here_queue="${here_queue}${RS}"
- here_awaiting_end=false
- fi
-}
-
-#
-# Token scanning
-#
-
-# Scan one word starting at c and print
-# "<line offset>RS<next char>RS<word>" on stdout.
-#   $1 - "true" when scanning the word part of a ${...} expansion: the word
-#        then ends only at an unquoted "}" instead of at blanks and
-#        operators
-# Callers run this in a command substitution, so the number of newlines
-# consumed is returned as the line offset.  That count now also covers
-# newlines inside nested expansions, after a backslash and inside single
-# quotes; previously those were lost and later line numbers were too low.
-scan_word()
-{
-	local in_param="${1}"
-	local res=
-	local word=
-	local quoted=
-	local lines=
-	local wordexp=
-
-	word=''
-	quoted=false
-	lines=0
-	while :; do
-		dbg "parsing word char '$c' at lineno $lineno"
-		case "${c}" in
-			'')
-				break
-				;;
-			"${LF}")
-				if ! ${in_param} && ! ${quoted}; then
-					break
-				fi
-				lineno=$((${lineno} + 1))
-				lines=$((${lines} + 1))
-				word="${word}${c}"
-				;;
-			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
-				if ! ${in_param} && ! ${quoted}; then
-					break
-				fi
-				word="${word}${c}"
-				;;
-			'$')
-				case "${here_queue}" in *"${RS}"*)
-					if ${here_awaiting_end}; then
-						synerr '%s %s %s %s' \
-							'Word expansions' \
-							'not supported in' \
-							'here-document' \
-							'delimiters'
-					fi
-					;;
-				esac
-				lgetc
-				if ! res="$(scan_wordexp)"; then
-					exit 1
-				fi
-				ln_off=${res%%${RS}*}
-				res="${res#*${RS}}"
-				c="${res%%${RS}*}"
-				res="${res#*${RS}}"
-				wordexp="${res%%${RS}*}"
-				# scan_wordexp() ran in a subshell, so its line
-				# count must be applied here and also passed on
-				# to our own caller.
-				lineno=$((${lineno} + ${ln_off}))
-				lines=$((${lines} + ${ln_off}))
-				word="${word}${wordexp}"
-				# scan_wordexp() leaves behind an unused
-				# character, so skip the lgetc() call below.
-				continue
-				;;
-			'`')
-				synerr 'Backquoted (old-style) %s' \
-					'command substitution not supported'
-				;;
-			\\)
-				word="${word}${c}"
-				lgetc
-				case "${c}" in
-					'')
-						# Bash, ksh93, mksh, and zsh ignore a
-						# backslash at the end of a file, but
-						# dash and BusyBox ash include it in
-						# the word.  To help with script
-						# portability, throw an error.
-						synerr 'Unexpected end of file %s' \
-							'after "\"'
-						;;
-					"${LF}")
-						lineno=$((${lineno} + 1))
-						lines=$((${lines} + 1))
-						;;
-				esac
-				word="${word}${c}"
-				;;
-			\')
-				# Single-quoted string: copy verbatim up to the
-				# closing quote.
-				word="${word}${c}"
-				while :; do
-					lgetc
-					word="${word}${c}"
-					case "${c}" in
-						'')
-							synerr '%s %s' \
-								'Unterminated' \
-								'quoted string'
-							;;
-						"${LF}")
-							lineno=$((${lineno} + 1))
-							lines=$((${lines} + 1))
-							;;
-						\')
-							break
-							;;
-					esac
-				done
-				;;
-			'"')
-				# Toggle double-quote state; the quote itself is
-				# kept in the word.
-				word="${word}${c}"
-				if ${quoted}; then
-					quoted=false
-				else
-					quoted=true
-				fi
-				;;
-			'}')
-				if ${in_param} && ! ${quoted}; then
-					break
-				fi
-				word="${word}${c}"
-				;;
-			*)
-				word="${word}${c}"
-				;;
-		esac
-		lgetc
-	done
-
-	if ${quoted}; then
-		synerr 'Unterminated quoted string'
-	fi
-
-	printf "%d${RS}%c${RS}%s" "${lines}" "${c}" "${word}"
-}
-
-scan_wordexp()
-{
- local res=
- local toks=
- local param=
-
- wordexp=''
- ln_off=0
- case "${c}" in
- '{')
- # Parameter expansion brace
- scan_wordexp_param_brace
- ;;
- '(')
- # Arithmetic expansion or command substitution
- lgetc
- case "${c}" in
- '(')
- # Arithmetic expansion
- scan_wordexp_arith
- ;;
- *)
- # Command substitution
- if ! res="$(run_sublexer "sub${fname}" \
- ${lineno} "${start}" \
- "${c}")"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- toks="${res%%${RS}*}"
- lineno=${ln_off}
- wordexp="\$(${SOH}C${STX}${toks}"
- wordexp="${wordexp}${ETX})"
- # ")" is recognized in run_sublexer().
- ;;
- esac
- ;;
- [@*#?$!A-Za-z0-9_-])
- if ! res="$(scan_param)"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- param="${res%%${RS}*}"
- lineno=$((${lineno} + ${ln_off}))
- wordexp="\$${param}"
- ;;
- esac
-
- printf "%d${RS}%c${RS}%s" ${ln_off} "${c}" "${wordexp}"
- return 0
-}
-
-# Scan a braced parameter expansion.  On entry c holds the "{" that follows
-# "$".  The expansion text is built up in the global wordexp, and c is left
-# on the first character after the closing "}".
-# NOTE(review): ln_off is overwritten by each nested scan rather than
-# accumulated -- confirm callers do not rely on a total.
-scan_wordexp_param_brace()
-{
- local mod=
- local res=
- local param=
- local word=
-
- # mod first means "a modifier may follow the parameter name".
- mod=true
-
- lgetc
- case "${c}" in
- '#')
- # Either the length operator or the special parameter "#".
- lgetc
- case "${c}" in
- [@*#?$!A-Za-z0-9_-])
- # String length expansion
- if ! res="$(scan_param)"; then
- exit 1
- fi
- # Unpack "<line offset>RS<next char>RS<parameter>".
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- param="${res%%${RS}*}"
- lineno=$((${lineno} + ${ln_off}))
- # Disable modifications.
- mod=false
- ;;
- *)
- # Special parameter "#"
- param='#'
- ;;
- esac
- ;;
- *)
- if ! res="$(scan_param)"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- param="${res%%${RS}*}"
- lineno=$((${lineno} + ${ln_off}))
- ;;
- esac
- # NOTE(review): for the string length form the "#" operator is not copied
- # into wordexp, only the parameter name -- confirm this is intended.
- wordexp="\${${param}"
-
- # If modifications are allowed
- if ${mod}; then
- # Check for modifications.
- # From here on mod means "a modifier was actually found".
- mod=false
- case "${c}" in
- ':')
- # ":" optionally followed by one of - = ? +
- mod=true
- wordexp="${wordexp}${c}"
- lgetc
- case "${c}" in '-'|'='|'?'|'+')
- wordexp="${wordexp}${c}"
- lgetc
- ;;
- esac
- ;;
- '-'|'='|'?'|'+')
- mod=true
- wordexp="${wordexp}${c}"
- lgetc
- ;;
- '%')
- # "%" or "%%"
- mod=true
- wordexp="${wordexp}${c}"
- lgetc
- case "${c}" in '%')
- wordexp="${wordexp}${c}"
- lgetc
- ;;
- esac
- ;;
- '#')
- # "#" or "##"
- mod=true
- wordexp="${wordexp}${c}"
- lgetc
- case "${c}" in '#')
- wordexp="${wordexp}${c}"
- lgetc
- ;;
- esac
- ;;
- esac
- fi
-
- # If a modification was found
- if ${mod}; then
- # Get word.
- if ! res="$(scan_word true)"; then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- word="${res%%${RS}*}"
- # We must advance lineno because scan_word() was run in a
- # subshell.
- lineno=$((${lineno} + ${ln_off}))
- wordexp="${wordexp}${word}"
- dbg "param mod word: '$word'"
- fi
-
- # Check for right brace.
- case "${c}" in
- '}')
- wordexp="${wordexp}${c}"
- lgetc
- ;;
- *)
- synerr 'Missing "}"'
- ;;
- esac
-
- return 0
-}
-
-scan_param()
-{
- local param=
-
- param=''
- case "${c}" in
- [@*#?$!0-])
- # Special parameter
- param="${c}"
- lgetc
- ;;
- [1-9])
- # Positional parameter
- param="${param}${c}"
- lgetc
- while :; do
- case "${c}" in [!0-9])
- break
- ;;
- esac
- param="${param}${c}"
- lgetc
- done
- ;;
- [A-Za-z_])
- # Parameter name
- param="${param}${c}"
- lgetc
- while :; do
- case "${c}" in [!A-Za-z0-9_])
- break
- ;;
- esac
- param="${param}${c}"
- lgetc
- done
- ;;
- *)
- synerr 'Bad parameter name'
- ;;
- esac
-
- printf "%d${RS}%c${RS}%s" 0 "${c}" "${param}"
- return 0
-}
-
-scan_wordexp_arith()
-{
- local arith=
- local paren_lvl=
- local res=
- local sub_wordexp=
-
- arith=''
- paren_lvl=0
- while :; do
- lgetc
- case "${c}" in
- '')
- synerr 'end of file unexpected (%s)' \
- 'expecting "))"'
- ;;
- '(')
- arith="${arith}${c}"
- paren_lvl=$((${paren_lvl} + 1))
- ;;
- ')')
- if [ ${paren_lvl} -eq 0 ]; then
- lgetc
- case "${c}" in ')')
- wordexp="\$((${arith}))"
- lgetc
- return 0
- ;;
- esac
- synerr 'Arithmetic expansion: ")" %s' \
- 'unexpected'
- fi
- arith="${arith}${c}"
- paren_lvl=$((${paren_lvl} - 1))
- ;;
- '$')
- lgetc
- if ! res=$(scan_wordexp); then
- exit 1
- fi
- ln_off=${res%%${RS}*}
- res="${res#*${RS}}"
- c="${res%%${RS}*}"
- res="${res#*${RS}}"
- sub_wordexp="${res%%${RS}*}"
- # We must advance lineno because scan_wordexp()
- # was run in a subshell.
- lineno=$((${lineno} + ${ln_off}))
- arith="${arith}${sub_wordexp}"
- ;;
- *)
- arith="${arith}${c}"
- ;;
- esac
- done
-}
-
-#
-# Interface
-#
-
-# Check the current token. If it matches, add it to the syntax array.
-accept()
-{
- local t="${1}"
- local rw=
-
- dbg "looking for $t, current tok ${tok%%${US}*}"
- case "${t}" in
- T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|\
- T_DO|T_DONE|T_CASE|T_ESAC|T_WHILE|T_UNTIL|\
- T_FOR|T_LBRACE|T_RBRACE|T_BANG|T_IN)
- dbg "looking for reserved word $t, have '$tok'"
- if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
- # Reserved words are recognized as literal
- # T_WORDs.
- if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
- return 1
- fi
- # T_WORD data unit must match reserved word
- # exactly.
- if ! [ "x${tok#T_WORD${US}}" = \
- "x$(toktext "${t}")" ]; then
- return 1
- fi
- # If the token matches the reserved word,
- # replace it with the reserved word token.
- tok="${t}"
- fi
- ;;
- T_NAME)
- # Names are recognized as literal T_WORDs.
- if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
- return 1
- fi
- # Validate name.
- case "${tok%%${US}*}" in
- [A-Za-z_][0-9A-Za-z_]*)
- ;;
- *)
- return 1
- ;;
- esac
- tok="T_NAME${US}${tok#T_WORD${US}}"
- ;;
- T_FNAME)
- # Function names are recognized as literal T_WORDs.
- if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
- return 1
- fi
- # Validate name.
- case "${tok%%${US}*}" in
- [A-Za-z_][0-9A-Za-z_]*)
- ;;
- *)
- return 1
- ;;
- esac
- # Verify that the function name doesn't match any
- # reserved words.
- for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
- T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
- T_LBRACE T_RBRACE T_BANG T_IN; do
- if [ "x${tok#T_WORD${US}}" = \
- "x$(toktext "${rw}")" ]; then
- tok="${rw}"
- return 1
- fi
- done
- tok="T_FNAME${US}${tok#T_WORD${US}}"
- ;;
- T_CMDNAME)
- # The first word of a simple command is to be checked
- # for reserved words.
- if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
- return 1
- fi
- # Verify that the word doesn't match any reserved words.
- for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
- T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
- T_LBRACE T_RBRACE T_BANG T_IN; do
- if [ "x${tok#T_WORD${US}}" = \
- "x$(toktext "${rw}")" ]; then
- tok="${rw}"
- return 1
- fi
- done
- ;;
- *)
- if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
- return 1
- fi
- ;;
- esac
-
- dbg "accept $t"
- tokens="${tokens}${tok}${RS}"
- next
- return 0
-}
-
-expect()
-{
- local t="${1}"
-
- if accept "${t}"; then
- return 0
- else
- synexp "${t}"
- fi
-}
-
-# Called by the lexer, not the parser
-run_sublexer()
-{
- local fn="${1}"
- local ln="${2}"
- local st="${3}"
- local ch="${4}"
- shift 4
-
- # Initialize global variables.
- fname="${fn}"
- lineno=${ln}
- start="${st}"
- here_queue=''
- here_awaiting_end=false
- here_awaiting_word=false
- tokens=''
-
- c="${ch}"
- next
-
- #dbg=true
- # If this returns (does not exit), there are no errors.
- ${start}
- case "${tok%${US}*}" in
- T_RPAREN)
- ;;
- *)
- synerr 'Missing ")"'
- ;;
- esac
-
- printf "%d${RS}%c${RS}%s" ${lineno} "${c}" "${tokens}"
- return 0
-}
-
-# Lex and parse standard input from the beginning.
-#   $1 - input name used in error messages
-#   $2 - command to run as the parser entry point
-# Prints the accepted tokens on stdout.
-run_lexer()
-{
-	local input_name="${1}"
-	local entry="${2}"
-	shift 2
-
-	# Start from a clean lexer state at line 1.
-	tokens=''
-	here_awaiting_word=false
-	here_awaiting_end=false
-	here_queue=''
-	start="${entry}"
-	lineno=1
-	fname="${input_name}"
-
-	# Read the first character and recognize the first token.
-	lgetc
-	next
-
-	# If this returns (does not exit), there are no errors.
-	${start}
-
-	# Nothing but end of input may remain.
-	accept T_EOF || synexp ''
-
-	# Return the tokens.
-	printf '%s' "${tokens}"
-
-	return 0
-}