summaryrefslogtreecommitdiffstats
path: root/parsing/lexer.sh
diff options
context:
space:
mode:
Diffstat (limited to 'parsing/lexer.sh')
-rw-r--r--parsing/lexer.sh457
1 files changed, 457 insertions, 0 deletions
diff --git a/parsing/lexer.sh b/parsing/lexer.sh
new file mode 100644
index 0000000..3081a79
--- /dev/null
+++ b/parsing/lexer.sh
@@ -0,0 +1,457 @@
# Lexer state shared by the functions in this file.
fname=''    # input name given to init_lexer(); '-' is reported as "stdin"
lineno=''   # current line number, set to 1 by init_lexer()
c=''        # character most recently read by pgetc(); empty at end of input
tok=''      # current token, set by next()
tokens=''   # accepted tokens, each followed by ${RS}
+
# Read one byte from standard input into the global ${c}.
# At end of input ${c} is left empty.
pgetc()
{
    local raw=

    # Command substitution drops trailing newlines, so a '.' is printed
    # after the byte and stripped again; a newline read from the input
    # therefore survives.
    raw="$(dd bs=1 count=1 2>/dev/null; printf '.')"
    c="${raw%.}"
}
+
# Scan the next token from the input and store it in the global ${tok}.
#
# Globals read:    c, LF, HT, US
# Globals written: c (via pgetc), tok, lineno
#
# Blanks, comments and backslash-newline continuations are skipped.
# Operators are recognised here; redirection operators are delegated to
# next_io() and everything else to next_word().
next()
{
    local word=

    while :; do
        echo "parsing char '$c'" >&2
        case "${c}" in
        '')
            # End of input.
            pgetc
            tok=T_EOF
            return
            ;;
        "${LF}")
            pgetc
            lineno=$((${lineno} + 1))
            tok=T_NEWLINE
            return
            ;;
        ' '|"${HT}")
            # Blanks between tokens are ignored.
            pgetc
            continue
            ;;
        \\)
            pgetc
            case "${c}" in "${LF}")
                # Line continuation: drop both characters.
                lineno=$((${lineno} + 1))
                pgetc
                continue
                ;;
            esac
            # The backslash has already been consumed, so next_word()
            # cannot see it.  Put it back in front of the scanned word
            # so the word keeps its source text, as it does for quote
            # characters.
            # NOTE(review): scan_word() gives the escaped character no
            # special treatment, so an escaped blank still ends the word.
            next_word
            word="${tok#"T_WORD${US}"}"
            tok="T_WORD${US}\\${word}"
            return
            ;;
        '#')
            # Comment: skip up to, but not including, the newline.
            pgetc
            while :; do
                case "${c}" in "${LF}"|'')
                    break
                    ;;
                esac
                pgetc
            done
            continue
            ;;
        '&')
            pgetc
            case "${c}" in '&')
                pgetc
                tok=T_AND_IF
                return
                ;;
            esac
            tok=T_AND
            return
            ;;
        '|')
            pgetc
            case "${c}" in '|')
                pgetc
                tok=T_OR_IF
                return
                ;;
            esac
            tok=T_PIPE
            return
            ;;
        ';')
            pgetc
            case "${c}" in ';')
                pgetc
                tok=T_DSEMI
                return
                ;;
            esac
            tok=T_SEMI
            return
            ;;
        '(')
            pgetc
            tok=T_LPAREN
            return
            ;;
        ')')
            pgetc
            tok=T_RPAREN
            return
            ;;
        '<'|'>')
            next_io
            return
            ;;
        *)
            next_word
            return
            ;;
        esac
        # Not reached: every arm above either returns or continues.
    done
}
+
# Scan a redirection operator starting at the current character and
# store its token in the global ${tok}.
#
# Globals read:    c
# Globals written: c (via pgetc), tok
#
# Recognised operators: '<' T_LESS, '<<' T_DLESS, '<<-' T_DLESSDASH,
# '>' T_GREAT, '>>' T_DGREAT.  Each branch assigns ${tok} exactly once,
# so a longer operator is never overwritten by the token of its prefix.
next_io()
{
    case "${c}" in
    '<')
        pgetc
        if [ "x${c}" != 'x<' ]; then
            tok=T_LESS
            return
        fi
        pgetc
        if [ "x${c}" = 'x-' ]; then
            pgetc
            tok=T_DLESSDASH
        else
            tok=T_DLESS
        fi
        ;;
    '>')
        pgetc
        if [ "x${c}" = 'x>' ]; then
            pgetc
            tok=T_DGREAT
        else
            tok=T_GREAT
        fi
        ;;
    esac
}
+
# Scan a word with scan_word() and store it in ${tok} as
# "T_WORD${US}<word>".
#
# Globals read:    RS, US
# Globals written: c, lineno, tok
#
# scan_word() prints "<line offset>${RS}<next char>${RS}<word>" and is
# run in a subshell, so its changes to ${c} and ${lineno} are lost and
# have to be re-applied here from that output.
next_word()
{
    local res=
    local lineno_offset=
    local word=

    # Command substitution strips trailing newlines; the '.' sentinel
    # keeps a word that ends in a newline intact.
    res="$(scan_word false; printf '.')"
    res="${res%.}"

    lineno_offset="${res%%"${RS}"*}"
    res="${res#*"${RS}"}"
    c="${res%%"${RS}"*}"
    res="${res#*"${RS}"}"
    word="${res%%"${RS}"*}"

    # We must advance lineno because scan_word() was run in a subshell.
    lineno=$((${lineno} + ${lineno_offset:-0}))
    tok="T_WORD${US}${word}"
}
+
# Scan one word starting at the current character and print the result as
# "<lines consumed>${RS}<next unread char>${RS}<word>".
#
# Arguments: $1 - "true" when scanning the word part of a parameter
#                 expansion (blanks and newlines belong to the word and
#                 an unquoted '}' ends it), "false" otherwise.
# Globals read:    c, LF, HT, RS
# Globals written: c, lineno
#
# Callers run this function in a subshell and re-apply the printed line
# offset and next character themselves.  Quote characters are kept in
# the word verbatim.
scan_word()
{
    local in_param="${1}"
    local res=
    local word=
    local quoted=
    local lines=
    local lineno_offset=
    local wordexp=

    word=''
    quoted=false
    lines=0
    while :; do
        echo "parsing word char '$c'" >&2
        case "${c}" in
        '')
            break
            ;;
        "${LF}")
            if ! ${in_param} && ! ${quoted}; then
                break
            fi
            lineno=$((${lineno} + 1))
            lines=$((${lines} + 1))
            word="${word}${c}"
            ;;
        ' '|"${HT}")
            if ! ${in_param} && ! ${quoted}; then
                break
            fi
            word="${word}${c}"
            ;;
        '$')
            pgetc
            res="$(scan_wordexp)"
            lineno_offset="${res%%"${RS}"*}"
            res="${res#*"${RS}"}"
            c="${res%%"${RS}"*}"
            res="${res#*"${RS}"}"
            wordexp="${res%%"${RS}"*}"
            # scan_wordexp() was run in a subshell, so its line count
            # must be applied here, and also passed on to our own
            # caller through ${lines}.
            lineno=$((${lineno} + ${lineno_offset:-0}))
            lines=$((${lines} + ${lineno_offset:-0}))
            if [ -z "${wordexp}" ]; then
                # No expansion was recognised; the '$' consumed above
                # is an ordinary character of the word.
                wordexp='$'
            fi
            word="${word}${wordexp}"
            # scan_wordexp() leaves behind an unused character, so we
            # should skip the pgetc() call below.
            continue
            ;;
        \')
            word="${word}${c}"
            while :; do
                pgetc
                case "${c}" in
                '')
                    # End of input inside the quotes: stop instead of
                    # looping forever.
                    break
                    ;;
                "${LF}")
                    lineno=$((${lineno} + 1))
                    lines=$((${lines} + 1))
                    ;;
                esac
                word="${word}${c}"
                case "${c}" in \')
                    break
                    ;;
                esac
            done
            ;;
        '"')
            word="${word}${c}"
            if ${quoted}; then
                quoted=false
            else
                quoted=true
            fi
            ;;
        '}')
            if ${in_param} && ! ${quoted}; then
                break
            fi
            word="${word}${c}"
            ;;
        *)
            word="${word}${c}"
            ;;
        esac
        pgetc
    done

    printf "%d${RS}%s${RS}%s" "${lines}" "${c}" "${word}"
}
+
# Scan a word expansion; the introducing '$' has already been consumed
# and ${c} holds the character after it.  Prints
# "<lines consumed>${RS}<next unread char>${RS}<expansion text>".
#
# Globals read:    c, RS
# Globals written: c, lineno
#
# The expansion text is the source text including the leading '$'.  It is
# empty when ${c} does not start a recognised expansion; '$(' is not
# handled yet and also yields an empty text with '(' left unread.
scan_wordexp()
{
    local wordexp=
    local lineno_offset=
    local mod=
    local res=
    local word=
    local op=
    # next_param() stores its result in ${param}; keep it local here.
    local param=

    wordexp=''
    lineno_offset=0
    case "${c}" in
    '{')
        # Parameter expansion brace
        pgetc
        case "${c}" in
        '#')
            pgetc
            case "${c}" in
            # The '$' is escaped: an unescaped "$!" in a pattern is
            # expanded as the special parameter.
            [@*#?\$!A-Za-z0-9_-])
                # String length expansion: keep the '#' prefix in
                # front of the parameter name.
                next_param
                param="#${param}"
                ;;
            *)
                # Special parameter "#"
                param='#'
                ;;
            esac
            ;;
        *)
            next_param
            ;;
        esac
        wordexp="\${${param}"
        # Check for modifications
        mod=false
        case "${c}" in
        ':')
            mod=true
            wordexp="${wordexp}${c}"
            pgetc
            case "${c}" in '-'|'='|'?'|'+')
                wordexp="${wordexp}${c}"
                pgetc
                ;;
            esac
            ;;
        '-'|'='|'?'|'+')
            mod=true
            wordexp="${wordexp}${c}"
            pgetc
            ;;
        '%'|'#')
            # Pattern removal; the operator may be doubled.
            mod=true
            op="${c}"
            wordexp="${wordexp}${c}"
            pgetc
            if [ "x${c}" = "x${op}" ]; then
                wordexp="${wordexp}${c}"
                pgetc
            fi
            ;;
        esac
        if ${mod}; then
            # Get word.  The '.' sentinel keeps trailing newlines of
            # the word, which command substitution would strip.
            res="$(scan_word true; printf '.')"
            res="${res%.}"
            lineno_offset="${res%%"${RS}"*}"
            res="${res#*"${RS}"}"
            c="${res%%"${RS}"*}"
            res="${res#*"${RS}"}"
            word="${res%%"${RS}"*}"
            lineno_offset="${lineno_offset:-0}"
            # We must advance lineno because scan_word() was run in a
            # subshell.
            lineno=$((${lineno} + ${lineno_offset}))
            wordexp="${wordexp}${word}"
            echo "param mod word: '$word'" >&2
        fi
        # Check for right brace.
        case "${c}" in
        '}')
            wordexp="${wordexp}${c}"
            pgetc
            ;;
        *)
            synerr 'Missing "}"'
            ;;
        esac
        ;;
    '(')
        # Not handled yet.
        ;;
    [@*#?\$!A-Za-z0-9_-])
        next_param
        wordexp="\$${param}"
        ;;
    esac

    printf "%d${RS}%s${RS}%s" "${lineno_offset}" "${c}" "${wordexp}"
}
+
# Scan a parameter name starting at the current character and store it
# in ${param}.  ${param} is left empty when ${c} cannot start a parameter.
#
# Globals read:    c
# Globals written: c (via pgetc), param
#
# Accepted forms: one special parameter character, a run of digits
# starting with 1-9, or a name made of letters, digits and underscores.
next_param()
{
    param=''
    case "${c}" in
    # The '$' is escaped: an unescaped "$!" in a pattern is expanded as
    # the special parameter, which removes both characters from the set.
    [@*#?\$!0-])
        # Special parameter
        param="${c}"
        pgetc
        ;;
    [1-9])
        # Positional parameter
        param="${c}"
        pgetc
        while :; do
            # Matching the continuation characters, not their negation,
            # also ends the loop on the empty ${c} at end of input.
            case "${c}" in
            [0-9])
                param="${param}${c}"
                pgetc
                ;;
            *)
                break
                ;;
            esac
        done
        ;;
    [A-Za-z_])
        # Parameter name
        param="${c}"
        pgetc
        while :; do
            case "${c}" in
            [A-Za-z0-9_])
                param="${param}${c}"
                pgetc
                ;;
            *)
                break
                ;;
            esac
        done
        ;;
    esac
}
+
# Compare the type of the current token with $1.  On a match the token is
# appended to ${tokens} (followed by ${RS}), the lexer advances to the
# next token and 0 is returned; otherwise nothing changes and 1 is
# returned.
accept()
{
    local want="${1}"
    local kind=

    # The token type is everything before the first ${US}.
    kind="${tok%%${US}*}"
    if [ "x${kind}" != "x${want}" ]; then
        return 1
    fi

    echo "accept ${want}" >&2
    tokens="${tokens}${tok}${RS}"
    next
    return 0
}
+
# Print a diagnostic of the form "<input>:<line>: <message>" on stderr.
#
# Arguments: $1   - printf format of the message
#            $2.. - arguments for that format
# Globals read: fname, lineno
#
# The input name '-' is reported as "stdin".  $1 is used as part of the
# printf format string, so callers must pass a trusted format and put
# arbitrary text into the arguments.
error()
{
    local fmt="${1}"
    shift 1
    local src=

    case "${fname}" in
    '-')
        src='stdin'
        ;;
    *)
        src="${fname}"
        ;;
    esac

    # ${lineno} is quoted and defaulted: an empty value would otherwise
    # vanish from the argument list and shift the message arguments into
    # the %d slot.
    printf "%s:%d: ${fmt}\n" "${src}" "${lineno:-0}" "${@}" >&2
}
+
# Reset the lexer state for a new input and load the first token.
#
# Arguments: $1 - name of the input, stored in ${fname}
# Globals written: fname, lineno, tokens, plus whatever pgetc() and
# next() set.
init_lexer()
{
    local source_name="${1}"
    shift

    tokens=''
    lineno=1
    fname="${source_name}"

    # Prime the one-character lookahead, then scan the first token.
    pgetc
    next
}
+
# Write the accumulated token list ${tokens} to stdout without a trailing
# newline.  Always returns 0.
get_tokens()
{
    printf '%s' "${tokens}" || true
}
+
# Report the current token as unexpected through synerr().
#
# Arguments: $1 - token that was expected; when empty, the message does
#                 not mention an expected token.
# Globals read: tok
# Token names in the message come from tokname().
synexp()
{
    local expected="${1}"
    shift 1
    local found=

    found="$(tokname "${tok}")"
    if [ -n "${expected}" ]; then
        synerr '%s unexpected (expecting %s)' "${found}" \
            "$(tokname "${expected}")"
        return
    fi
    synerr '%s unexpected' "${found}"
}
+
# Report a syntax error through error().
#
# Arguments: $1   - printf format of the message; it is prefixed with
#                   "Syntax error: " before being handed to error()
#            $2.. - arguments for that format, passed through unchanged
synerr()
{
    local msg="Syntax error: ${1}"
    shift

    error "${msg}" "${@}"
}