summary refs log tree commit diff stats
path: root/parsing
diff options
context:
space:
mode:
author P. J. McDermott <pj@pehjota.net> 2016-02-19 04:01:32 (EST)
committer P. J. McDermott <pj@pehjota.net> 2016-02-19 04:01:32 (EST)
commit f6e55a026abf33867141896b1e227e791942c2a3 (patch)
tree be82488bad31428eaf3a98583bbf56aaa0610804 /parsing
parent a9f84d9757aad50db3f1d7e11c6e779b920ed96b (diff)
download eggshell-f6e55a026abf33867141896b1e227e791942c2a3.zip
eggshell-f6e55a026abf33867141896b1e227e791942c2a3.tar.gz
eggshell-f6e55a026abf33867141896b1e227e791942c2a3.tar.bz2
[WIP] Add lexer/parser demo
Diffstat (limited to 'parsing')
-rw-r--r-- parsing/lexer.sh 457
-rw-r--r-- parsing/parse.sh 133
-rw-r--r-- parsing/tokens.sh 55
3 files changed, 645 insertions, 0 deletions
diff --git a/parsing/lexer.sh b/parsing/lexer.sh
new file mode 100644
index 0000000..3081a79
--- /dev/null
+++ b/parsing/lexer.sh
@@ -0,0 +1,457 @@
+fname=
+lineno=
+c=
+tok=
+tokens=
+
+pgetc()
+{
+ c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
+ c="${c%.}"
+}
+
+next()
+{
+ while :; do
+ echo "parsing char '$c'" >&2
+ case "${c}" in
+ '')
+ pgetc
+ tok=T_EOF
+ return
+ ;;
+ "${LF}")
+ pgetc
+ lineno=$((${lineno} + 1))
+ tok=T_NEWLINE
+ return
+ ;;
+ ' '|"${HT}")
+ pgetc
+ continue
+ ;;
+ \\)
+ pgetc
+ case "${c}" in "${LF}")
+ lineno=$((${lineno} + 1))
+ pgetc
+ continue
+ ;;
+ esac
+ next_word
+ return
+ ;;
+ '#')
+ pgetc
+ while :; do
+ case "${c}" in "${LF}"|'')
+ break
+ ;;
+ esac
+ pgetc
+ done
+ continue
+ ;;
+ '&')
+ pgetc
+ case "${c}" in '&')
+ pgetc
+ tok=T_AND_IF
+ return
+ ;;
+ esac
+ tok=T_AND
+ return
+ ;;
+ '|')
+ pgetc
+ case "${c}" in '|')
+ pgetc
+ tok=T_OR_IF
+ return
+ ;;
+ esac
+ tok=T_PIPE
+ return
+ ;;
+ ';')
+ pgetc
+ case "${c}" in ';')
+ pgetc
+ tok=T_DSEMI
+ return
+ ;;
+ esac
+ tok=T_SEMI
+ return
+ ;;
+ '(')
+ pgetc
+ tok=T_LPAREN
+ return
+ ;;
+ ')')
+ pgetc
+ tok=T_RPAREN
+ return
+ ;;
+ '<'|'>')
+ next_io
+ return
+ ;;
+ *)
+ next_word
+ return
+ ;;
+ esac
+ pgetc
+ done
+}
+
+next_io()
+{
+ case "${c}" in
+ '<')
+ pgetc
+ case "${c}" in '<')
+ pgetc
+ case "${c}" in '-')
+ pgetc
+ tok=T_DLESSDASH
+ ;;
+ esac
+ tok=T_DLESS
+ ;;
+ esac
+ tok=T_LESS
+ ;;
+ '>')
+ pgetc
+ case "${c}" in '>')
+ pgetc
+ tok=T_DGREAT
+ ;;
+ esac
+ tok=T_GREAT
+ ;;
+ esac
+}
+
+next_word()
+{
+ local res=
+ local lineno_offset=
+ local word=
+
+ res="$(scan_word false)"
+ lineno_offset=${res%%${RS}*}
+ res="${res#*${RS}}"
+ c="${res%%${RS}*}"
+ res="${res#*${RS}}"
+ word="${res%%${RS}*}"
+
+ # We must advance lineno because scan_word() was run in a subshell.
+ lineno=$((${lineno} + ${lineno_offset}))
+ tok="T_WORD${US}${word}"
+}
+
+scan_word()
+{
+ local in_param="${1}"
+ local res=
+ local word=
+ local quoted=
+ local lines=
+ local lineno_offset=
+ local wordexp=
+
+ word=''
+ quoted=false
+ lines=0
+ while :; do
+ echo "parsing word char '$c'" >&2
+ case "${c}" in
+ '')
+ break
+ ;;
+ "${LF}")
+ if ! ${in_param} && ! ${quoted}; then
+ break
+ fi
+ lineno=$((${lineno} + 1))
+ lines=$((${lines} + 1))
+ word="${word}${c}"
+ ;;
+ ' '|"${HT}")
+ if ! ${in_param} && ! ${quoted}; then
+ break
+ fi
+ word="${word}${c}"
+ ;;
+ '$')
+ pgetc
+ res=$(scan_wordexp)
+ lineno_offset=${res%%${RS}*}
+ res="${res#*${RS}}"
+ c="${res%%${RS}*}"
+ res="${res#*${RS}}"
+ wordexp="${res%%${RS}*}"
+ # We must advance lineno because scan_wordexp()
+ # was run in a subshell.
+ lineno=$((${lineno} + ${lineno_offset}))
+ word="${word}${wordexp}"
+ # scan_wordexp() leaves behind an unused
+ # character, so we should skip the pgetc() call
+ # below.
+ continue
+ ;;
+ \')
+ word="${word}${c}"
+ while :; do
+ pgetc
+ word="${word}${c}"
+ case "${c}" in \')
+ break
+ ;;
+ esac
+ done
+ ;;
+ '"')
+ word="${word}${c}"
+ if ${quoted}; then
+ quoted=false
+ else
+ quoted=true
+ fi
+ ;;
+ '}')
+ if ${in_param} && ! ${quoted}; then
+ break
+ fi
+ word="${word}${c}"
+ ;;
+ *)
+ word="${word}${c}"
+ ;;
+ esac
+ pgetc
+ done
+
+ printf "%d${RS}%s${RS}%s" ${lines} "${c}" "${word}"
+}
+
+scan_wordexp()
+{
+ local wordexp=
+ local lineno_offset=
+ local mod=
+ local res=
+ local word=
+
+ wordexp=''
+ lineno_offset=0
+ case "${c}" in
+ '{')
+ # Parameter expansion brace
+ pgetc
+ case "${c}" in
+ '#')
+ pgetc
+ case "${c}" in
+ [@*#?$!A-Za-z0-9_-])
+ # String length
+ # expansion
+ next_param
+ ;;
+ *)
+ # Special parameter "#"
+ param='#'
+ ;;
+ esac
+ ;;
+ *)
+ next_param
+ ;;
+ esac
+ wordexp="\${${param}"
+ # Check for modifications
+ mod=false
+ case "${c}" in
+ ':')
+ mod=true
+ wordexp="${wordexp}${c}"
+ pgetc
+ case "${c}" in '-'|'='|'?'|'+')
+ wordexp="${wordexp}${c}"
+ pgetc
+ ;;
+ esac
+ ;;
+ '-'|'='|'?'|'+')
+ mod=true
+ wordexp="${wordexp}${c}"
+ pgetc
+ ;;
+ '%')
+ mod=true
+ wordexp="${wordexp}${c}"
+ pgetc
+ case "${c}" in '%')
+ wordexp="${wordexp}${c}"
+ pgetc
+ ;;
+ esac
+ ;;
+ '#')
+ mod=true
+ wordexp="${wordexp}${c}"
+ pgetc
+ case "${c}" in '#')
+ wordexp="${wordexp}${c}"
+ pgetc
+ ;;
+ esac
+ ;;
+ esac
+ if ${mod}; then
+ # Get word.
+ res="$(scan_word true)"
+ lineno_offset=${res%%${RS}*}
+ res="${res#*${RS}}"
+ c="${res%%${RS}*}"
+ res="${res#*${RS}}"
+ word="${res%%${RS}*}"
+ # We must advance lineno because scan_word() was
+ # run in a subshell.
+ lineno=$((${lineno} + ${lineno_offset}))
+ wordexp="${wordexp}${word}"
+ echo "param mod word: '$word'" >&2
+ fi
+ # Check for right brace.
+ case "${c}" in
+ '}')
+ wordexp="${wordexp}${c}"
+ pgetc
+ ;;
+ *)
+ synerr 'Missing "}"'
+ ;;
+ esac
+ ;;
+ '(')
+ ;;
+ [@*#?$!A-Za-z0-9_-])
+ next_param
+ wordexp="\$${param}"
+ ;;
+ esac
+
+ printf "%d${RS}%s${RS}%s" ${lineno_offset} "${c}" "${wordexp}"
+
+}
+
+# Scan a parameter name starting at ${c} and store it in ${param}.
+# ${param} is left empty, and ${c} untouched, when ${c} cannot start a
+# parameter.  On return ${c} holds the first character after the name.
+next_param()
+{
+	param=''
+	case "${c}" in
+		[@*#?\$!0-])
+			# Special parameter.  The "$" is escaped because an
+			# unescaped "$!" inside the bracket expression would be
+			# expanded by the shell instead of being matched.
+			param="${c}"
+			pgetc
+			;;
+		[1-9])
+			# Positional parameter
+			param="${param}${c}"
+			pgetc
+			while :; do
+				# Matching the wanted characters (rather than
+				# their negation) also ends the loop when ${c}
+				# is empty at end of input.
+				case "${c}" in
+					[0-9])
+						param="${param}${c}"
+						pgetc
+						;;
+					*)
+						break
+						;;
+				esac
+			done
+			;;
+		[A-Za-z_])
+			# Parameter name
+			param="${param}${c}"
+			pgetc
+			while :; do
+				case "${c}" in
+					[A-Za-z0-9_])
+						param="${param}${c}"
+						pgetc
+						;;
+					*)
+						break
+						;;
+				esac
+			done
+			;;
+	esac
+}
+
+# Check the current token. If it matches, add it to the syntax array.
+accept()
+{
+ local t="${1}"
+
+ if [ "x${tok%%${US}*}" = "x${t}" ]; then
+ echo "accept $t" >&2
+ tokens="${tokens}${tok}${RS}"
+ next
+ return 0
+ fi
+ return 1
+}
+
+# Print a diagnostic to standard error, prefixed with the input name and the
+# current line number.  An input name of "-" is printed as "stdin".
+# Arguments: $1 - printf format for the message; remaining arguments are
+#                 passed to printf as its operands
+error()
+{
+	local fmt="${1}"
+	shift 1
+
+	if [ "x${fname}" = 'x-' ]; then
+		printf "stdin:%d: ${fmt}\n" ${lineno} "${@}" >&2
+	else
+		printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}" >&2
+	fi
+}
+
+# Reset the lexer state and scan the first token.
+# Arguments: $1 - input name used in diagnostics
+init_lexer()
+{
+	local name="${1}"
+	shift 1
+
+	tokens=''
+	lineno=1
+	fname="${name}"
+
+	# Prime the one-character lookahead, then the one-token lookahead.
+	pgetc
+	next
+}
+
+get_tokens()
+{
+ printf '%s' "${tokens}"
+ return 0
+}
+
+# Report the current token as unexpected.
+# Arguments: $1 - token that was expected instead; when empty the
+#                 "(expecting ...)" part of the message is omitted
+synexp()
+{
+	local t="${1}"
+	shift 1
+
+	case "${t}" in
+		'')
+			synerr '%s unexpected' "$(tokname "${tok}")"
+			;;
+		*)
+			synerr '%s unexpected (expecting %s)' \
+				"$(tokname "${tok}")" "$(tokname "${t}")"
+			;;
+	esac
+}
+
+synerr()
+{
+ local fmt="${1}"
+ shift 1
+
+ error "Syntax error: ${fmt}" "${@}"
+}
diff --git a/parsing/parse.sh b/parsing/parse.sh
new file mode 100644
index 0000000..2ac4215
--- /dev/null
+++ b/parsing/parse.sh
@@ -0,0 +1,133 @@
+HT="$(printf '\t.')"; HT="${HT%.}"
+LF="$(printf '\n.')"; LF="${LF%.}"
+RS="$(printf '\036.')"; RS="${RS%.}"
+US="$(printf '\037.')"; US="${US%.}"
+
+. ./tokens.sh
+. ./lexer.sh
+
+complete_command()
+{
+ if list; then
+ separator
+ return 0
+ fi
+ return 1
+}
+
+list()
+{
+ if and_or; then
+ while separator_op; do
+ if ! and_or; then
+ return 1
+ fi
+ done
+ return 0
+ fi
+ return 1
+}
+and_or()
+{
+ if pipeline; then
+ while accept T_AND_IF || accept T_OR_IF; do
+ if ! linebreak || ! pipeline; then
+ return 1
+ fi
+ done
+ return 0
+ fi
+ return 1
+}
+# pipeline: optional "!" followed by pipe_sequence.
+pipeline()
+{
+	# The "!" is optional, so the result of this accept is ignored.
+	accept T_BANG
+	pipe_sequence || return 1
+	return 0
+}
+
+pipe_sequence()
+{
+ if command; then
+ while accept T_PIPE; do
+ if ! linebreak || ! command; then
+ return 1
+ fi
+ done
+ return 0
+ fi
+ return 1
+}
+
+# command: currently a single word.  Returns the status of accept.
+command()
+{
+	# XXX: Unfinished
+	accept T_WORD
+	return "${?}"
+}
+
+
+
+# newline_list: one or more newline tokens.  Returns 1 when the current
+# token is not a newline.
+newline_list()
+{
+	accept T_NEWLINE || return 1
+	# Swallow any further newlines.
+	while accept T_NEWLINE; do
+		:
+	done
+	return 0
+}
+# linebreak: optional newline_list.  Always returns 0.
+linebreak()
+{
+	newline_list || :
+}
+
+# separator_op: "&" or ";".  Returns 0 when one of them was accepted.
+separator_op()
+{
+	accept T_AND && return 0
+	accept T_SEMI && return 0
+	return 1
+}
+
+separator()
+{
+ if separator_op && linebreak; then
+ return 0
+ elif newline_list; then
+ return 0
+ fi
+ return 1
+}
+
+parse()
+{
+ local fn="${1}"
+ shift 1
+
+ init_lexer "${fn}"
+ while complete_command; do :; done
+ if :; then # TODO: Test for EOF or errors
+ get_tokens
+ return 0
+ fi
+ return 1
+}
+
+# Demo: parse two sample lines and print every token, followed by the text
+# of each word token.
+if tokens="$(printf '%s\n' '"foo bar" && $baz || qux' '${quux%uux } quuux' | \
+		parse -)"; then
+	# Split the token list on ${RS} only.  Pathname expansion is switched
+	# off for the unquoted expansion so that a word such as "*" cannot be
+	# replaced by file names.
+	IFS="${RS}"
+	set -f
+	for t in ${tokens}; do
+		printf 'Token: %s\n' "$(tokname "${t}")"
+		# The token type ends at the first ${US}; the word text may
+		# itself contain further ${US} characters.
+		case "${t%%${US}*}" in
+			T_WORD)
+				printf ' "%s"\n' "${t#T_WORD${US}}"
+				;;
+		esac
+	done
+	set +f
+	unset IFS
+else
+	echo FAIL
+fi
diff --git a/parsing/tokens.sh b/parsing/tokens.sh
new file mode 100644
index 0000000..37d741b
--- /dev/null
+++ b/parsing/tokens.sh
@@ -0,0 +1,55 @@
+# Print the human-readable name of a token, without a trailing newline.
+# Arguments: $1 - token, either a bare type such as T_SEMI or a type
+#                 followed by ${US} and the token text
+# Outputs:   the name, or "unknown token" for an unrecognised type
+tokname()
+{
+	local t="${1}"
+	shift 1
+	local n=
+
+	# The type is everything before the FIRST ${US}; the longest-suffix
+	# form (%%) is needed because the text may itself contain ${US}.
+	case "${t%%${US}*}" in
+		# Operators
+		T_EOF) n='end of file';;
+		T_NEWLINE) n='newline';;
+		T_AND) n='&';;
+		T_SEMI) n=';';;
+		T_AND_IF) n='&&';;
+		T_OR_IF) n='||';;
+		T_DSEMI) n=';;';;
+		T_LESS) n='<';;
+		T_GREAT) n='>';;
+		T_DLESS) n='<<';;
+		T_DGREAT) n='>>';;
+		T_LESSAND) n='<&';;
+		T_GREATAND) n='>&';;
+		T_LESSGREAT) n='<>';;
+		T_DLESSDASH) n='<<-';;
+		T_CLOBBER) n='>|';;
+		T_PIPE) n='|';;
+		T_LPAREN) n='(';;
+		T_RPAREN) n=')';;
+		# Reserved words
+		T_IF) n='if';;
+		T_THEN) n='then';;
+		T_ELSE) n='else';;
+		T_ELIF) n='elif';;
+		T_FI) n='fi';;
+		T_DO) n='do';;
+		T_DONE) n='done';;
+		T_CASE) n='case';;
+		T_ESAC) n='esac';;
+		T_WHILE) n='while';;
+		T_UNTIL) n='until';;
+		T_FOR) n='for';;
+		T_LBRACE) n='{';;
+		T_RBRACE) n='}';;
+		T_BANG) n='!';;
+		T_IN) n='in';;
+		# Special symbols
+		T_NAME) n='name';;
+		T_IO_NUMBER) n='I/O number';;
+		T_WORD) n='word';;
+		T_ASSIGNMENT_WORD) n='assignment word';;
+		# Unknown
+		*) n='unknown token';;
+	esac
+
+	printf '%s' "${n}"
+}