# parsing/lexer.sh -- lexer half of the "[WIP] Add lexer/parser demo" patch
# (commit f6e55a026abf33867141896b1e227e791942c2a3, P. J. McDermott).
#
# The lexer reads shell source from standard input one byte at a time and
# produces tokens.  The current token is kept in ${tok}; accepted tokens are
# appended to ${tokens}.
#
# Names used here but not defined in this file (the including script must
# provide them):
#   LF       - compared against the current character to detect a newline
#   HT       - skipped together with ' ' as blank (presumably a tab)
#   RS       - separator between fields of scan results and between tokens
#   US       - separator between a token name and its text ("T_WORD<US>text")
#   tokname  - function called by synexp() to render a token for messages
#
# Known limitations visible in this file: backslash escapes inside a word
# are not interpreted by scan_word(), and "$(" command substitution is not
# scanned as a unit.

fname=      # Input name used in diagnostics ("-" is reported as "stdin").
lineno=     # Current input line number.
c=          # Current look-ahead character ('' at end of input).
tok=        # Current token.
tokens=     # Accepted tokens, each followed by ${RS}.

# Read the next byte of standard input into ${c}.  The '.' sentinel keeps
# command substitution from stripping a newline; ${c} is '' at end of input.
pgetc()
{
  c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
  c="${c%.}"
}

# Scan the next token into ${tok}, skipping blanks, comments and
# backslash-newline continuations.  Advances ${lineno} on newlines.
next()
{
  while :; do
    printf "parsing char '%s'\n" "${c}" >&2
    case "${c}" in
      '')
        pgetc
        tok=T_EOF
        return
        ;;
      "${LF}")
        pgetc
        lineno=$((lineno + 1))
        tok=T_NEWLINE
        return
        ;;
      ' '|"${HT}")
        pgetc
        continue
        ;;
      \\)
        pgetc
        case "${c}" in
          "${LF}")
            # Line continuation: drop it and keep scanning.
            lineno=$((lineno + 1))
            pgetc
            continue
            ;;
        esac
        next_word
        return
        ;;
      '#')
        # Comment: discard up to (not including) the newline or EOF.
        pgetc
        while :; do
          case "${c}" in
            "${LF}"|'')
              break
              ;;
          esac
          pgetc
        done
        continue
        ;;
      '&')
        pgetc
        case "${c}" in
          '&')
            pgetc
            tok=T_AND_IF
            return
            ;;
        esac
        tok=T_AND
        return
        ;;
      '|')
        pgetc
        case "${c}" in
          '|')
            pgetc
            tok=T_OR_IF
            return
            ;;
        esac
        tok=T_PIPE
        return
        ;;
      ';')
        pgetc
        case "${c}" in
          ';')
            pgetc
            tok=T_DSEMI
            return
            ;;
        esac
        tok=T_SEMI
        return
        ;;
      '(')
        pgetc
        tok=T_LPAREN
        return
        ;;
      ')')
        pgetc
        tok=T_RPAREN
        return
        ;;
      '<'|'>')
        next_io
        return
        ;;
      *)
        next_word
        return
        ;;
    esac
  done
}

# Scan a redirection operator starting at ${c} ('<' or '>') into ${tok}.
# Each match returns immediately so that a longer operator is not
# overwritten by the shorter fallback.
next_io()
{
  case "${c}" in
    '<')
      pgetc
      case "${c}" in
        '<')
          pgetc
          case "${c}" in
            '-')
              pgetc
              tok=T_DLESSDASH
              return
              ;;
          esac
          tok=T_DLESS
          return
          ;;
      esac
      tok=T_LESS
      ;;
    '>')
      pgetc
      case "${c}" in
        '>')
          pgetc
          tok=T_DGREAT
          return
          ;;
      esac
      tok=T_GREAT
      ;;
  esac
}

# Scan a word and store it in ${tok} as "T_WORD<US>text".
next_word()
{
  local res=
  local lineno_offset=
  local word=

  # scan_word() prints "<lines><RS><next char><RS><word>".
  res="$(scan_word false)"
  lineno_offset="${res%%"${RS}"*}"
  res="${res#*"${RS}"}"
  c="${res%%"${RS}"*}"
  res="${res#*"${RS}"}"
  word="${res%%"${RS}"*}"

  # We must advance lineno because scan_word() was run in a subshell.
  lineno=$((lineno + lineno_offset))
  tok="T_WORD${US}${word}"
}

# Scan word text starting at ${c} and print
# "<newlines consumed><RS><first unconsumed char><RS><word text>".
# Arguments: $1 - "true" when scanning the word of a ${param<op>word}
#                 expansion (blanks and newlines are kept and an unquoted
#                 '}' ends the word), "false" for an ordinary word.
# Intended to be called in a command substitution; state changes made here
# are reported through the printed fields.
scan_word()
{
  local in_param="${1}"
  local res=
  local word=
  local quoted=false
  local lines=0
  local lineno_offset=
  local wordexp=

  while :; do
    printf "parsing word char '%s'\n" "${c}" >&2
    case "${c}" in
      '')
        break
        ;;
      "${LF}")
        if ! ${in_param} && ! ${quoted}; then
          break
        fi
        lineno=$((lineno + 1))
        lines=$((lines + 1))
        word="${word}${c}"
        ;;
      ' '|"${HT}")
        if ! ${in_param} && ! ${quoted}; then
          break
        fi
        word="${word}${c}"
        ;;
      ';'|'&'|'|'|'('|')'|'<'|'>')
        # An unquoted operator character delimits an ordinary word.
        if ! ${in_param} && ! ${quoted}; then
          break
        fi
        word="${word}${c}"
        ;;
      '$')
        pgetc
        res="$(scan_wordexp)"
        lineno_offset="${res%%"${RS}"*}"
        res="${res#*"${RS}"}"
        c="${res%%"${RS}"*}"
        res="${res#*"${RS}"}"
        wordexp="${res%%"${RS}"*}"
        # scan_wordexp() was run in a subshell, so fold the newlines it
        # consumed into both our line number and the count we report.
        lineno=$((lineno + lineno_offset))
        lines=$((lines + lineno_offset))
        word="${word}${wordexp}"
        # scan_wordexp() leaves behind an unused character, so we
        # should skip the pgetc() call below.
        continue
        ;;
      \')
        word="${word}${c}"
        # Inside double quotes a single quote is an ordinary character.
        if ! ${quoted}; then
          while :; do
            pgetc
            word="${word}${c}"
            case "${c}" in
              \')
                break
                ;;
              "${LF}")
                lineno=$((lineno + 1))
                lines=$((lines + 1))
                ;;
              '')
                # End of input: stop instead of spinning forever.
                synerr 'Unterminated quoted string'
                break
                ;;
            esac
          done
        fi
        ;;
      '"')
        word="${word}${c}"
        if ${quoted}; then
          quoted=false
        else
          quoted=true
        fi
        ;;
      '}')
        if ${in_param} && ! ${quoted}; then
          break
        fi
        word="${word}${c}"
        ;;
      *)
        word="${word}${c}"
        ;;
    esac
    pgetc
  done

  printf '%d%s%s%s%s' "${lines}" "${RS}" "${c}" "${RS}" "${word}"
}

# Scan what follows a '$' (already consumed; ${c} is the next character)
# and print "<newlines consumed><RS><first unconsumed char><RS><text>",
# where <text> is the expansion's source text including the leading '$'.
# Intended to be called in a command substitution.
scan_wordexp()
{
  local wordexp=
  local lineno_offset=0
  local mod=false
  local res=
  local word=
  local param=
  local len_prefix=
  local op=

  case "${c}" in
    '{')
      # Parameter expansion brace
      pgetc
      case "${c}" in
        '#')
          pgetc
          case "${c}" in
            '$'|'!'|[@*#?A-Za-z0-9_-])
              # String length expansion: remember the '#' so that it
              # is reproduced in the word text.
              len_prefix='#'
              next_param
              ;;
            *)
              # Special parameter "#"
              param='#'
              ;;
          esac
          ;;
        *)
          next_param
          ;;
      esac
      wordexp="\${${len_prefix}${param}"

      # Check for modifications
      case "${c}" in
        ':')
          mod=true
          wordexp="${wordexp}${c}"
          pgetc
          case "${c}" in
            '-'|'='|'?'|'+')
              wordexp="${wordexp}${c}"
              pgetc
              ;;
          esac
          ;;
        '-'|'='|'?'|'+')
          mod=true
          wordexp="${wordexp}${c}"
          pgetc
          ;;
        '%'|'#')
          # '%' / '#' may be doubled ('%%' / '##').
          mod=true
          op="${c}"
          wordexp="${wordexp}${c}"
          pgetc
          if [ "x${c}" = "x${op}" ]; then
            wordexp="${wordexp}${c}"
            pgetc
          fi
          ;;
      esac

      if ${mod}; then
        # Get word.
        res="$(scan_word true)"
        lineno_offset="${res%%"${RS}"*}"
        res="${res#*"${RS}"}"
        c="${res%%"${RS}"*}"
        res="${res#*"${RS}"}"
        word="${res%%"${RS}"*}"
        # We must advance lineno because scan_word() was run in a
        # subshell.
        lineno=$((lineno + lineno_offset))
        wordexp="${wordexp}${word}"
        printf "param mod word: '%s'\n" "${word}" >&2
      fi

      # Check for right brace.
      case "${c}" in
        '}')
          wordexp="${wordexp}${c}"
          pgetc
          ;;
        *)
          synerr 'Missing "}"'
          ;;
      esac
      ;;
    '$'|'!'|[@*#?A-Za-z0-9_-])
      next_param
      wordexp="\$${param}"
      ;;
    *)
      # Not a parameter (this includes "(": command substitution is not
      # scanned here yet).  Keep the '$' as literal text instead of
      # dropping it and leave ${c} for the caller.
      wordexp='$'
      ;;
  esac

  printf '%d%s%s%s%s' "${lineno_offset}" "${RS}" "${c}" "${RS}" "${wordexp}"
}

# Read a parameter name starting at ${c} into ${param} (a variable of the
# caller) and leave ${c} at the first character after it.  ${param} is ''
# when ${c} does not start a parameter.
next_param()
{
  param=''
  case "${c}" in
    '$'|'!'|[@*#?0-])
      # Special parameter.  '$' and '!' are listed outside the bracket
      # expression because "$!" inside it would be expanded.
      param="${c}"
      pgetc
      ;;
    [1-9])
      # Positional parameter
      param="${c}"
      pgetc
      while :; do
        case "${c}" in
          [0-9])
            ;;
          *)
            # Also taken at end of input, where ${c} is ''.
            break
            ;;
        esac
        param="${param}${c}"
        pgetc
      done
      ;;
    [A-Za-z_])
      # Parameter name
      param="${c}"
      pgetc
      while :; do
        case "${c}" in
          [A-Za-z0-9_])
            ;;
          *)
            # Also taken at end of input, where ${c} is ''.
            break
            ;;
        esac
        param="${param}${c}"
        pgetc
      done
      ;;
  esac
}

# Check the current token.  If its name matches $1, append it to ${tokens},
# advance to the next token and return 0; otherwise return 1.
accept()
{
  local t="${1}"

  if [ "x${tok%%"${US}"*}" = "x${t}" ]; then
    printf 'accept %s\n' "${t}" >&2
    tokens="${tokens}${tok}${RS}"
    next
    return 0
  fi
  return 1
}

# Print a diagnostic prefixed with the input name and line number on stderr.
# Arguments: $1 - printf format string, $2... - its arguments.
error()
{
  local fmt="${1}"
  local name="${fname}"
  shift 1

  case "${name}" in
    '-')
      name='stdin'
      ;;
  esac
  # ${fmt} is a printf format by design; callers pass '%s' directives.
  # shellcheck disable=SC2059
  printf "%s:%d: ${fmt}\n" "${name}" "${lineno}" "${@}" >&2
}

# Reset the lexer state and scan the first token.
# Arguments: $1 - input name for diagnostics ("-" for standard input).
init_lexer()
{
  local fn="${1}"
  shift 1

  fname="${fn}"
  lineno=1
  tokens=''
  pgetc
  next
}

# Print the accepted tokens (each followed by ${RS}) on stdout.
get_tokens()
{
  printf '%s' "${tokens}"
  return 0
}

# Report that the current token is unexpected.
# Arguments: $1 - the expected token, or '' when there is no single
#                 expected token.
synexp()
{
  local t="${1}"
  shift 1

  if [ "x${t}" = 'x' ]; then
    synerr '%s unexpected' "$(tokname "${tok}")"
  else
    synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
      "$(tokname "${t}")"
  fi
}

# Report a syntax error.
# Arguments: $1 - printf format string, $2... - its arguments.
synerr()
{
  local fmt="${1}"
  shift 1

  error "Syntax error: ${fmt}" "${@}"
}