1 files changed, 990 insertions, 0 deletions
diff --git a/eshtrans/frontend/lexer.esh b/eshtrans/frontend/lexer.esh
new file mode 100644
index 0000000..0991239
--- /dev/null
+++ b/eshtrans/frontend/lexer.esh
@@ -0,0 +1,990 @@
+# Eggshell lexer
+#
+# Copyright (C) 2016  Patrick "P. J." McDermott
+#
+# This file is part of the Eggshell Compiler.
+#
+# The Eggshell Compiler is free software: you can redistribute it
+# and/or modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# The Eggshell Compiler is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with the Eggshell Compiler.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+dbg=false  # when true, dbg() prints its arguments to stderr
+
+fname=  # input name used in error messages ("-" is reported as "stdin")
+lineno=  # current line number (run_lexer() starts it at 1)
+ln_off=  # line offset parsed from the output of a scanner run in a subshell
+start=  # name of the function run_lexer()/run_sublexer() invoke as ${start}
+c=  # current lookahead character ('' when lgetc() read nothing)
+wordexp=  # text of the word expansion built by scan_wordexp()
+here_queue=  # pending here-documents: RS-terminated, US-separated records
+here_awaiting_end=  # true after "<<"/"<<-" until its delimiter word is read
+here_awaiting_word=  # true when next() must read a here-document body
+tok=  # current token: type, optionally followed by US and its text
+tokens=  # accepted tokens, each followed by RS
+
+dbg()  # Debug trace: print each argument to stderr when ${dbg} is true.
+{
+	if ${dbg}; then
+		printf 'DEBUG: %s\n' "${@}" >&2  # one "DEBUG:" line per argument
+	fi
+}
+
+#
+# Error handling (used by scanning and interface functions)
+#
+
+error()  # Print "<input>:<lineno>: <message>" to stderr, then exit 1.
+{
+	local fmt="${1}"  # printf format; the remaining arguments fill it in
+	shift 1
+
+	case "${fname}" in
+		'-')  # an fname of "-" is reported as "stdin"
+			printf "stdin:%d: ${fmt}\n" ${lineno} "${@}" >&2
+			;;
+		*)
+			printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}" >&2
+			;;
+	esac
+
+	# The parser and lexer run in a subshell, so this just returns up to the
+	# caller like an exception.
+	exit 1
+}
+
+synexp()  # Report the current token as unexpected via synerr().
+{
+	local t="${1}"  # expected token type, or '' to name no expectation
+	shift 1
+
+	if [ "x${t}" = 'x' ]; then
+		synerr '%s unexpected' "$(tokname "${tok}")"
+	else
+		synerr '%s unexpected (expecting %s)' "$(tokname "${tok}")" \
+			"$(tokname "${t}")"
+	fi
+}
+
+synerr()  # Call error() with "Syntax error: " prepended to the format.
+{
+	local fmt="${1}"  # printf format; the remaining arguments fill it in
+	shift 1
+
+	error "Syntax error: ${fmt}" "${@}"
+}
+
+#
+# Input reading
+#
+
+lgetc()  # Read one byte from stdin into ${c}; ${c} is '' if nothing is read.
+{
+	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"  # '.' shields a newline
+	c="${c%.}"  # remove the trailing '.' again
+}
+
+#
+# Token recognition
+#
+
+next()  # Recognize the next token starting at ${c} and store it in ${tok}.
+{
+	if ${here_awaiting_word}; then  # a here-document body is due now
+		next_here
+		return
+	fi
+	while :; do
+		dbg "parsing char '$c' at lineno $lineno"
+		case "${c}" in
+			'')  # lgetc() read nothing: end of input
+				lgetc
+				tok=T_EOF
+				return
+				;;
+			"${LF}")
+				if ${here_awaiting_end}; then  # "<<" had no word
+					synexp ''
+				else
+					case "${here_queue}" in *"${RS}"*)  # queued
+						here_awaiting_end=false
+						here_awaiting_word=true
+						;;
+					esac
+				fi
+				lgetc
+				lineno=$((${lineno} + 1))
+				tok=T_NEWLINE
+				return
+				;;
+			' '|"${HT}")  # blanks between tokens are skipped
+				lgetc
+				continue
+				;;
+			\\)  # backslash-newline is skipped; otherwise a word starts
+				lgetc
+				case "${c}" in "${LF}")
+					lineno=$((${lineno} + 1))
+					lgetc
+					continue
+					;;
+				esac
+				next_word \\
+				return
+				;;
+			'#')  # comment: skip up to (not including) newline or EOF
+				lgetc
+				while :; do
+					case "${c}" in "${LF}"|'')
+						break
+						;;
+					esac
+					lgetc
+				done
+				continue
+				;;
+			'&')  # "&&" or "&"
+				lgetc
+				case "${c}" in '&')
+					lgetc
+					tok=T_AND_IF
+					return
+					;;
+				esac
+				tok=T_AND
+				return
+				;;
+			'|')  # "||" or "|"
+				lgetc
+				case "${c}" in '|')
+					lgetc
+					tok=T_OR_IF
+					return
+					;;
+				esac
+				tok=T_PIPE
+				return
+				;;
+			';')  # ";;" or ";"
+				lgetc
+				case "${c}" in ';')
+					lgetc
+					tok=T_DSEMI
+					return
+					;;
+				esac
+				dbg T_SEMI
+				tok=T_SEMI
+				return
+				;;
+			'(')
+				lgetc
+				tok=T_LPAREN
+				return
+				;;
+			')')
+				lgetc
+				tok=T_RPAREN
+				return
+				;;
+			'<'|'>')  # redirection operators are handled by next_io()
+				next_io
+				return
+				;;
+			*)  # anything else starts a word
+				next_word ''
+				return
+				;;
+		esac
+		lgetc  # not reached: every branch above returns or continues
+	done
+}
+
+next_here()  # Read the first queued here-document body into ${tok}.
+{
+	local here=
+	local here_strip_tabs=
+	local here_end=
+	local here_escaped=
+	local line=
+	local word=
+	local res=
+	local wordexp=
+
+	# Dequeue the here-document.
+	here="${here_queue%%${RS}*}"
+	here_strip_tabs="${here%%${US}*}"
+	here_end="${here%${US}*}"
+	here_end="$(printf '%s' "${here_end#*${US}}" | \
+		sed 's/\\//g; s/"//g; s/'\''//g;')"  # Stupid Vim: ')"
+	here_escaped="${here##*${US}}"
+	here_queue="${here_queue#*${RS}}"
+	here_awaiting_word=false
+
+	while :; do
+		case "${c}" in
+			'')
+				# Bash throws a warning when EOF occurs in a
+				# here document.  mksh throws an error.  dash,
+				# BusyBox ash, ksh93, and zsh accept EOF as a
+				# delimiter.  We aim for the lowest common
+				# denominator, so throw an error like mksh does.
+				synerr 'Here-document "%s" unclosed' \
+					"${here_end}"
+				;;
+			"${LF}")
+				word="${word}${line}"
+				case "${line}" in "${here_end}")
+					tok="T_WORD${US}${word}"
+					return
+					;;
+				esac
+				# next() counts only the delimiter's newline.
+				lineno=$((${lineno} + 1))
+				word="${word}${c}"
+				line=''
+				;;
+			"${HT}")
+				if ${here_strip_tabs}; then  # "<<-" form
+					case "${line}" in
+						'')  # leading tab: drop it
+							;;
+						*)
+							line="${line}${c}"
+							;;
+					esac
+				else
+					line="${line}${c}"
+				fi
+				;;
+			'$')
+				if ! ${here_escaped}; then  # unquoted delimiter
+					lgetc
+					if ! res="$(scan_wordexp)"; then
+						exit 1
+					fi
+					ln_off=${res%%${RS}*}
+					res="${res#*${RS}}"
+					c="${res%%${RS}*}"
+					res="${res#*${RS}}"
+					wordexp="${res%%${RS}*}"
+					lineno=$((${lineno} + ${ln_off}))
+					line="${line}${wordexp}"
+					continue  # ${c} already holds the next char
+				else
+					line="${line}${c}"
+				fi
+				;;
+			*)
+				line="${line}${c}"
+				;;
+		esac
+		lgetc
+	done
+}
+
+next_io()
+{
+	case "${c}" in
+		'<')
+			lgetc
+			case "${c}" in
+				'<')
+					lgetc
+					case "${c}" in '-')
+						lgetc
+						tok=T_DLESSDASH
+						here_queue="${here_queue}true"
+						here_awaiting_end=true
+						here_awaiting_word=false
+						break
+						;;
+					esac
+					tok=T_DLESS
+					here_queue="${here_queue}false"
+					here_awaiting_end=true
+					here_awaiting_word=false
+					break
+					;;
+				'&')
+					lgetc
+					tok=T_LESSAND
+					break
+					;;
+				'>')
+					lgetc
+					tok=T_LESSGREAT
+					break
+					;;
+			esac
+			tok=T_LESS
+			break
+			;;
+		'>')
+			lgetc
+			case "${c}" in
+				'>')
+					lgetc
+					tok=T_DGREAT
+					break
+					;;
+				'&')
+					lgetc
+					tok=T_GREATAND
+					break
+					;;
+				'|')
+					lgetc
+					tok=T_CLOBBER
+					break
+					;;
+			esac
+			tok=T_GREAT
+			break
+			;;
+	esac
+}
+
+next_word()  # Scan a word with scan_word() and store it in ${tok} as a T_WORD.
+{
+	local prev_c="${1}"  # text to prepend to the scanned word (may be '')
+	shift 1
+	local res=
+	local word=
+
+	if ! res="$(scan_word false)"; then
+		exit 1
+	fi
+	ln_off=${res%%${RS}*}  # field 1: number of lines consumed
+	res="${res#*${RS}}"
+	c="${res%%${RS}*}"  # field 2: lookahead character after the word
+	res="${res#*${RS}}"
+	word="${prev_c}${res%%${RS}*}"  # field 3: the word text
+
+	# We must advance lineno because scan_word() was run in a subshell.
+	lineno=$((${lineno} + ${ln_off}))
+	tok="T_WORD${US}${word}"
+
+	if ${here_awaiting_end}; then  # this word is a here-document delimiter
+		here_queue="${here_queue}${US}${word}"
+		case "${word}" in
+			*\\*|*'"'*|*"'"*)  # any quoting in the delimiter
+				here_queue="${here_queue}${US}true"
+				;;
+			*)
+				here_queue="${here_queue}${US}false"
+				;;
+		esac
+		here_queue="${here_queue}${RS}"  # RS completes the record
+		here_awaiting_end=false
+	fi
+}
+
+#
+# Token scanning
+#
+
+scan_word()  # Scan a word from ${c}; print "<lines>RS<next char>RS<word>".
+{
+	local in_param="${1}"  # true: word of a ${...} modifier, ends at "}"
+	shift 1
+	local res=
+	local word=
+	local quoted=
+	local lines=
+	local wordexp=
+
+	word=''
+	quoted=false
+	lines=0
+	while :; do
+		dbg "parsing word char '$c' at lineno $lineno"
+		case "${c}" in
+			'')
+				break
+				;;
+			"${LF}")
+				if ! ${in_param} && ! ${quoted}; then
+					break
+				fi
+				lineno=$((${lineno} + 1))
+				lines=$((${lines} + 1))
+				word="${word}${c}"
+				;;
+			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>')
+				if ! ${in_param} && ! ${quoted}; then
+					break
+				fi
+				word="${word}${c}"
+				;;
+			'$')
+				case "${here_queue}" in *"${RS}"*)
+					if ${here_awaiting_end}; then
+						synerr '%s %s %s %s' \
+							'Word expansions' \
+							'not supported in' \
+							'here-document' \
+							'delimiters'
+					fi
+				esac
+				lgetc
+				if ! res=$(scan_wordexp); then
+					exit 1
+				fi
+				ln_off=${res%%${RS}*}
+				res="${res#*${RS}}"
+				c="${res%%${RS}*}"
+				res="${res#*${RS}}"
+				wordexp="${res%%${RS}*}"
+				# We must advance lineno because scan_wordexp()
+				# was run in a subshell.
+				lineno=$((${lineno} + ${ln_off}))
+				word="${word}${wordexp}"
+				# scan_wordexp() leaves behind an unused
+				# character, so we should skip the lgetc() call
+				# below.
+				continue
+				;;
+			'`')
+				synerr 'Backquoted (old-style) %s' \
+					'command substitution not supported'
+				break
+				;;
+			\\)
+				word="${word}${c}"
+				lgetc
+				case "${c}" in '')
+					# Bash, ksh93, mksh, and zsh ignore a
+					# backslash at the end of a file, but
+					# dash and BusyBox ash include it in the
+					# word.  To help with script
+					# portability, we'll throw an error
+					# (which is a reasonable thing to do
+					# anyway).
+					synerr 'Unexpected end of file %s' \
+						'after "\"'
+					;;
+				esac
+				word="${word}${c}"
+				;;
+			\')
+				word="${word}${c}"
+				while ! ${quoted}; do  # "'" is literal inside "..."
+					lgetc
+					word="${word}${c}"
+					case "${c}" in
+						'')
+							synerr '%s %s' \
+								'Unterminated' \
+								'quoted string'
+							;;
+						\')
+							break
+							;;
+					esac
+				done
+				;;
+			'"')
+				word="${word}${c}"
+				if ${quoted}; then
+					quoted=false
+				else
+					quoted=true
+				fi
+				;;
+			'}')
+				if ${in_param} && ! ${quoted}; then
+					break
+				fi
+				word="${word}${c}"
+				;;
+			*)
+				word="${word}${c}"
+				;;
+		esac
+		lgetc
+	done
+
+	if ${quoted}; then
+		synerr 'Unterminated quoted string'
+	fi
+
+	printf "%d${RS}%c${RS}%s" ${lines} "${c}" "${word}"
+}
+
+scan_wordexp()  # Scan what follows "$"; print "<lines>RS<next char>RS<text>".
+{
+	local res=
+	local toks=
+	local param=
+
+	wordexp='$'  # a "$" that starts no expansion stays a literal "$"
+	ln_off=0
+	case "${c}" in
+		'{')
+			# Parameter expansion brace
+			scan_wordexp_param_brace
+			;;
+		'(')
+			# Arithmetic expansion or command substitution
+			lgetc
+			case "${c}" in
+				'(')
+					# Arithmetic expansion
+					scan_wordexp_arith
+					;;
+				*)
+					# Command substitution
+					if ! res="$(run_sublexer "sub${fname}" \
+							${lineno} "${start}" \
+							"${c}")"; then
+						exit 1
+					fi
+					# Sublexer reports an absolute line.
+					ln_off=$((${res%%${RS}*} - ${lineno}))
+					res="${res#*${RS}}"
+					c="${res%%${RS}*}"
+					res="${res#*${RS}}"
+					toks="${res%%${RS}*}"
+					lineno=$((${lineno} + ${ln_off}))
+					wordexp="\$(${SOH}C${STX}${toks}${ETX})"
+					# ")" is recognized in run_sublexer().
+					;;
+			esac
+			;;
+		[@*#?$!A-Za-z0-9_-])
+			if ! res="$(scan_param)"; then
+				exit 1
+			fi
+			ln_off=${res%%${RS}*}
+			res="${res#*${RS}}"
+			c="${res%%${RS}*}"
+			res="${res#*${RS}}"
+			param="${res%%${RS}*}"
+			lineno=$((${lineno} + ${ln_off}))
+			wordexp="\$${param}"
+			;;
+	esac
+
+	# Callers add the first field to their own lineno, so it is an offset.
+	printf "%d${RS}%c${RS}%s" ${ln_off} "${c}" "${wordexp}"
+	return 0
+}
+
+scan_wordexp_param_brace()  # Scan "${...}" into ${wordexp}; ${c} is the "{".
+{
+	local mod=
+	local res=
+	local param=
+	local word=
+
+	mod=true
+
+	lgetc
+	case "${c}" in
+		'#')
+			lgetc
+			case "${c}" in
+				[@*#?$!A-Za-z0-9_-])
+					# String length expansion
+					if ! res="$(scan_param)"; then
+						exit 1
+					fi
+					ln_off=${res%%${RS}*}
+					res="${res#*${RS}}"
+					c="${res%%${RS}*}"
+					res="${res#*${RS}}"
+					param="#${res%%${RS}*}"  # keep the "#"
+					lineno=$((${lineno} + ${ln_off}))
+					# Disable modifications.
+					mod=false
+					;;
+				*)
+					# Special parameter "#"
+					param='#'
+					;;
+			esac
+			;;
+		*)
+			if ! res="$(scan_param)"; then
+				exit 1
+			fi
+			ln_off=${res%%${RS}*}
+			res="${res#*${RS}}"
+			c="${res%%${RS}*}"
+			res="${res#*${RS}}"
+			param="${res%%${RS}*}"
+			lineno=$((${lineno} + ${ln_off}))
+			;;
+	esac
+	wordexp="\${${param}"
+
+	# If modifications are allowed
+	if ${mod}; then
+		# Check for modifications.
+		mod=false
+		case "${c}" in
+			':')  # ":" optionally followed by one of - = ? +
+				mod=true
+				wordexp="${wordexp}${c}"
+				lgetc
+				case "${c}" in '-'|'='|'?'|'+')
+					wordexp="${wordexp}${c}"
+					lgetc
+				;;
+				esac
+				;;
+			'-'|'='|'?'|'+')
+				mod=true
+				wordexp="${wordexp}${c}"
+				lgetc
+				;;
+			'%')  # "%" or "%%"
+				mod=true
+				wordexp="${wordexp}${c}"
+				lgetc
+				case "${c}" in '%')
+					wordexp="${wordexp}${c}"
+					lgetc
+					;;
+				esac
+				;;
+			'#')  # "#" or "##"
+				mod=true
+				wordexp="${wordexp}${c}"
+				lgetc
+				case "${c}" in '#')
+					wordexp="${wordexp}${c}"
+					lgetc
+					;;
+				esac
+				;;
+		esac
+	fi
+
+	# If a modification was found
+	if ${mod}; then
+		# Get word.
+		if ! res="$(scan_word true)"; then
+			exit 1
+		fi
+		ln_off=${res%%${RS}*}
+		res="${res#*${RS}}"
+		c="${res%%${RS}*}"
+		res="${res#*${RS}}"
+		word="${res%%${RS}*}"
+		# We must advance lineno because scan_word() was run in a
+		# subshell.
+		lineno=$((${lineno} + ${ln_off}))
+		wordexp="${wordexp}${word}"
+		dbg "param mod word: '$word'"
+	fi
+
+	# Check for right brace.
+	case "${c}" in
+		'}')
+			wordexp="${wordexp}${c}"
+			lgetc
+			;;
+		*)
+			synerr 'Missing "}"'
+			;;
+	esac
+
+	return 0
+}
+
+scan_param()  # Scan a parameter name at ${c}; print "0RS<next char>RS<name>".
+{
+	local param=
+
+	param=''
+	case "${c}" in
+		[@*#?$!0-])
+			# Special parameter
+			param="${c}"
+			lgetc
+			;;
+		[1-9])
+			# Positional parameter
+			param="${param}${c}"
+			lgetc
+			while :; do  # take every following digit
+				case "${c}" in [!0-9])
+					break
+					;;
+				esac
+				param="${param}${c}"
+				lgetc
+			done
+			;;
+		[A-Za-z_])
+			# Parameter name
+			param="${param}${c}"
+			lgetc
+			while :; do  # take every following name character
+				case "${c}" in [!A-Za-z0-9_])
+					break
+					;;
+				esac
+				param="${param}${c}"
+				lgetc
+			done
+			;;
+		*)
+			synerr 'Bad parameter name'
+			;;
+	esac
+
+	printf "%d${RS}%c${RS}%s" 0 "${c}" "${param}"  # line offset is always 0
+	return 0
+}
+
+scan_wordexp_arith()  # Scan "$((...))" into ${wordexp}; ${c} is the 2nd "(".
+{
+	local arith=
+	local paren_lvl=0
+	local res=
+	local sub_wordexp=
+
+	lgetc
+	while :; do
+		case "${c}" in
+			'')
+				synerr 'end of file unexpected (%s)' \
+					'expecting "))"'
+				;;
+			'(')
+				arith="${arith}${c}"
+				paren_lvl=$((${paren_lvl} + 1))
+				;;
+			')')
+				if [ ${paren_lvl} -eq 0 ]; then
+					lgetc
+					case "${c}" in ')')
+						wordexp="\$((${arith}))"
+						lgetc
+						return 0
+						;;
+					esac
+					synerr 'Arithmetic expansion: ")" %s' \
+						'unexpected'
+				fi
+				arith="${arith}${c}"
+				paren_lvl=$((${paren_lvl} - 1))
+				;;
+			'$')
+				lgetc
+				if ! res=$(scan_wordexp); then
+					exit 1
+				fi
+				ln_off=${res%%${RS}*}
+				res="${res#*${RS}}"
+				c="${res%%${RS}*}"
+				res="${res#*${RS}}"
+				sub_wordexp="${res%%${RS}*}"
+				# We must advance lineno because scan_wordexp()
+				# was run in a subshell.
+				lineno=$((${lineno} + ${ln_off}))
+				arith="${arith}${sub_wordexp}"
+				continue  # ${c} already holds the next character
+				;;
+			*)
+				arith="${arith}${c}"
+				;;
+		esac
+		lgetc
+	done
+}
+
+run_sublexer()  # Lex a "$(...)" body; print "<lineno>RS<next char>RS<tokens>".
+{
+	local fn="${1}"  # name to report in error messages
+	local ln="${2}"  # line number to start counting from
+	local st="${3}"  # name of the start function to run
+	local ch="${4}"  # first character of the body, already read
+	shift 4
+
+	# Initialize global variables.
+	fname="${fn}"
+	lineno=${ln}
+	start="${st}"
+	here_queue=''
+	here_awaiting_end=false
+	here_awaiting_word=false
+	tokens=''
+
+	c="${ch}"
+	next
+
+	#dbg=true
+	# If this returns (does not exit), there are no errors.
+	${start}
+	case "${tok%${US}*}" in
+		T_RPAREN)  # the ")" that closes the substitution
+			;;
+		*)
+			synerr 'Missing ")"'
+			;;
+	esac
+
+	printf "%d${RS}%c${RS}%s" ${lineno} "${c}" "${tokens}"  # final lineno
+	return 0
+}
+
+#
+# Interface
+#
+
+run_lexer()  # Lex stdin by running ${2}; print the accepted tokens.
+{
+	local fn="${1}"  # name to report in error messages ("-" means stdin)
+	local st="${2}"  # name of the start function to run
+	shift 2
+
+	# Initialize global variables.
+	fname="${fn}"
+	lineno=1
+	start="${st}"
+	here_queue=''
+	here_awaiting_end=false
+	here_awaiting_word=false
+	tokens=''
+
+	# Read the first character and recognize the first token.
+	lgetc
+	next
+
+	if ! ${start}; then
+		# Unexpected EOF
+		synexp ''
+	fi
+	if ! accept T_EOF; then  # input remains after the start function
+		synexp ''
+	fi
+
+	# Return the tokens.
+	printf '%s' "${tokens}"
+
+	return 0
+}
+
+accept()  # Consume ${tok} if it is, or can be read as, token type ${1}.
+{
+	local t="${1}"  # wanted token type
+	shift 1
+	local rw=
+
+	dbg "looking for $t, current tok ${tok%%${US}*}"
+	case "${t}" in
+		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|\
+		T_DO|T_DONE|T_CASE|T_ESAC|T_WHILE|T_UNTIL|\
+		T_FOR|T_LBRACE|T_RBRACE|T_BANG|T_IN)
+			dbg "looking for reserved word $t, have '$tok'"
+			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
+				# Reserved words are recognized as literal
+				# T_WORDs.
+				if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
+					return 1
+				fi
+				# T_WORD data unit must match reserved word
+				# exactly.
+				if ! [ "x${tok#T_WORD${US}}" = \
+						"x$(toktext "${t}")" ]; then
+					return 1
+				fi
+				# If the token matches the reserved word,
+				# replace it with the reserved word token.
+				tok="${t}"
+			fi
+			;;
+		T_NAME)
+			# Names are recognized as literal T_WORDs.
+			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
+				return 1
+			fi
+			# Validate name.
+			case "${tok#T_WORD${US}}" in  # the word, not its type
+				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
+					return 1
+					;;
+				*)
+					;;
+			esac
+			tok="T_NAME${US}${tok#T_WORD${US}}"
+			;;
+		T_FNAME)
+			# Function names are recognized as literal T_WORDs.
+			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
+				return 1
+			fi
+			# Validate name.
+			case "${tok#T_WORD${US}}" in  # the word, not its type
+				''|[!A-Za-z_]*|*[!0-9A-Za-z_]*)
+					return 1
+					;;
+				*)
+					;;
+			esac
+			# Verify that the function name doesn't match any
+			# reserved words.
+			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
+					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
+					T_LBRACE T_RBRACE T_BANG T_IN; do
+				if [ "x${tok#T_WORD${US}}" = \
+						"x$(toktext "${rw}")" ]; then
+					tok="${rw}"
+					return 1
+				fi
+			done
+			tok="T_FNAME${US}${tok#T_WORD${US}}"
+			;;
+		T_CMDNAME)
+			# The first word of a simple command is to be checked
+			# for reserved words.
+			if ! [ "x${tok%%${US}*}" = 'xT_WORD' ]; then
+				return 1
+			fi
+			# Verify that the word doesn't match any reserved words.
+			for rw in T_IF T_THEN T_ELSE T_ELIF T_FI T_DO T_DONE \
+					T_CASE T_ESAC T_WHILE T_UNTIL T_FOR \
+					T_LBRACE T_RBRACE T_BANG T_IN; do
+				if [ "x${tok#T_WORD${US}}" = \
+						"x$(toktext "${rw}")" ]; then
+					tok="${rw}"
+					return 1
+				fi
+			done
+			;;
+		*)
+			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
+				return 1
+			fi
+			;;
+	esac
+
+	dbg "accept $t"
+	tokens="${tokens}${tok}${RS}"  # record the token, then advance
+	next
+	return 0
+}
+
+expect()  # Like accept(), but a mismatch is reported through synexp().
+{
+	local t="${1}"  # required token type
+	shift 1
+
+	if accept "${t}"; then
+		return 0
+	else
+		synexp "${t}"
+	fi
+}