# Eggshell lexer
#
# Copyright (C) 2016  Patrick "P. J." McDermott
#
# This file is part of the Eggshell Compiler.
#
# The Eggshell Compiler is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# The Eggshell Compiler is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the Eggshell Compiler.  If not, see
# <http://www.gnu.org/licenses/>.

#dbg=false

# Name of the input being lexed, as printed in error messages ("-" is
# reported as "stdin").
fname=
# Current line number, as printed in error messages.
lineno=
# Number of lines consumed by the most recent scanner that was run in a
# subshell; the caller adds it to lineno.
ln_off=
# Name of the command run by run_lexer() and run_sublexer() once the first
# token has been recognized.
start=
# Current input character (the empty string at end of input).
c=
# Text of the word expansion most recently built by scan_wordexp().
wordexp=
# Pending here-documents.  Each entry is terminated by ${RS} and consists of
# three ${US}-separated fields: a strip-leading-tabs flag, the delimiter word,
# and a flag telling whether the delimiter was quoted or escaped.
here_queue=
# "true" between a "<<" or "<<-" operator and its delimiter word.
here_awaiting_end=
# "true" when the next token to read is the body of a queued here-document.
here_awaiting_word=
# Current token: a token type, optionally followed by ${US} and its text.
tok=

#dbg()
#{
#	if ${dbg}; then
#		printf 'DEBUG: %s\n' "${@}" >&2
#	fi
#}

#
# Error handling (used by scanning and interface functions)
#

# Print a formatted error message, prefixed with the input name and the
# current line number, to standard error, and exit with status 1.
#   ${1}   printf-style format string
#   ${@}   arguments for the format string
error()
{
	local fmt="${1}"
	shift 1

	{
		# A file name of "-" denotes standard input.
		if [ "x${fname}" = 'x-' ]; then
			printf "stdin:%d: ${fmt}\n" ${lineno} "${@}"
		else
			printf "%s:%d: ${fmt}\n" "${fname}" ${lineno} "${@}"
		fi
	} >&2

	# The parser and lexer run in a subshell, so exiting here unwinds to
	# the caller much like an exception would.
	exit 1
}

# Report that the current token is unexpected.
#   ${1}   the token type that was expected instead, or the empty string if
#          no particular token was expected
synexp()
{
	local t="${1}"
	shift 1

	case "${t}" in
		'')
			synerr '%s unexpected' "$(tokname "${tok}")"
			;;
		*)
			synerr '%s unexpected (expecting %s)' \
				"$(tokname "${tok}")" "$(tokname "${t}")"
			;;
	esac
}

# Report a syntax error through error().
#   ${1}   printf-style format string (prefixed with "Syntax error: ")
#   ${@}   arguments for the format string
synerr()
{
	local msg="Syntax error: ${1}"
	shift 1

	error "${msg}" "${@}"
}

#
# Input reading
#

# Read the next character of the input buffer into c and advance lbufi.
# At the end of the buffer, c is set to the empty string and lbufi is left
# unchanged.
lgetc()
{
	if ! [ ${lbufi} -ge ${lbufc} ]; then
		# Characters are stored one per variable: lbufv_0, lbufv_1, ...
		eval "c=\${lbufv_${lbufi}}"
		lbufi=$((${lbufi} + 1))
		return
	fi
	c=''
}

# Step back one character: reload c with the character before the one most
# recently read and leave lbufi pointing just past it.
lungetc()
{
	local prev=$((${lbufi} - 2))

	eval "c=\${lbufv_${prev}}"
	lbufi=$((${prev} + 1))
}

# Reload c from the buffer position just before lbufi.  This is used after
# lbufi has been assigned from a scanner's result, so that c matches the new
# position; lbufi itself keeps its value.
lsetc()
{
	local cur=$((${lbufi} - 1))

	eval "c=\${lbufv_${cur}}"
	lbufi=$((${cur} + 1))
}

#
# Token recognition
#

# Recognize the next token, starting at the current character c, and store
# it in tok.  Blanks, comments, and backslash-newline pairs between tokens
# are skipped.  On return, c holds the first character after the token.
next()
{
	# A queued here-document body is read before any ordinary token.
	if ${here_awaiting_word}; then
		next_here
		return
	fi

	while :; do
		case "${c}" in
			'')
				# End of input.
				lgetc
				tok=T_EOF
				return
				;;
			"${LF}")
				if ${here_awaiting_end}; then
					# A here-document operator still lacks
					# its delimiter word.
					synexp ''
				else
					# If any here-document is queued, its
					# body starts on the next line.
					case "${here_queue}" in *"${RS}"*)
						here_awaiting_end=false
						here_awaiting_word=true
						;;
					esac
				fi
				tok=T_NEWLINE
				lineno=$((${lineno} + 1))
				lgetc
				return
				;;
			' '|"${HT}")
				lgetc
				continue
				;;
			\\)
				lgetc
				if [ "x${c}" = "x${LF}" ]; then
					# Line continuation.
					lineno=$((${lineno} + 1))
					lgetc
					continue
				fi
				# Otherwise the backslash begins a word.
				lungetc
				next_word
				return
				;;
			'#')
				# Skip the comment, up to but not including
				# the newline or end of input.
				while :; do
					lgetc
					case "${c}" in "${LF}"|'')
						break
						;;
					esac
				done
				continue
				;;
			'&')
				lgetc
				if [ "x${c}" = 'x&' ]; then
					lgetc
					tok=T_AND_IF
				else
					tok=T_AND
				fi
				return
				;;
			'|')
				lgetc
				if [ "x${c}" = 'x|' ]; then
					lgetc
					tok=T_OR_IF
				else
					tok=T_PIPE
				fi
				return
				;;
			';')
				lgetc
				if [ "x${c}" = 'x;' ]; then
					lgetc
					tok=T_DSEMI
				else
					tok=T_SEMI
				fi
				return
				;;
			'(')
				tok=T_LPAREN
				lgetc
				return
				;;
			')')
				tok=T_RPAREN
				lgetc
				return
				;;
			'<'|'>')
				next_io
				return
				;;
			',')
				tok=T_COMMA
				lgetc
				return
				;;
			*)
				next_word
				return
				;;
		esac
	done
}

# Read the body of the first queued here-document and store it in tok as a
# T_WORD.  Reading starts at the current character c and stops at the
# newline that ends the delimiter line; that newline is left in c for next()
# to recognize.  The delimiter line itself is included in the word text.
#
# The queue entry supplies three fields: whether leading tabs are stripped,
# the delimiter word (quotes and backslashes are removed before comparing),
# and whether the delimiter was quoted or escaped, in which case "$" is
# taken literally instead of starting a word expansion.
next_here()
{
	local here=
	local here_strip_tabs=
	local here_end=
	local here_escaped=
	local line=
	local word=
	local res=

	# Dequeue the here-document.
	here="${here_queue%%${RS}*}"
	here_strip_tabs="${here%%${US}*}"
	here_end="${here%${US}*}"
	here_end="$(printf '%s' "${here_end#*${US}}" | \
		sed 's/\\//g; s/"//g; s/'\''//g;')"  # Stupid Vim: ')"
	here_escaped="${here##*${US}}"
	here_queue="${here_queue#*${RS}}"
	here_awaiting_word=false

	line=''
	word=''
	while :; do
		case "${c}" in
			'')
				# Bash throws a warning when EOF occurs in a
				# here document.  mksh throws an error.  dash,
				# BusyBox ash, ksh93, and zsh accept EOF as a
				# delimiter.  We aim for the lowest common
				# denominator, so throw an error like mksh does.
				synerr 'Here-document "%s" unclosed' \
					"${here_end}"
				;;
			"${LF}")
				word="${word}${line}"
				case "${line}" in "${here_end}")
					# The newline after the delimiter is
					# left in c, so next() counts it.
					tok="T_WORD${US}${word}"
					return
					;;
				esac
				# Count the body line that just ended, so that
				# later diagnostics report the right line.
				lineno=$((${lineno} + 1))
				word="${word}${c}"
				line=''
				;;
			"${HT}")
				if ${here_strip_tabs}; then
					# Drop tabs only at the start of a line.
					case "${line}" in
						'')
							;;
						*)
							line="${line}${c}"
							;;
					esac
				else
					line="${line}${c}"
				fi
				;;
			'$')
				if ! ${here_escaped}; then
					lgetc
					if ! res="$(scan_wordexp)"; then
						exit 1
					fi
					# The result holds the line offset,
					# the new buffer index, and the
					# expansion text, separated by ${RS}.
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					lbufi="${res%%${RS}*}"
					lsetc
					res="${res#*${RS}}"
					lineno=$((${lineno} + ${ln_off}))
					line="${line}${res}"
					# c already holds the character after
					# the expansion; skip the lgetc below.
					continue
				else
					line="${line}${c}"
				fi
				;;
			*)
				line="${line}${c}"
				;;
		esac
		lgetc
	done
}

# Recognize a redirection operator starting at the current character c
# ("<" or ">") and store its token type in tok.  On return, c holds the
# first character after the operator.
#
# For "<<" and "<<-", a new here-document entry is started in here_queue
# (beginning with its strip-leading-tabs flag) and here_awaiting_end is set
# so that the following word is recorded as the delimiter.
#
# Each operator leaves the function with "return".  "break" must not be
# used for that: there is no loop within this function, and whether "break"
# affects a loop in the calling function is unspecified (in bash it does
# not, so execution would fall through to the shorter operators).
next_io()
{
	case "${c}" in
		'<')
			lgetc
			case "${c}" in
				'<')
					lgetc
					case "${c}" in '-')
						lgetc
						tok=T_DLESSDASH
						here_queue="${here_queue}true"
						here_awaiting_end=true
						here_awaiting_word=false
						return 0
						;;
					esac
					tok=T_DLESS
					here_queue="${here_queue}false"
					here_awaiting_end=true
					here_awaiting_word=false
					return 0
					;;
				'&')
					lgetc
					tok=T_LESSAND
					return 0
					;;
				'>')
					lgetc
					tok=T_LESSGREAT
					return 0
					;;
			esac
			tok=T_LESS
			return 0
			;;
		'>')
			lgetc
			case "${c}" in
				'>')
					lgetc
					tok=T_DGREAT
					return 0
					;;
				'&')
					lgetc
					tok=T_GREATAND
					return 0
					;;
				'|')
					lgetc
					tok=T_CLOBBER
					return 0
					;;
			esac
			tok=T_GREAT
			return 0
			;;
	esac
}

# Scan a word starting at the current character and store it in tok as a
# T_WORD.  If a here-document operator is waiting for its delimiter, the
# word also completes the pending entry in here_queue.
next_word()
{
	local res=
	local delim_quoted=

	res="$(scan_word false)" || exit 1

	# The result holds the line offset, the new buffer index, and the
	# word text, separated by ${RS}.
	ln_off=${res%%${RS}*}
	res="${res#*${RS}}"
	lbufi="${res%%${RS}*}"
	res="${res#*${RS}}"
	lsetc

	# We must advance lineno because scan_word() was run in a subshell.
	lineno=$((${lineno} + ${ln_off}))
	tok="T_WORD${US}${res}"

	if ! ${here_awaiting_end}; then
		return 0
	fi

	# Record the delimiter and whether any part of it was quoted or
	# escaped, then terminate the queue entry.
	case "${res}" in
		*\\*|*'"'*|*"'"*)
			delim_quoted=true
			;;
		*)
			delim_quoted=false
			;;
	esac
	here_queue="${here_queue}${US}${res}${US}${delim_quoted}${RS}"
	here_awaiting_end=false
}

#
# Token scanning
#

# Scan a word starting at the current character c and print the result as
# "<lines>${RS}<lbufi>${RS}<word>", where <lines> is the number of newlines
# consumed, <lbufi> is the buffer index after the word, and <word> is the
# word text, with its quoting characters retained.
#   ${1}   "true" when scanning the word of a "${...}" parameter expansion:
#          unquoted blanks, newlines, and operator characters then belong
#          to the word, and an unquoted "}" ends it
#
# This function is meant to be run in a command substitution; the caller
# applies the printed line offset and buffer index to its own state.
scan_word()
{
	local in_param="${1}"
	shift 1
	local lines=
	local word=
	local quoted=
	local res=

	lines=0
	word=''
	quoted=false

	while :; do
		#dbg "parsing word char '$c' at lineno $lineno"
		case "${c}" in
			'')
				break
				;;
			"${LF}")
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				lineno=$((${lineno} + 1))
				lines=$((${lines} + 1))
				word="${word}${c}"
				;;
			' '|"${HT}"|'&'|'|'|';'|'('|')'|'<'|'>'|',')
				if ! ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			'$')
				# NOTE(review): this only fires when here_queue
				# already holds a completed entry, so it looks
				# like the first delimiter on a line is not
				# checked -- confirm whether that is intended.
				case "${here_queue}" in *"${RS}"*)
					if ${here_awaiting_end}; then
						synerr '%s %s %s %s' \
							'Word expansions' \
							'not supported in' \
							'here-document' \
							'delimiters'
					fi
					;;
				esac
				lgetc
				if ! res="$(scan_wordexp)"; then
					exit 1
				fi
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				lbufi="${res%%${RS}*}"
				lsetc
				res="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				lines=$((${lines} + ${ln_off}))
				word="${word}${res}"
				# scan_wordexp() leaves behind an unused
				# character, so we should skip the lgetc() call
				# below.
				continue
				;;
			'`')
				synerr 'Backquoted (old-style) %s' \
					'command substitution not supported'
				break
				;;
			\\)
				#dbg 'first backslash in word'
				word="${word}${c}"
				lgetc
				#dbg "next char: '$c'"
				case "${c}" in
					'')
						# Bash, ksh93, mksh, and zsh
						# ignore a backslash at the end
						# of a file, but dash and
						# BusyBox ash include it in the
						# word.  To help with script
						# portability, we'll throw an
						# error (which is a reasonable
						# thing to do anyway).
						synerr 'Unexpected end of file %s' \
							'after "\"'
						;;
					"${LF}")
						# A newline consumed after a
						# backslash must be counted
						# like any other newline.
						lineno=$((${lineno} + 1))
						lines=$((${lines} + 1))
						;;
				esac
				word="${word}${c}"
				;;
			\')
				word="${word}${c}"
				# Within double quotes, a single quote is an
				# ordinary character.
				if ${quoted}; then
					lgetc
					continue
				fi
				# Copy everything up to the closing quote.
				while :; do
					lgetc
					word="${word}${c}"
					case "${c}" in
						'')
							synerr '%s %s' \
								'Unterminated' \
								'quoted string'
							;;
						"${LF}")
							lineno=$((${lineno} + 1))
							lines=$((${lines} + 1))
							;;
						\')
							break
							;;
					esac
				done
				;;
			'"')
				word="${word}${c}"
				if ${quoted}; then
					quoted=false
				else
					quoted=true
				fi
				;;
			'}')
				if ${in_param} && ! ${quoted}; then
					break
				fi
				word="${word}${c}"
				;;
			*)
				word="${word}${c}"
				;;
		esac
		lgetc
	done

	if ${quoted}; then
		synerr 'Unterminated quoted string'
	fi

	printf "%d${RS}%d${RS}%s" ${lines} ${lbufi} "${word}"
}

# Scan a word expansion.  The "$" has already been consumed and c holds the
# character after it.  The result is printed as
# "<ln_off>${RS}<lbufi>${RS}<text>", where <ln_off> is the number of lines
# consumed, <lbufi> is the buffer index after the expansion, and <text> is
# the expansion text including the leading "$".
#
# A "$" that does not start a parameter expansion, arithmetic expansion, or
# command substitution is kept as a literal "$"; nothing else is consumed.
#
# This function is meant to be run in a command substitution; the caller
# applies the printed line offset and buffer index to its own state.
scan_wordexp()
{
	local res=

	wordexp=''
	ln_off=0
	case "${c}" in
		'{')
			# Parameter expansion brace
			scan_wordexp_param_brace
			;;
		'(')
			# Arithmetic expansion or command substitution
			lgetc
			case "${c}" in
				'(')
					# Arithmetic expansion
					scan_wordexp_arith
					;;
				*)
					# Command substitution
					if ! res="$(run_sublexer "sub${fname}" \
							${lineno} "${start}" \
							${lbufi})"; then
						exit 1
					fi
					# The sub-lexer's output ends with the
					# final line number and buffer index.
					lbufi="${res##*${RS}}"
					lsetc
					res="${res%${RS}*}"
					ln_off=${res##*${RS}}
					res="${res%${RS}*}"
					ln_off=$((${ln_off} - ${lineno}))
					lineno=$((${lineno} + ${ln_off}))
					wordexp="\$(${SOH}C${STX}${res}"
					wordexp="${wordexp}${ETX})"
					# ")" is recognized in run_sublexer().
					;;
			esac
			;;
		[@*#?\$!A-Za-z0-9_-])
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			lbufi="${res%%${RS}*}"
			lsetc
			res="${res#*${RS}}"
			lineno=$((${lineno} + ${ln_off}))
			wordexp="\$${res}"
			;;
		*)
			# Not the start of an expansion (for example "$"
			# before a blank, a quote, or the end of input): the
			# "$" stands for itself and must not be dropped.
			wordexp='$'
			;;
	esac

	printf "%d${RS}%d${RS}%s" ${ln_off} ${lbufi} "${wordexp}"
	return 0
}

# Scan a braced parameter expansion.  On entry, c holds the "{" that follows
# the "$".  The text of the expansion, from "${" through "}", is stored in
# wordexp, and c is left holding the character after the closing brace.
# A missing "}" is reported as a syntax error.
scan_wordexp_param_brace()
{
	local mod=
	local res=
	local param=

	# Until shown otherwise, a modification may follow the parameter.
	mod=true

	lgetc
	case "${c}" in
		'#')
			# Either the string length operator or the special
			# parameter "#"; the next character decides.
			lgetc
			case "${c}" in
				[@*#?\$!A-Za-z0-9_-])
					# String length expansion
					if ! res="$(scan_param)"; then
						exit 1
					fi
					# The result holds the line offset,
					# the new buffer index, and the
					# parameter, separated by ${RS}.
					ln_off=${res%%${RS}*}
					res="${res#*${RS}}"
					lbufi="${res%%${RS}*}"
					lsetc
					res="${res#*${RS}}"
					param="#${res}"
					lineno=$((${lineno} + ${ln_off}))
					# Disable modifications.
					mod=false
					;;
				*)
					# Special parameter "#"
					param='#'
					;;
			esac
			;;
		*)
			if ! res="$(scan_param)"; then
				exit 1
			fi
			ln_off=${res%%${RS}*}
			res="${res#*${RS}}"
			lbufi="${res%%${RS}*}"
			lsetc
			res="${res#*${RS}}"
			param="${res}"
			lineno=$((${lineno} + ${ln_off}))
			;;
	esac
	wordexp="\${${param}"

	# If modifications are allowed
	if ${mod}; then
		# Check for modifications.  From here on, mod records whether
		# a modification operator was actually found.
		mod=false
		case "${c}" in
			':')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				# The colon may be followed by one of the
				# operator characters.
				case "${c}" in '-'|'='|'?'|'+')
					wordexp="${wordexp}${c}"
					lgetc
				;;
				esac
				;;
			'-'|'='|'?'|'+')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				;;
			'%')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				# "%%" is also accepted.
				case "${c}" in '%')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
			'#')
				mod=true
				wordexp="${wordexp}${c}"
				lgetc
				# "##" is also accepted.
				case "${c}" in '#')
					wordexp="${wordexp}${c}"
					lgetc
					;;
				esac
				;;
		esac
	fi

	# If a modification was found
	if ${mod}; then
		# Get word.  In parameter mode, scan_word() stops at the
		# unquoted closing brace.
		if ! res="$(scan_word true)"; then
			exit 1
		fi
		ln_off=${res%%${RS}*}
		res="${res#*${RS}}"
		lbufi="${res%%${RS}*}"
		lsetc
		res="${res#*${RS}}"
		# We must advance lineno because scan_word() was run in a
		# subshell.
		lineno=$((${lineno} + ${ln_off}))
		wordexp="${wordexp}${res}"
		#dbg "param mod word: '$res'"
	fi

	# Check for right brace.
	case "${c}" in
		'}')
			wordexp="${wordexp}${c}"
			lgetc
			;;
		*)
			synerr 'Missing "}"'
			;;
	esac

	return 0
}

# Scan a parameter (a special parameter, a positional parameter, or a
# parameter name) starting at the current character c and print the result
# as "0${RS}<lbufi>${RS}<param>", where <lbufi> is the buffer index after
# the parameter.  Any other character is reported as a syntax error.
#
# This function is meant to be run in a command substitution; the caller
# applies the printed buffer index to its own state.
scan_param()
{
	local param=

	param=''
	case "${c}" in
		[@*#?\$!0-])
			# Special parameter
			param="${c}"
			lgetc
			;;
		[1-9])
			# Positional parameter
			param="${param}${c}"
			lgetc
			while :; do
				# The empty string marks the end of input.  It
				# is matched by no bracket expression, so it
				# must be tested for explicitly or this loop
				# would never end.
				case "${c}" in ''|[!0-9])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		[A-Za-z_])
			# Parameter name
			param="${param}${c}"
			lgetc
			while :; do
				# See above regarding the end of input.
				case "${c}" in ''|[!A-Za-z0-9_])
					break
					;;
				esac
				param="${param}${c}"
				lgetc
			done
			;;
		*)
			synerr 'Bad parameter name'
			;;
	esac

	printf "%d${RS}%d${RS}%s" 0 ${lbufi} "${param}"
	return 0
}

# Scan an arithmetic expansion.  On entry, c holds the second "(" of "$((".
# The complete expansion text is stored in wordexp, and c is left holding
# the character after the closing "))".  Parentheses nested within the
# expression are tracked so that only an unnested "))" ends the expansion.
scan_wordexp_arith()
{
	local expr=
	local depth=
	local res=

	expr=''
	depth=0
	lgetc
	while :; do
		case "${c}" in
			'')
				synerr 'end of file unexpected (%s)' \
					'expecting "))"'
				;;
			'$')
				# Nested word expansion
				lgetc
				res="$(scan_wordexp)" || exit 1
				ln_off=${res%%${RS}*}
				res="${res#*${RS}}"
				lbufi="${res%%${RS}*}"
				lsetc
				res="${res#*${RS}}"
				# We must advance lineno because scan_wordexp()
				# was run in a subshell.
				lineno=$((${lineno} + ${ln_off}))
				expr="${expr}${res}"
				# c already holds the next character.
				continue
				;;
			')')
				if [ ${depth} -eq 0 ]; then
					# Only "))" may close the expansion.
					lgetc
					if [ "x${c}" = 'x)' ]; then
						wordexp="\$((${expr}))"
						lgetc
						return 0
					fi
					synerr 'Arithmetic expansion: ")" %s' \
						'unexpected'
				fi
				depth=$((${depth} - 1))
				expr="${expr}${c}"
				;;
			'(')
				depth=$((${depth} + 1))
				expr="${expr}${c}"
				;;
			*)
				expr="${expr}${c}"
				;;
		esac
		lgetc
	done
}

# Lex and parse the body of a command substitution, continuing in the
# already loaded input buffer.
#   ${1}   input name for error messages
#   ${2}   line number at which the substitution starts
#   ${3}   name of the command to run once the first token is recognized
#   ${4}   buffer index at which to resume reading
# Whatever ${3} prints is followed by "${RS}<lineno>${RS}<lbufi>", the line
# number and buffer index reached.  The token left current by ${3} must be
# the closing ")".
run_sublexer()
{
	local fn="${1}"
	local ln="${2}"
	local st="${3}"
	local i="${4}"
	shift 4

	# Initialize global variables; no here-document is pending.
	here_awaiting_word=false
	here_awaiting_end=false
	here_queue=''
	start="${st}"
	lineno=${ln}
	fname="${fn}"

	# Resume reading at the given index and recognize the first token.
	lbufi="${i}"
	lsetc
	next

	#dbg=true
	# If this returns (does not exit), there are no errors.
	${start}
	if [ "x${tok%${US}*}" != 'xT_RPAREN' ]; then
		synerr 'Missing ")"'
	fi

	printf "${RS}%d${RS}%d" ${lineno} ${lbufi}
	return 0
}

#
# Interface
#

# Load an input buffer and lex and parse it from the beginning.
#   ${1}   input name for error messages ("-" is reported as "stdin")
#   ${2}   the complete input text
#   ${3}   name of the command to run once the first token is recognized
# A failure of ${3}, or any token other than end of input remaining after
# it, is reported as a syntax error.
run_lexer()
{
	local fn="${1}"
	local buf="${2}"
	local st="${3}"
	shift 3

	# Initialize global variables.
	fname="${fn}"
	lineno=1
	start="${st}"
	here_queue=''
	here_awaiting_end=false
	here_awaiting_word=false

	# Split the input into one shell variable per character (lbufv_0,
	# lbufv_1, ...).  awk prints one single-quoted assignment for each
	# character of each input line, followed by one for the newline that
	# ends the line; single quotes in the input are escaped for the shell.
	# The printed assignments are then evaluated.
	# NOTE(review): this relies on awk splitting a record into single
	# characters when FS is empty -- confirm for the awk in use.
	eval "$(printf '%s' "${buf}" | awk -v FS='' -v j=0 \
		-v squote="'" -v esc_squote="'\\\\''" '
		{
			for (i = 1; i <= NF; ++i) {
				sub(squote, esc_squote, $i);
				printf("lbufv_%d=" squote "%s" squote "\n",
					j++, $i);
			};
			printf("lbufv_%d=" squote "\n" squote "\n", j++);
		}
		')"
	# lbufi is the index of the next character to read; lbufc is the
	# number of characters, beyond which lgetc() reports end of input.
	lbufi=0
	lbufc=${#buf}

	# Read the first character and recognize the first token.
	lgetc
	next

	if ! ${start}; then
		# Unexpected EOF
		synexp ''
	fi
	# Nothing but the end of input may remain.
	if ! accept T_EOF; then
		synexp ''
	fi

	return 0
}

# Accept the current token if it is, or can be read as, a token of the
# given type.
#   ${1}   the token type to accept
# On success, the (possibly retyped) token is printed followed by ${RS}, the
# next token is recognized, and 0 is returned.  Otherwise 1 is returned and
# nothing is printed.
#
# Reserved words, types, names, command names, and I/O numbers all arrive
# from the lexer as plain T_WORD tokens; they are validated and retyped
# here, according to what the caller asks for.  When a function or command
# name is requested but the word is a reserved word, the token is replaced
# by the reserved word's token and 1 is returned.
accept()
{
	local t="${1}"
	shift 1
	local rw=
	local word_text=

	#dbg "looking for $t, current tok ${tok%%${US}*}"
	case "${t}" in
		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|T_DO|T_DONE|\
		T_CASE|T_ESAC|T_WHILE|T_UNTIL|T_FOR|\
		T_LBRACE|T_RBRACE|T_BANG|T_IN|\
		T_USE|T_STATIC|T_LOCAL|T_RETURN)
			case "${tok%%${US}*}" in
				"${t}")
					# Already the reserved word token.
					;;
				'T_WORD')
					# The word's text must match the
					# reserved word exactly; if it does,
					# the token becomes the reserved word.
					[ "x${tok#T_WORD${US}}" = \
						"x$(toktext "${t}")" ] || \
						return 1
					tok="${t}"
					;;
				*)
					return 1
					;;
			esac
			;;
		T_VOID|T_TYPE|T_NAME|T_FNAME|T_CMDNAME|T_IO_NUMBER)
			# All of these are recognized as literal T_WORDs.
			[ "x${tok%%${US}*}" = 'xT_WORD' ] || return 1
			word_text="${tok#T_WORD${US}}"

			# Validate the text for the requested type.
			case "${t}" in
				T_VOID)
					[ "x${word_text}" = 'xvoid' ] || \
						return 1
					;;
				T_TYPE)
					case "${word_text}" in
						'bool'|'int'|'string')
							;;
						*)
							return 1
							;;
					esac
					;;
				T_NAME|T_FNAME)
					case "${word_text}" in
						[!A-Za-z_]*|*[!0-9A-Za-z_]*)
							return 1
							;;
					esac
					;;
				T_IO_NUMBER)
					case "${word_text}" in *[!0-9]*)
						return 1
						;;
					esac
					;;
			esac

			# Function and command names must not match any
			# reserved words.
			case "${t}" in T_FNAME|T_CMDNAME)
				for rw in T_IF T_THEN T_ELSE T_ELIF T_FI \
						T_DO T_DONE \
						T_CASE T_ESAC T_WHILE T_UNTIL \
						T_FOR \
						T_LBRACE T_RBRACE T_BANG T_IN \
						T_USE T_STATIC T_LOCAL \
						T_RETURN; do
					if [ "x${word_text}" = \
							"x$(toktext "${rw}")" ]
					then
						tok="${rw}"
						return 1
					fi
				done
				;;
			esac

			tok="${t}${US}${word_text}"
			;;
		*)
			[ "x${tok%%${US}*}" = "x${t}" ] || return 1
			;;
	esac

	#dbg "accept $t"
	printf '%s' "${tok}${RS}"
	next
	return 0
}

# Accept a token of the given type, or report a syntax error naming it as
# the expected token.
#   ${1}   the token type that must come next
expect()
{
	local t="${1}"
	shift 1

	accept "${t}" && return 0
	synexp "${t}"
}

# Print the given token followed by ${RS}, in the same form in which
# accept() prints accepted tokens, without reading any input.
#   ${1}   the token to print
inject()
{
	local t="${1}"
	shift 1

	printf '%s%s' "${t}" "${RS}"
	return 0
}