From f6e55a026abf33867141896b1e227e791942c2a3 Mon Sep 17 00:00:00 2001
From: P. J. McDermott <pj@pehjota.net>
Date: Fri, 19 Feb 2016 04:01:32 -0500
Subject: [WIP] Add lexer/parser demo

---
(limited to 'parsing')

diff --git a/parsing/lexer.sh b/parsing/lexer.sh
new file mode 100644
index 0000000..3081a79
--- /dev/null
+++ b/parsing/lexer.sh
@@ -0,0 +1,457 @@
+fname=   # Name of the input being lexed; error() prints '-' as "stdin"
+lineno=  # Current input line number, used to prefix error messages
+c=       # Lookahead character most recently read by pgetc()
+tok=     # Current token; a T_WORD carries its text after ${US}
+tokens=  # Tokens taken by accept(), each one followed by ${RS}
+
+pgetc()  # Read one byte from standard input into the global c.
+{
+	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"  # '.' keeps $() from eating a newline
+	c="${c%.}"  # drop the guard; c is empty when dd produced no output
+}
+
+next()  # Scan the next token from the input into the global tok.
+{
+	# Every arm below either returns with tok set or restarts the loop.
+	while :; do
+		echo "parsing char '$c'" >&2
+		case "${c}" in
+			'')  # end of input
+				pgetc
+				tok=T_EOF
+				return
+				;;
+			"${LF}")
+				pgetc
+				lineno=$((${lineno} + 1))
+				tok=T_NEWLINE
+				return
+				;;
+			' '|"${HT}")  # blanks only separate tokens
+				pgetc
+				continue
+				;;
+			\\)
+				pgetc
+				case "${c}" in "${LF}")  # line continuation
+					lineno=$((${lineno} + 1))
+					pgetc
+					continue
+					;;
+				esac
+				next_word  # NOTE(review): the backslash itself is dropped here
+				return
+				;;
+			'#')  # comment: skip up to the newline or end of input
+				pgetc
+				while :; do
+					case "${c}" in "${LF}"|'')
+						break
+						;;
+					esac
+					pgetc
+				done
+				continue
+				;;
+			'&')
+				pgetc
+				case "${c}" in '&')
+					pgetc
+					tok=T_AND_IF
+					return
+					;;
+				esac
+				tok=T_AND
+				return
+				;;
+			'|')
+				pgetc
+				case "${c}" in '|')
+					pgetc
+					tok=T_OR_IF
+					return
+					;;
+				esac
+				tok=T_PIPE
+				return
+				;;
+			';')
+				pgetc
+				case "${c}" in ';')
+					pgetc
+					tok=T_DSEMI
+					return
+					;;
+				esac
+				tok=T_SEMI
+				return
+				;;
+			'(')
+				pgetc
+				tok=T_LPAREN
+				return
+				;;
+			')')
+				pgetc
+				tok=T_RPAREN
+				return
+				;;
+			'<'|'>')  # redirection operators
+				next_io
+				return
+				;;
+			*)  # anything else starts a word
+				next_word
+				return
+				;;
+		esac
+	done
+}
+
+next_io()  # Scan the redirection operator starting at c ('<' or '>') into tok.
+{
+	case "${c}" in
+		'<')
+			pgetc
+			tok=T_LESS  # refined below when a longer operator follows
+			case "${c}" in '<')
+				pgetc
+				tok=T_DLESS
+				case "${c}" in '-')
+					pgetc
+					tok=T_DLESSDASH
+					;;
+				esac
+				;;
+			esac
+			;;
+		'>')
+			pgetc
+			tok=T_GREAT  # refined below when a second '>' follows
+			case "${c}" in '>')
+				pgetc
+				tok=T_DGREAT
+				;;
+			esac
+			;;
+	esac
+}
+
+next_word()  # Scan one word with scan_word() and store it in tok as a T_WORD.
+{
+	local reply=
+	local skipped=
+	local text=
+
+	# scan_word() prints "<lines>${RS}<lookahead>${RS}<word>".
+	reply="$(scan_word false)"
+	skipped=${reply%%${RS}*}
+	reply="${reply#*${RS}}"
+	c="${reply%%${RS}*}"
+	text="${reply#*${RS}}"
+	text="${text%%${RS}*}"
+	# scan_word() ran in a subshell; replay the newlines it consumed here.
+	lineno=$((${lineno} + ${skipped}))
+	tok="T_WORD${US}${text}"
+}
+
+scan_word()  # Scan one word starting at c; print "<lines>${RS}<lookahead>${RS}<word>".
+{
+	local in_param="${1}"  # true inside ${...}: '}' ends the word, blanks do not
+	local res=
+	local word=
+	local quoted=
+	local lines=
+	local lineno_offset=
+	local wordexp=
+
+	word=''
+	quoted=false
+	lines=0
+	while :; do
+		echo "parsing word char '$c'" >&2
+		case "${c}" in
+			'')  # end of input
+				break
+				;;
+			"${LF}")
+				if ! ${in_param} && ! ${quoted}; then
+					break
+				fi
+				lineno=$((${lineno} + 1))
+				lines=$((${lines} + 1))
+				word="${word}${c}"
+				;;
+			' '|"${HT}")
+				if ! ${in_param} && ! ${quoted}; then
+					break
+				fi
+				word="${word}${c}"
+				;;
+			'$')
+				pgetc
+				res=$(scan_wordexp)
+				lineno_offset=${res%%${RS}*}
+				res="${res#*${RS}}"
+				c="${res%%${RS}*}"
+				res="${res#*${RS}}"
+				wordexp="${res%%${RS}*}"
+				# scan_wordexp() ran in a subshell: replay its newlines.
+				lineno=$((${lineno} + ${lineno_offset}))
+				lines=$((${lines} + ${lineno_offset}))
+				word="${word}${wordexp}"
+				continue  # c already holds the next unread character
+				;;
+			\')
+				word="${word}${c}"
+				while :; do
+					pgetc
+					word="${word}${c}"
+					case "${c}" in
+						\'|'') break;;  # closing quote, or input ended
+						"${LF}")
+							lineno=$((${lineno} + 1))
+							lines=$((${lines} + 1))
+							;;
+					esac
+				done
+				;;
+			'"')
+				word="${word}${c}"
+				if ${quoted}; then
+					quoted=false
+				else
+					quoted=true
+				fi
+				;;
+			'}')
+				if ${in_param} && ! ${quoted}; then
+					break
+				fi
+				word="${word}${c}"
+				;;
+			*)
+				word="${word}${c}"
+				;;
+		esac
+		pgetc
+	done
+
+	printf "%d${RS}%s${RS}%s" ${lines} "${c}" "${word}"
+}
+
+scan_wordexp()  # Scan what follows a '$' (c is the next char); print "<lines>${RS}<lookahead>${RS}<text>".
+{
+	local wordexp=
+	local lineno_offset=
+	local mod=
+	local res=
+	local word=
+
+	wordexp='$'  # a '$' that starts no expansion stays literal
+	lineno_offset=0
+	case "${c}" in
+		'{')
+			# Parameter expansion brace
+			pgetc
+			case "${c}" in
+				'#')
+					pgetc
+					case "${c}" in
+						[@*#?\$!A-Za-z0-9_-])
+							# String length expansion: keep the
+							# '#' in front of the parameter.
+							next_param
+							param="#${param}"
+							;;
+						*)
+							# Special parameter "#"
+							param='#'
+							;;
+					esac
+					;;
+				*)
+					next_param
+					;;
+			esac
+			wordexp="\${${param}"
+			# Check for modifications
+			mod=false
+			case "${c}" in
+				':')
+					mod=true
+					wordexp="${wordexp}${c}"
+					pgetc
+					case "${c}" in '-'|'='|'?'|'+')
+						wordexp="${wordexp}${c}"
+						pgetc
+						;;
+					esac
+					;;
+				'-'|'='|'?'|'+')
+					mod=true
+					wordexp="${wordexp}${c}"
+					pgetc
+					;;
+				'%')
+					mod=true
+					wordexp="${wordexp}${c}"
+					pgetc
+					case "${c}" in '%')
+						wordexp="${wordexp}${c}"
+						pgetc
+						;;
+					esac
+					;;
+				'#')
+					mod=true
+					wordexp="${wordexp}${c}"
+					pgetc
+					case "${c}" in '#')
+						wordexp="${wordexp}${c}"
+						pgetc
+						;;
+					esac
+					;;
+			esac
+			if ${mod}; then
+				# Get word.
+				res="$(scan_word true)"
+				lineno_offset=${res%%${RS}*}
+				res="${res#*${RS}}"
+				c="${res%%${RS}*}"
+				res="${res#*${RS}}"
+				word="${res%%${RS}*}"
+				# We must advance lineno because scan_word() was
+				# run in a subshell.
+				lineno=$((${lineno} + ${lineno_offset}))
+				wordexp="${wordexp}${word}"
+				echo "param mod word: '$word'" >&2
+			fi
+			# Check for right brace.
+			case "${c}" in
+				'}')
+					wordexp="${wordexp}${c}"
+					pgetc
+					;;
+				*)
+					synerr 'Missing "}"'
+					;;
+			esac
+			;;
+		'(')  # "$(" is not handled yet; only the literal '$' is kept
+			;;
+		[@*#?\$!A-Za-z0-9_-])  # '$' is escaped so "$!" is not expanded
+			next_param
+			wordexp="\$${param}"
+			;;
+	esac
+
+	printf "%d${RS}%s${RS}%s" ${lineno_offset} "${c}" "${wordexp}"
+}
+
+next_param()  # Scan a parameter name starting at c into the global param.
+{
+	param=''  # stays empty when c cannot start a parameter
+	case "${c}" in
+		[@*#?\$!0-])  # '$' is escaped so "$!" is not expanded in the pattern
+			# Special parameter
+			param="${c}"
+			pgetc
+			;;
+		[1-9])
+			# Positional parameter
+			param="${param}${c}"
+			pgetc
+			while :; do
+				case "${c}" in [!0-9])
+					break
+					;;
+				esac
+				param="${param}${c}"
+				pgetc
+			done
+			;;
+		[A-Za-z_])
+			# Parameter name
+			param="${param}${c}"
+			pgetc
+			while :; do
+				case "${c}" in [!A-Za-z0-9_])
+					break
+					;;
+				esac
+				param="${param}${c}"
+				pgetc
+			done
+			;;
+	esac
+}
+
+# Take the current token if its type is ${1}: append it (text included) to
+# tokens, scan the next token and return 0.  Otherwise return 1 untouched.
+accept()
+{
+	local want="${1}"
+
+	# Compare the type only; a T_WORD's text follows the first ${US}.
+	[ "x${tok%%${US}*}" = "x${want}" ] || return 1
+	echo "accept $want" >&2
+	tokens="${tokens}${tok}${RS}"
+	next
+	return 0
+}
+
+error()  # Print a printf-style message on stderr, prefixed "<input>:<line>: ".
+{
+	local fmt="${1}"
+	local head='stdin'
+	shift 1
+
+	# An input name of '-' is shown as the fixed text "stdin"; any other
+	# name is passed to printf as a %s argument ahead of the line number.
+	case "${fname}" in
+		'-') set -- ${lineno} "${@}";;
+		*) head='%s'; set -- "${fname}" ${lineno} "${@}";;
+	esac
+	printf "${head}:%d: ${fmt}\n" "${@}" >&2
+}
+
+init_lexer()  # Reset the lexer for the input named ${1} and scan the first token.
+{
+	# Reset the global lexer state.
+	fname="${1}"
+	shift 1
+	lineno=1
+	tokens=''
+	# Prime the lookahead character, then the lookahead token.
+	pgetc
+	next
+}
+
+get_tokens()  # Write the accepted tokens (each followed by ${RS}) to stdout.
+{
+	printf '%s' "${tokens}"
+	return 0  # report success regardless of printf's status
+}
+
+synexp()  # Report the current token as unexpected; ${1}, if non-empty, is the expected token.
+{
+	local want="${1}"
+	shift 1
+	local got=
+	got="$(tokname "${tok}")"
+	case "${want}" in
+		'') synerr '%s unexpected' "${got}";;
+		*) synerr '%s unexpected (expecting %s)' "${got}" \
+			"$(tokname "${want}")";;
+	esac
+}
+
+synerr()  # Report a syntax error; ${1} is a printf format, the rest its arguments.
+{
+	local msg="Syntax error: ${1}"
+	shift 1
+
+	error "${msg}" "${@}"
+}
diff --git a/parsing/parse.sh b/parsing/parse.sh
new file mode 100644
index 0000000..2ac4215
--- /dev/null
+++ b/parsing/parse.sh
@@ -0,0 +1,133 @@
+HT="$(printf '\t.')";   HT="${HT%.}"  # horizontal tab
+LF="$(printf '\n.')";   LF="${LF%.}"  # line feed; the '.' guard survives $() newline stripping
+RS="$(printf '\036.')"; RS="${RS%.}"  # octal 036: ends each token and separates scanner reply fields
+US="$(printf '\037.')"; US="${US%.}"  # octal 037: separates a token's type from its text
+
+. ./tokens.sh
+. ./lexer.sh
+
+complete_command()
+{
+	# complete_command : list [ separator ]
+	# The separator is optional, so its status is deliberately ignored.
+	list || return 1
+	separator
+	return 0
+}
+
+list()
+{
+	# Parse AND-OR lists joined by '&' or ';'.
+	# list : and_or ( separator_op and_or )*
+	# NOTE(review): a separator_op with no and_or after it (e.g. a trailing
+	# ';') fails here after the separator has already been consumed.
+	and_or || return 1
+	while separator_op; do
+		and_or || return 1
+	done
+	return 0
+}
+and_or()
+{
+	# Parse pipelines joined by '&&' or '||'.
+	# and_or : pipeline ( ( '&&' | '||' ) linebreak pipeline )*
+	pipeline || return 1
+	while accept T_AND_IF || accept T_OR_IF; do
+		# pipeline is only attempted when linebreak succeeded; a failure
+		# of either one fails the whole and_or.
+		{ linebreak && pipeline; } || return 1
+	done
+	return 0
+}
+pipeline()
+{
+	# pipeline : [ '!' ] pipe_sequence -- the '!' is optional, so the status
+	# of accept is ignored.  NOTE(review): next() never sets T_BANG so far.
+	accept T_BANG
+	pipe_sequence || return 1
+	return 0
+}
+
+pipe_sequence()
+{
+	# Parse one or more commands joined by '|'.
+	# pipe_sequence : command ( '|' linebreak command )*
+	command || return 1
+	while accept T_PIPE; do
+		# command is only attempted when linebreak succeeded; a failure
+		# of either one fails the whole pipe_sequence.
+		{ linebreak && command; } || return 1
+	done
+	return 0
+}
+
+command()  # Parse a command; for now that is a single T_WORD.
+{
+	# XXX: Unfinished
+	accept T_WORD  # NOTE(review): this function shadows the "command" utility
+}
+
+
+
+newline_list()
+{
+	# Parse one or more newline tokens; fail if there is not even one.
+	# newline_list : NEWLINE+
+	accept T_NEWLINE || return 1
+	while accept T_NEWLINE; do
+		:
+	done
+	return 0
+}
+linebreak()  # Skip an optional run of newline tokens; always succeeds.
+{
+	newline_list  # status ignored: having no newline here is fine
+	return 0
+}
+
+separator_op()
+{
+	# separator_op : '&' | ';'
+	accept T_AND && return 0
+	accept T_SEMI && return 0
+	return 1
+}
+
+separator()
+{
+	# separator : separator_op linebreak | newline_list
+	if separator_op; then
+		linebreak && return 0
+	fi
+	newline_list || return 1
+	return 0
+}
+
+parse()  # Parse stdin (labelled ${1} in errors); print the tokens, or fail with a syntax error.
+{
+	local fn="${1}"
+	shift 1
+
+	init_lexer "${fn}"
+	while complete_command; do :; done
+	if [ "x${tok%%${US}*}" != 'xT_EOF' ]; then  # a token was left unparsed
+		synexp ''
+		return 1
+	fi
+	get_tokens
+}
+
+if tokens="$(printf '%s\n' '"foo bar" && $baz || qux' '${quux%uux } quuux' | \
+		parse -)"; then
+	IFS="${RS}"; set -f  # split on RS only and keep glob characters literal
+	for t in ${tokens}; do
+		printf 'Token: %s\n' "$(tokname "${t}")"
+		case "${t%%${US}*}" in T_WORD)  # words also print their text
+			printf '       "%s"\n' "${t#T_WORD${US}}"
+			;;
+		esac
+	done
+	unset IFS; set +f
+else  # parse returned non-zero
+	echo FAIL
+fi
diff --git a/parsing/tokens.sh b/parsing/tokens.sh
new file mode 100644
index 0000000..37d741b
--- /dev/null
+++ b/parsing/tokens.sh
@@ -0,0 +1,55 @@
+tokname()  # Print a human-readable name for the token ${1}.
+{
+	local t="${1}"
+	shift 1
+	local n=
+
+	case "${t%%${US}*}" in  # type only: drop everything from the first ${US}
+		# Operators
+		T_EOF)       n='end of file';;
+		T_NEWLINE)   n='newline';;
+		T_AND)       n='&';;
+		T_SEMI)      n=';';;
+		T_AND_IF)    n='&&';;
+		T_OR_IF)     n='||';;
+		T_DSEMI)     n=';;';;
+		T_LESS)      n='<';;
+		T_GREAT)     n='>';;
+		T_DLESS)     n='<<';;
+		T_DGREAT)    n='>>';;
+		T_LESSAND)   n='<&';;
+		T_GREATAND)  n='>&';;
+		T_LESSGREAT) n='<>';;
+		T_DLESSDASH) n='<<-';;
+		T_CLOBBER)   n='>|';;
+		T_PIPE)      n='|';;
+		T_LPAREN)    n='(';;
+		T_RPAREN)    n=')';;
+		# Reserved words
+		T_IF)     n='if';;
+		T_THEN)   n='then';;
+		T_ELSE)   n='else';;
+		T_ELIF)   n='elif';;
+		T_FI)     n='fi';;
+		T_DO)     n='do';;
+		T_DONE)   n='done';;
+		T_CASE)   n='case';;
+		T_ESAC)   n='esac';;
+		T_WHILE)  n='while';;
+		T_UNTIL)  n='until';;
+		T_FOR)    n='for';;
+		T_LBRACE) n='{';;
+		T_RBRACE) n='}';;
+		T_BANG)   n='!';;
+		T_IN)     n='in';;
+		# Special symbols
+		T_NAME)            n='name';;
+		T_IO_NUMBER)       n='I/O number';;
+		T_WORD)            n='word';;
+		T_ASSIGNMENT_WORD) n='assignment word';;
+		# Unknown
+		*) n='unknown token';;
+	esac
+
+	printf '%s' "${n}"  # no trailing newline
+}
--
cgit v0.9.1