From 58fded07a19e6100e307bba9de8e72854b57f1c4 Mon Sep 17 00:00:00 2001
From: P. J. McDermott <pj@pehjota.net>
Date: Sun, 28 Feb 2016 01:47:19 -0500
Subject: Merge branch 'feature/no-dd-lexer'

---
diff --git a/eshtrans/frontend/lexer.esh b/eshtrans/frontend/lexer.esh
index a9aaa6d..8bba0e0 100644
--- a/eshtrans/frontend/lexer.esh
+++ b/eshtrans/frontend/lexer.esh
@@ -18,7 +18,7 @@
 # along with the Eggshell Compiler.  If not, see
 # <http://www.gnu.org/licenses/>.
 
-#dbg=false
+dbg=false
 
 fname=
 lineno=
@@ -31,12 +31,12 @@ here_awaiting_end=
 here_awaiting_word=
 tok=
 
-#dbg()
-#{
-#	if ${dbg}; then
-#		printf 'DEBUG: %s\n' "${@}" >&2
-#	fi
-#}
+dbg()
+{
+	if ${dbg}; then	# runs the value of ${dbg} as a command
+		printf 'DEBUG: %s\n' "${@}" >&2	# one stderr line per argument
+	fi
+}
 
 #
 # Error handling (used by scanning and interface functions)
@@ -88,8 +88,29 @@ synerr()
 
 lgetc()
 {
-	c="$(dd bs=1 count=1 2>/dev/null; printf '.')"
-	c="${c%.}"
+	# Load the character at index lbufi into c ('' at or past EOF) and
+	# always advance, so lbufi - 1 is the index of c even at EOF.
+	if [ ${lbufi} -ge ${lbufc} ]; then
+		c=''
+	else
+		eval "c=\${lbufv_${lbufi}}"
+		#echo "LGETC:$lineno: $lbufi '$c'" >&2
+	fi
+	lbufi=$((${lbufi} + 1))
+}
+
+# Step back to the character before the current one.
+lungetc()
+{
+	lbufi=$((${lbufi} - 2))
+	lgetc
+}
+
+# Reload c from lbufi, e.g. after a subshell scanner returned its lbufi.
+lsetc()
+{
+	lbufi=$((${lbufi} - 1))
+	lgetc
 }
 
 #
@@ -103,7 +124,7 @@ next()
 		return
 	fi
 	while :; do
-		#dbg "parsing char '$c' at lineno $lineno"
+		dbg "parsing char '$c' at lineno $lineno"
 		case "${c}" in
 			'')
 				lgetc
@@ -137,7 +158,8 @@ next()
 					continue
 					;;
 				esac
-				next_word \\
+				lungetc
+				next_word
 				return
 				;;
 			'#')
@@ -181,7 +203,7 @@ next()
 					return
 					;;
 				esac
-				#dbg T_SEMI
+				dbg T_SEMI
 				tok=T_SEMI
 				return
 				;;
@@ -200,7 +222,7 @@ next()
 				return
 				;;
 			*)
-				next_word ''
+				next_word
 				return
 				;;
 		esac
@@ -272,7 +294,8 @@ next_here()
 					fi
 					ln_off=${res%%${RS}*}
 					res="${res#*${RS}}"
-					c="${res%%${RS}*}"
+					lbufi="${res%%${RS}*}"
+					lsetc
 					res="${res#*${RS}}"
 					lineno=$((${lineno} + ${ln_off}))
 					line="${line}${res}"
@@ -353,16 +376,15 @@ next_io()
 
 next_word()
 {
-	local prev_c="${1}"
-	shift 1
 	local res=
 
-	if ! res="$(scan_word false "${prev_c}")"; then
+	if ! res="$(scan_word false)"; then
 		exit 1
 	fi
 	ln_off=${res%%${RS}*}
 	res="${res#*${RS}}"
-	c="${res%%${RS}*}"
+	lbufi="${res%%${RS}*}"
+	lsetc
 	res="${res#*${RS}}"
 
 	# We must advance lineno because scan_word() was run in a subshell.
@@ -391,8 +413,7 @@ next_word()
 scan_word()
 {
 	local in_param="${1}"
-	local prev_c="${2}"
-	shift 2
+	shift 1
 	local lines=
 	local word=
 	local quoted=
@@ -403,17 +424,8 @@ scan_word()
 	word=''
 	quoted=false
 
-	# Sort of a localized ungetc().
-	case "${prev_c}" in
-		'') ;;
-		*)
-			tmp_c="${c}"
-			c="${prev_c}"
-			;;
-	esac
-
 	while :; do
-		#dbg "parsing word char '$c' at lineno $lineno"
+		dbg "parsing word char '$c' at lineno $lineno"
 		case "${c}" in
 			'')
 				break
@@ -443,16 +455,14 @@ scan_word()
 					fi
 					;;
 				esac
-				case "${prev_c}" in
-					'') lgetc;;
-					*) c="${tmp_c}"; prev_c='';;
-				esac
-				if ! res=$(scan_wordexp); then
+				lgetc
+				if ! res="$(scan_wordexp)"; then
 					exit 1
 				fi
 				ln_off=${res%%${RS}*}
 				res="${res#*${RS}}"
-				c="${res%%${RS}*}"
+				lbufi="${res%%${RS}*}"
+				lsetc
 				res="${res#*${RS}}"
 				# We must advance lineno because scan_wordexp()
 				# was run in a subshell.
@@ -470,13 +480,10 @@ scan_word()
 				break
 				;;
 			\\)
-				#dbg 'first backslash in word'
+				dbg 'first backslash in word'
 				word="${word}${c}"
-				case "${prev_c}" in
-					'') lgetc;;
-					*) c="${tmp_c}"; prev_c='';;
-				esac
-				#dbg "next char: '$c'"
+				lgetc
+				dbg "next char: '$c'"
 				case "${c}" in '')
 					# Bash, ksh93, mksh, and zsh ignore a
 					# backslash at the end of a file, but
@@ -494,17 +501,11 @@ scan_word()
 			\')
 				word="${word}${c}"
 				if ${quoted}; then
-					case "${prev_c}" in
-						'') lgetc;;
-						*) c="${tmp_c}"; prev_c='';;
-					esac
+					lgetc
 					continue
 				fi
 				while :; do
-					case "${prev_c}" in
-						'') lgetc;;
-						*) c="${tmp_c}"; prev_c='';;
-					esac
+					lgetc
 					word="${word}${c}"
 					case "${c}" in
 						'')
@@ -540,17 +541,14 @@ scan_word()
 				word="${word}${c}"
 				;;
 		esac
-		case "${prev_c}" in
-			'') lgetc;;
-			*) c="${tmp_c}"; prev_c='';;
-		esac
+		lgetc
 	done
 
 	if ${quoted}; then
 		synerr 'Unterminated quoted string'
 	fi
 
-	printf "%d${RS}%c${RS}%s" ${lines} "${c}" "${word}"
+	printf "%d${RS}%d${RS}%s" ${lines} ${lbufi} "${word}"
 }
 
 scan_wordexp()
@@ -576,10 +574,11 @@ scan_wordexp()
 					# Command substitution
 					if ! res="$(run_sublexer "sub${fname}" \
 							${lineno} "${start}" \
-							"${c}")"; then
+							${lbufi})"; then
 						exit 1
 					fi
-					c="${res##*${RS}}"
+					lbufi="${res##*${RS}}"
+					lsetc
 					res="${res%${RS}*}"
 					ln_off=${res##*${RS}}
 					res="${res%${RS}*}"
@@ -597,14 +596,15 @@ scan_wordexp()
 			fi
 			ln_off=${res%%${RS}*}
 			res="${res#*${RS}}"
-			c="${res%%${RS}*}"
+			lbufi="${res%%${RS}*}"
+			lsetc
 			res="${res#*${RS}}"
 			lineno=$((${lineno} + ${ln_off}))
 			wordexp="\$${res}"
 			;;
 	esac
 
-	printf "%d${RS}%c${RS}%s" ${ln_off} "${c}" "${wordexp}"
+	printf "%d${RS}%d${RS}%s" ${ln_off} ${lbufi} "${wordexp}"
 	return 0
 }
 
@@ -628,7 +628,8 @@ scan_wordexp_param_brace()
 					fi
 					ln_off=${res%%${RS}*}
 					res="${res#*${RS}}"
-					c="${res%%${RS}*}"
+					lbufi="${res%%${RS}*}"
+					lsetc
 					res="${res#*${RS}}"
 					param="#${res}"
 					lineno=$((${lineno} + ${ln_off}))
@@ -647,7 +648,8 @@ scan_wordexp_param_brace()
 			fi
 			ln_off=${res%%${RS}*}
 			res="${res#*${RS}}"
-			c="${res%%${RS}*}"
+			lbufi="${res%%${RS}*}"
+			lsetc
 			res="${res#*${RS}}"
 			param="${res}"
 			lineno=$((${lineno} + ${ln_off}))
@@ -701,18 +703,19 @@ scan_wordexp_param_brace()
 	# If a modification was found
 	if ${mod}; then
 		# Get word.
-		if ! res="$(scan_word true '')"; then
+		if ! res="$(scan_word true)"; then
 			exit 1
 		fi
 		ln_off=${res%%${RS}*}
 		res="${res#*${RS}}"
-		c="${res%%${RS}*}"
+		lbufi="${res%%${RS}*}"
+		lsetc
 		res="${res#*${RS}}"
 		# We must advance lineno because scan_word() was run in a
 		# subshell.
 		lineno=$((${lineno} + ${ln_off}))
 		wordexp="${wordexp}${res}"
-		#dbg "param mod word: '$res'"
+		dbg "param mod word: '$res'"
 	fi
 
 	# Check for right brace.
@@ -771,7 +774,7 @@ scan_param()
 			;;
 	esac
 
-	printf "%d${RS}%c${RS}%s" 0 "${c}" "${param}"
+	printf "%d${RS}%d${RS}%s" 0 ${lbufi} "${param}"
 	return 0
 }
 
@@ -811,12 +814,13 @@ scan_wordexp_arith()
 				;;
 			'$')
 				lgetc
-				if ! res=$(scan_wordexp); then
+				if ! res="$(scan_wordexp)"; then
 					exit 1
 				fi
 				ln_off=${res%%${RS}*}
 				res="${res#*${RS}}"
-				c="${res%%${RS}*}"
+				lbufi="${res%%${RS}*}"
+				lsetc
 				res="${res#*${RS}}"
 				# We must advance lineno because scan_wordexp()
 				# was run in a subshell.
@@ -837,7 +841,7 @@ run_sublexer()
 	local fn="${1}"
 	local ln="${2}"
 	local st="${3}"
-	local ch="${4}"
+	local i="${4}"
 	shift 4
 
 	# Initialize global variables.
@@ -848,7 +852,8 @@ run_sublexer()
 	here_awaiting_end=false
 	here_awaiting_word=false
 
-	c="${ch}"
+	lbufi="${i}"
+	lsetc
 	next
 
 	#dbg=true
@@ -862,7 +867,7 @@ run_sublexer()
 			;;
 	esac
 
-	printf "${RS}%d${RS}%c" ${lineno} "${c}"
+	printf "${RS}%d${RS}%d" ${lineno} ${lbufi}
 	return 0
 }
 
@@ -873,8 +878,9 @@ run_sublexer()
 run_lexer()
 {
 	local fn="${1}"
-	local st="${2}"
-	shift 2
+	local buf="${2}"
+	local st="${3}"
+	shift 3
 
 	# Initialize global variables.
 	fname="${fn}"
@@ -884,6 +890,20 @@ run_lexer()
 	here_awaiting_end=false
 	here_awaiting_word=false
 
+	# Read file into array
+	eval "$(printf '%s' "${buf}" | awk -v FS='' -v j=0 \
+		-v squote="'" -v esc_squote="'\\\\''" '
+		{
+			for (i = 1; i <= NF; ++i) {
+				sub(squote, esc_squote, $i);
+				printf("lbufv_%d='\''%s'\''\n", j++, $i);
+			};
+			printf("lbufv_%d='\''\n'\''\n", j++);
+		}
+		')"
+	lbufi=0
+	lbufc=${#buf}
+
 	# Read the first character and recognize the first token.
 	lgetc
 	next
@@ -905,13 +925,13 @@ accept()
 	shift 1
 	local rw=
 
-	#dbg "looking for $t, current tok ${tok%%${US}*}"
+	dbg "looking for $t, current tok ${tok%%${US}*}"
 	case "${t}" in
 		T_IF|T_THEN|T_ELSE|T_ELIF|T_FI|T_DO|T_DONE|\
 		T_CASE|T_ESAC|T_WHILE|T_UNTIL|T_FOR|\
 		T_LBRACE|T_RBRACE|T_BANG|T_IN|\
 		T_STATIC|T_LOCAL|T_RETURN)
-			#dbg "looking for reserved word $t, have '$tok'"
+			dbg "looking for reserved word $t, have '$tok'"
 			if ! [ "x${tok%%${US}*}" = "x${t}" ]; then
 				# Reserved words are recognized as literal
 				# T_WORDs.
@@ -1012,7 +1032,7 @@ accept()
 			;;
 	esac
 
-	#dbg "accept $t"
+	dbg "accept $t"
 	printf '%s' "${tok}${RS}"
 	next
 	return 0
diff --git a/eshtrans/frontend/main.esh b/eshtrans/frontend/main.esh
index b9f93a6..fc25b5f 100644
--- a/eshtrans/frontend/main.esh
+++ b/eshtrans/frontend/main.esh
@@ -21,9 +21,10 @@
 esh_parse()
 {
 	local fn="${1}"
-	shift 1
+	local buf="${2}"
+	shift 2
 
-	if run_lexer "${fn}" complete_command; then
+	if run_lexer "${fn}" "${buf}" complete_command; then
 		return 0
 	fi
 	return 1
diff --git a/eshtrans/main.esh b/eshtrans/main.esh
index 7e0cb8c..ac3895e 100644
--- a/eshtrans/main.esh
+++ b/eshtrans/main.esh
@@ -88,9 +88,13 @@ main()
 		fi
 		input="${1}"
 		if [ "x${output}" = 'x-' ]; then
-			sh_codegen "$(esh_parse "${input}" <"${input}")"
+			contents="$(cat "${input}"; printf '.')"
+			contents="${contents%.}"
+			sh_codegen "$(esh_parse "${input}" "${contents}")"
 		else
-			sh_codegen "$(esh_parse "${input}" <"${input}")" \
+			contents="$(cat "${input}"; printf '.')"
+			contents="${contents%.}"
+			sh_codegen "$(esh_parse "${input}" "${contents}")" \
 				>"${output}"
 		fi
 	fi
--
cgit v0.9.1