From 0a72d231a7d42396e3551f38eb63d55fea100669 Mon Sep 17 00:00:00 2001
From: P. J. McDermott <pj@pehjota.net>
Date: Sat, 20 Feb 2016 18:04:48 -0500
Subject: Rewrite codegen

The previous version didn't handle nested or multiple command
substitutions.

The stack format has also been extended to support arithmetic
expansions.
---
diff --git a/parsing/codegen.sh b/parsing/codegen.sh
index c28ffca..0e32987 100644
--- a/parsing/codegen.sh
+++ b/parsing/codegen.sh
@@ -1,4 +1,10 @@
-toktext=''
+sc=
+
+sgetc()  # Read one byte from stdin into the global sc.
+{
+	sc="$(dd bs=1 count=1 2>/dev/null; printf '.')"  # '.' guards newlines
+	sc="${sc%.}"  # Strip the guard; sc is '' if dd read nothing.
+}
 
 toktext()
 {
@@ -57,29 +63,80 @@ toktext()
 		*) n='';;
 	esac
 
-	toktext="${n}"
+	printf '%s' "${n}"
 }
 
-codegen()
+codegen_sub()  # Emit each RS-separated token of ${1} as text plus a space.
 {
-	local toks="${1}"
+	local array="${1}" t=  # t: loop variable, kept out of global scope
 	shift 1
-	local subtoks=
-	local t=
-
-	case "${toks}" in
-		*"${STX}"*"${ETX}"*)
-			subtoks="${toks#*${STX}}"
-			subtoks="${subtoks%${ETX}*}"
-			toks="${toks%%${STX}*}$(codegen \
-				"${subtoks}")${toks##*${ETX}}"
-			;;
-	esac
 
 	IFS="${RS}"
-	for t in ${toks}; do
+	for t in ${array}; do
 		toktext "${t}"
-		printf '%s ' "${toktext}"
+		printf ' '
 	done
 	unset IFS
 }
+
+# The token stack is encoded in a string in the following grammar:
+#     Terminal symbols:
+#         TOKEN
+#     Production rules:
+#         stack  = tokens { '<SOH>' type '<STX>' stack '<ETX>' [ tokens ] } ;
+#         tokens = TOKEN { '<RS>' TOKEN } ;
+#         type   = 'C' | 'A' ;
+# We need to recurse through this stack to get to all the tokens.
+# Each element in the stack (an array of tokens) gets run through the codegen to
+# become text that is inserted into the array below.
+parse_stack()  # Read an encoded stack from stdin; print its generated text.
+{
+	local array=
+
+	while :; do
+		sgetc
+		case "${sc}" in
+			'')
+				# EOF: sgetc read nothing
+				break
+				;;
+			"${SOH}")
+				# New stack element: type byte, STX, stack, ETX
+				sgetc
+				case "${sc}" in
+					'C')
+						# Command substitution
+						sgetc  # STX
+						array="${array}$(parse_stack)"
+						;;
+					'A')
+						# Arithmetic expansion, likewise
+						sgetc  # STX
+						array="${array}$(parse_stack)"
+						;;
+				esac
+				;;
+			"${ETX}")
+				# End of this stack element
+				break
+				;;
+			*)
+				# Token character (or RS between tokens)
+				array="${array}${sc}"
+				;;
+		esac
+	done
+	codegen_sub "${array}"  # Emit the text of the collected tokens.
+}
+
+codegen()
+{
+	# Feed the encoded token stack in ${1} to parse_stack on stdin; the
+	# generated text goes to stdout.  The pipeline's exit status is
+	# normalized to 0 (success) or 1 (failure).
+	local toks="${1}"
+	shift 1
+
+	printf '%s' "${toks}" | parse_stack || return 1
+	return 0
+}
diff --git a/parsing/lexer.sh b/parsing/lexer.sh
index 3f6f52f..a7fbb80 100644
--- a/parsing/lexer.sh
+++ b/parsing/lexer.sh
@@ -467,7 +467,8 @@ scan_wordexp()
 					res="${res#*${RS}}"
 					toks="${res%%${RS}*}"
 					lineno=${ln_off}
-					wordexp="\$(${STX}${toks}${ETX})"
+					wordexp="\$(${SOH}C${STX}${toks}"
+					wordexp="${wordexp}${ETX})"
 					# ")" is recognized in run_sublexer().
 					;;
 			esac
diff --git a/parsing/parse.sh b/parsing/parse.sh
index 196ad84..a71f90b 100644
--- a/parsing/parse.sh
+++ b/parsing/parse.sh
@@ -1,3 +1,4 @@
+SOH="$(printf '\001.')"; SOH="${SOH%.}"
 STX="$(printf '\002.')"; STX="${STX%.}"
 ETX="$(printf '\003.')"; ETX="${ETX%.}"
  HT="$(printf '\t.')";    HT="${HT%.}"
@@ -646,4 +647,5 @@ try 'foo $(bar)'
 try 'foo $(bar); baz'
 try 'foo $(bar)' 'baz'
 try 'foo $(bar) baz'
+try 'foo$(bar$(baz))qux'
 #try 'foo $((1 + 1))'
--
cgit v0.9.1