From 8cd31b6c6cfff683a7a47d925a74c8b9939c5e29 Mon Sep 17 00:00:00 2001
From: P. J. McDermott <pj@pehjota.net>
Date: Sun, 21 Feb 2016 01:46:26 -0500
Subject: Implement here-documents

Also fix some I/O redirection operator bugs.
---
diff --git a/parsing/lexer.sh b/parsing/lexer.sh
index ce39c23..c88c8b6 100644
--- a/parsing/lexer.sh
+++ b/parsing/lexer.sh
@@ -4,6 +4,9 @@ ln_off=
 start=
 c=
 wordexp=
+here_stack=		# RS-separated records of pending here-documents
+here_awaiting_end=	# true after << or <<- until its delimiter is read
+here_awaiting_word=	# true from the next newline until next_here runs
 tok=
 tokens=
 
@@ -67,6 +70,12 @@ lgetc()
 
 next()
 {
+	case "${here_stack}" in *"${RS}"*)
+		if ${here_awaiting_word}; then
+			next_here
+			return
+		fi
+	esac
 	while :; do
 		dbg "parsing char '$c' at lineno $lineno"
 		case "${c}" in
@@ -76,6 +85,15 @@ next()
 				return
 				;;
 			"${LF}")
+				case "${here_stack}" in *"${RS}"*)
+					if ${here_awaiting_end}; then
+						synexp ''
+					else
+						here_awaiting_end=false
+						here_awaiting_word=true
+					fi
+					;;
+				esac
 				lgetc
 				lineno=$((${lineno} + 1))
 				tok=T_NEWLINE
@@ -164,6 +182,89 @@ next()
 	done
 }
 
+# Read one here-document body, starting at the current character ${c}, and
+# return it in ${tok} as a T_WORD token.  The record describing the document
+# is popped from here_stack; the delimiter line itself is consumed, not kept.
+next_here()
+{
+	local here=		# record popped from here_stack
+	local here_strip_tabs=	# "true" for <<-, "false" for <<
+	local here_end=		# delimiter word that ends the body
+	local here_escaped=	# "true" if the delimiter contained quoting
+	local line=		# line currently being accumulated
+	local word=		# body accumulated so far
+	local res=		# RS-separated result of scan_wordexp
+	local wordexp=		# scanned expansion (shadows the global)
+
+	# Pop the last record: strip_tabs, US, delimiter, US, escaped.
+	here="${here_stack##*${RS}}"
+	here_strip_tabs="${here%%${US}*}"
+	here_end="${here%${US}*}"
+	here_end="${here_end#*${US}}"
+	here_escaped="${here##*${US}}"
+	here_stack="${here_stack%${RS}*}"
+	here_awaiting_word=false
+
+	while :; do
+		case "${c}" in
+			'')
+				# On EOF bash warns and mksh errors, while dash,
+				# BusyBox ash, ksh93, and zsh end the document.
+				# Be strict and raise an error like mksh does.
+				synerr 'Here-document "%s" unclosed' \
+					"${here_end}"
+				;;
+			"${LF}")
+				line="${line}${c}"
+				case "${line}" in "${here_end}${LF}")
+					# The delimiter line is not body text.
+					lgetc
+					tok="T_WORD${US}${word}"
+					return
+					;;
+				esac
+				word="${word}${line}"
+				line=''
+				;;
+			"${HT}")
+				if ${here_strip_tabs}; then
+					case "${line}" in
+						'')	# drop leading tab
+							;;
+						*)
+							line="${line}${c}"
+							;;
+					esac
+				else
+					line="${line}${c}"
+				fi
+				;;
+			'$')
+				if ! ${here_escaped}; then
+					lgetc
+					if ! res="$(scan_wordexp)"; then
+						exit 1
+					fi
+					ln_off=${res%%${RS}*}
+					res="${res#*${RS}}"
+					c="${res%%${RS}*}"
+					res="${res#*${RS}}"
+					wordexp="${res%%${RS}*}"
+					lineno=$((${lineno} + ${ln_off}))
+					line="${line}${wordexp}"
+					continue	# ${c} is already set
+				else
+					line="${line}${c}"
+				fi
+				;;
+			*)
+				line="${line}${c}"
+				;;
+		esac
+		lgetc
+	done
+}
+
 next_io()
 {
 	case "${c}" in
@@ -175,20 +276,32 @@ next_io()
 					case "${c}" in '-')
 						lgetc
 						tok=T_DLESSDASH
+						here_stack="${here_stack}${RS}"
+						here_stack="${here_stack}true"
+						here_awaiting_end=true
+						here_awaiting_word=false
+						return	# skip T_DLESS below
 						;;
 					esac
 					tok=T_DLESS
+					here_stack="${here_stack}${RS}false"
+					here_awaiting_end=true
+					here_awaiting_word=false
+					return	# skip T_LESS below
 					;;
 				'&')
 					lgetc
 					tok=T_LESSAND
+					return	# skip T_LESS below
 					;;
 				'>')
 					lgetc
 					tok=T_LESSGREAT
+					return	# skip T_LESS below
 					;;
 			esac
 			tok=T_LESS
+			return
 			;;
 		'>')
 			lgetc
@@ -196,17 +309,21 @@ next_io()
 				'>')
 					lgetc
 					tok=T_DGREAT
+					return	# skip T_GREAT below
 					;;
 				'&')
 					lgetc
 					tok=T_GREATAND
+					return	# skip T_GREAT below
 					;;
 				'|')
 					lgetc
 					tok=T_CLOBBER
+					return	# skip T_GREAT below
 					;;
 			esac
 			tok=T_GREAT
+			return
 			;;
 	esac
 }
@@ -228,6 +345,22 @@ next_word()
 	# We must advance lineno because scan_word() was run in a subshell.
 	lineno=$((${lineno} + ${ln_off}))
 	tok="T_WORD${US}${word}"
+
+	case "${here_stack}" in *"${RS}"*)
+		if ${here_awaiting_end}; then
+			here_stack="${here_stack}${US}${word}"
+			case "${word}" in
+				*\\*|*'"'*|*"'"*)
+					here_stack="${here_stack}${US}true"
+					;;
+				*)
+					here_stack="${here_stack}${US}false"
+					;;
+			esac
+			here_awaiting_end=false
+		fi
+		;;
+	esac
 }
 
 #
@@ -267,6 +400,15 @@ scan_word()
 				word="${word}${c}"
 				;;
 			'$')
+				case "${here_stack}" in *"${RS}"*)
+					if ${here_awaiting_end}; then
+						synerr '%s %s %s %s' \
+							'Word expansions' \
+							'not supported in' \
+							'here-document' \
+							'delimiters'
+					fi
+				esac
 				lgetc
 				if ! res=$(scan_wordexp); then
 					exit 1
@@ -737,6 +879,9 @@ run_sublexer()
 	fname="${fn}"
 	lineno=${ln}
 	start="${st}"
+	here_stack="${US}"
+	here_awaiting_end=false
+	here_awaiting_word=false
 	tokens=''
 
 	c="${ch}"
@@ -767,6 +912,9 @@ run_lexer()
 	fname="${fn}"
 	lineno=1
 	start="${st}"
+	here_stack="${US}"
+	here_awaiting_end=false
+	here_awaiting_word=false
 	tokens=''
 
 	# Read the first character and recognize the first token.
diff --git a/parsing/parse.sh b/parsing/parse.sh
index 6508142..81b8a54 100644
--- a/parsing/parse.sh
+++ b/parsing/parse.sh
@@ -649,7 +649,11 @@ try()
 #try 'foo $(bar) baz'
 #try 'foo$(bar$(baz))qux'
 #try 'foo $((1 + 1))'
-try '$((1 + 1))'
-try '$((1 + (1 + 1)))'
-try '$((1 + $(foo) + 1))'
-try '$((1'
+#try '$((1 + 1))'
+#try '$((1 + (1 + 1)))'
+#try '$((1 + $(foo) + 1))'
+#try '$((1'
+try 'foo <<EOF' 'bar' 'EOF'
+try 'foo <<-EOF' "${HT}bar" "${HT}EOF"
+try 'foo <<EOF' '$(bar)' 'EOF'
+try 'foo <<E"O"F' '$(bar)' 'E"O"F'  # BUG: delimiter is not quote-removed
--
cgit v0.9.1