From 53c48293168024ce8d62158abbd5684788a1b08f Mon Sep 17 00:00:00 2001 From: P. J. McDermott Date: Sat, 11 Feb 2017 03:19:31 -0500 Subject: research/arrays-and-hashes.txt: New file --- diff --git a/research/arrays-and-hashes.txt b/research/arrays-and-hashes.txt new file mode 100644 index 0000000..eac18cd --- /dev/null +++ b/research/arrays-and-hashes.txt @@ -0,0 +1,465 @@ +Arrays +====== + +Arrays are simple lists of values. Arrays are zero-indexed, but internally +there is an offset that is affected by `shift` and `unshift` calls. + +Array keys are non-negative integers. + +An array has a length limited only by the highest non-negative integer in the +shell's built-in arithmetic. An array is grown by calls to `push` and `unshift` +as well as by assigning values with out-of-bounds keys. An array is shrunk by +calls to `pop` and `shift` as well as by unsetting the 0th and last values. + +Array Expansion +--------------- + +Array expansion initializes an array. The format for array expansion is as +follows: + + @(values) + +where *values* is zero or more *word*s. A reference to the new array is +substituted. + +eshtrans will translate array expansions into calls to `__a_new` (part of +libeshtrans) with array references returned in registers allocated at compile +time. + +For example: + + cardinals=@("zero" "one" "two") + +compiles to: + + __a_new __r0 "zero" "one" "two" + cardinals=${__r0} + +while: + + cmd @("zero" "one" "two") + +compiles to: + + __a_new __r0 "zero" "one" "two" + cmd ${__r0} + +and: + + cardinals=@(@("zero" "cero") @("one" "uno") @("two" "dos")) + +compiles to: + + __a_new __r0 "zero" "cero" + __a_new __r1 "one" "uno" + __a_new __r2 "two" "dos" + __a_new __r3 ${__r0} ${__r1} ${__r2} + cardinals=${__r3} + +Quoting the `@(` operator removes its special meaning. 
+ +Array Value Assignment +---------------------- + +The format for array value assignment is as follows: + + name[key]=word + +eshtrans will translate array value assignments into calls to `__a_set` (part of +libeshtrans). + +For example: + + cardinals[0]="zero" + +compiles to: + + __a_set cardinals 0 "zero" + +Array Value Expansion +--------------------- + +The format for array value expansion is as follows: + + ${name[key]} + +eshtrans will translate array value expansions into calls to `__a_get` (part of +libeshtrans). + +For example: + + zero=${cardinals[0]} + +compiles to: + + __a_get __r0 cardinals 0 + zero=${__r0} + +`unset` +------- + +`unset` is a reserved word and a function that unsets variables, functions, +array values, and arrays. + +The format for unsetting an array value is as follows: + + unset name[key] + +The format for unsetting an array is as follows: + + unset name + +eshtrans will check the types of all of the operands of `unset` commands and +translate them into calls to `unset` (the shell's built-in utility), +`__a_unset`, or `__a_delete` as appropriate. + +For example: + + unset cardinals[1] cardinals[2] + +compiles to: + + __a_unset cardinals 1 + __a_unset cardinals 2 + +and: + + unset cardinals + +compiles to: + + __a_delete cardinals + +libeshtrans Functions +--------------------- + + # __a_new(__r) + # Parameters: + # * __r: Register in which to store array reference + __a_new() + { + __r=${1} + shift 1 + + # Allocate a slot on the array heap. + __malloc __a + __a=__mr + + # Set values. + __k=0 + for __v in "${@}"; do + eval "${__a}__${__k}=\${__v}" + __k=$((${__k} + 1)) + done + + # Set length. + eval "${__a}_l=\${__k}" + + # Store the array reference in the specified return register. + eval "${__r}=\${__a} + } + + # __a_set(__a __k __v) + # Parameters: + # * __a: Array + # * __k: Key + # * __v: Value + __a_set() + { + __a=${1} + __k=${2} + __v=${3} + + # Validate key. 
+ case ${__k} in *[!0-9]*) + printf 'Error: Illegal array key: %s\n' "${__k}" 1>&2 + exit 1 + ;; esac + + # Enlarge array if key is out-of-bounds. + eval "__l=\${${__a}_l}" + if [ ${__k} -ge ${__l} ]; then + eval "${__a}_l=\$((\${__k} - 1))" + fi + + # Set value. + eval "${__a}__${__k}=\${__v}" + } + + # __a_get(__r __a __k) + # Parameters: + # * __r: Register in which to store value + # * __a: Array + # * __k: Key + __a_get() + { + __r=${1} + __a=${2} + __k=${3} + + # Validate key. + case ${__k} in *[!0-9]*) + printf 'Error: Illegal array key: %s\n' "${__k}" 1>&2 + exit 1 + ;; esac + + # Offset key. + eval "__k=\$((\${__k} + \${${__a}__o}))" + + # Store the value in the specified return register. + # NB: If `set -u` is on, the shell will throw an error on undefined values + # or out-of-bound keys, as one would expect. + eval "${__r}=\${${__a}__${__k}}" + } + + # __a_unset(__a __k) + # Parameters: + # * __a: Array + # * __k: Key + __a_unset() + { + __a=${1} + __k=${2} + + # ... + } + + # __a_delete(__a) + # Parameters: + # * __a: Array + __a_delete() + { + __a=${1} + + # ... + } + +libesh Functions +---------------- + + # length(a) + # Parameters: + # * a: Array + length(a) + { + l= + + eval "${&l}=\${${a}_l}" + + return "${l}" + } + + # push(a v) + # Parameters: + # * a: Array + # * v: Value + push(a v) + { + l= + k= + + # Get the current length. + eval "${&l}=\${${a}_l}" + # Offset the length to get the new key. + eval "${&k}=\$((${l} + \${${a}_o}))" + # Set the new value. + eval "${a}__${k}=\${${&v}}" + # Increment the length. + eval "${a}_l=\$((${l} + 1))" + } + + # pop(a) + # Parameters: + # * a: Array + pop(a) + { + # ... + } + + # __shift(a) + # Parameters: + # * a: Array + __shift(a) + { + # ... + } + + # unshift(a v) + # Parameters: + # * a: Array + # * v: Value + unshift(a v) + { + # ... 
+ } + + # split(fs str) + # Parameters: + # * fs: Field separator + # * str: String to split + split(fs str) + { + # See split.sh + } + + # join(str a) + # Parameters: + # * str: Separator + # * a: Array to join + # str will be duplicated around unset values in the middle of the array. + join() + { + l= + joined= + + # Get the current length. + eval "${&l}=\${${a}_l}" + + # Join all the array values. + joined="$(eval "printf '%s' \"$(printf "\\\${${a}__%d}\${str}" \ + $(awk -v end=${l} \ + 'BEGIN { for (i = 0; i < end; ++i) print(i); }'))\"")" + + return "${joined%${str}}" + } + +Hashes +====== + +Hashes are sets of key/value pairs. + +Hash keys may only consist of Latin letters, digits, and underscores. + +Each hash has a list of keys, which is returned by the `keys` function. + +Hash Expansion +-------------- + +Hash expansion initializes a hash. The format for hash expansion is as follows: + + %(values) + +where *values* is zero or more *word*s of the following format: + + key=value + +A reference to the new hash is substituted. + +eshtrans will translate hash expansions into calls to `__h_new` (part of +libeshtrans) with hash references returned in registers allocated at compile +time. + +For example: + + numerals=%(zero=0 one=1 two=2) + +compiles to: + + __h_new __r0 zero 0 one 1 two 2 + numerals=${__r0} + +while: + + cmd %(zero=0 one=1 two=2) + +compiles to: + + __h_new __r0 zero 0 one 1 two 2 + cmd ${__r0} + +and: + + numerals=%(\ + zero=%(arabic=0 roman=N) \ + one=%(arabic=1 roman=i) \ + two=%(arabic=2 roman=ii) \ + ) + +compiles to: + + __h_new __r0 arabic 0 roman N + __h_new __r1 arabic 1 roman i + __h_new __r2 arabic 2 roman ii + __h_new __r3 zero ${__r0} one ${__r1} two ${__r2} + numerals=${__r3} + +Quoting the `%(` operator removes its special meaning. 
+ +Hash Value Assignment +--------------------- + +The format for hash value assignment is as follows: + + name{key}=word + +eshtrans will translate hash value assignments into calls to `__h_set` (part of +libeshtrans). + +For example: + + numerals{zero}=0 + +compiles to: + + __h_set numerals zero 0 + +Hash Value Expansion +-------------------- + +The format for hash value expansion is as follows: + + ${name{key}} + +eshtrans will translate hash value expansions into calls to `__h_get` (part of +libeshtrans). + +For example: + + zero=${numerals{zero}} + +compiles to: + + __h_get __r0 numerals zero + zero=${__r0} + +`ENV` Hash +---------- + +Environment variables can be accessed through the `ENV` hash. This circumvents +identifier mangling. + +Environment variables can be expanded as in the following example: + + IFS=: + for dir in ${ENV{PATH}}; do + : ${dir:=.} + if [ -x "${dir}/${cmd}" ]; then + printf '%s\n' "${dir}/${cmd}" + break + fi + done + +Environment variables can be set as in the following example: + + ENV{LC_ALL}=C + +TODO +==== + + * Array function `each` (maintains an iterator) + * Hash value functions: `length`, `keys`, `values`, `each` + - `keys` `join`s the keys array with `${US}`, then returns the result? + + Maybe sets `IFS="${US}"`, uses the shell's field splitting to remove + `${US}`, and returns a string (bad: space-delimited string of keys that + may contain spaces) + + Or just return the array? + - `each` iterates over the keys array (skipping unset keys) and returns a + key and value, delimited somehow (or, a new array?) + * Unsetting hash values and hashes + - Will need to unset keys array value + - Maintain a key ID (or key key) for each hash key (i.e. the key in the keys + array) + - Keys array could become large and sparse with lots of unsetting and + setting; should be heap instead? + * As above, consider building the hash structure on a heap built on an array + * Reference type checking in functions -- cgit v0.9.1