#!/bin/bash
#

# Generates ${dest}, a C source fragment defining decode_entity(): a switch
# over the LCG hash of every HTML 4.0 character entity name, returning the
# entity's numeric code.
#
# Input:   ${src} from the current directory when it exists, otherwise the
#          document is downloaded from ${url}.
# Output:  ${dest}, (re)written only when at least one entity was generated.
# Status:  0 on success; non-zero (with a message on stderr) when the entity
#          list could not be obtained or contained no entity definition.

# -e is deliberately not used: every failure is checked explicitly and
# reported through the return status of main.
set -uo pipefail

src=html40.txt
url=http://www.w3.org/TR/1998/REC-html40-19980424/html40.txt
dest=htsentities.h

#######################################
# Write the HTML 4.0 specification text to stdout.
# Uses the local copy when present, otherwise the first available of
# GET (libwww-perl), curl or wget.
# Globals:  src (read), url (read)
# Returns:  0 on success, non-zero when reading/downloading failed or no
#           download tool is installed.
#######################################
fetch_entities() {
    if [[ -f "${src}" ]]; then
        cat -- "${src}"
    elif command -v GET >/dev/null 2>&1; then
        GET "${url}"
    elif command -v curl >/dev/null 2>&1; then
        curl -fsSL --connect-timeout 30 "${url}"
    elif command -v wget >/dev/null 2>&1; then
        wget -q -T 30 -t 2 -O - "${url}"
    else
        printf '%s: %s not found and none of GET, curl, wget is available\n' \
            "$0" "${src}" >&2
        return 1
    fi
}

#######################################
# Filter stdin down to the <!ENTITY name CDATA "&#code;" -- description -->
# declarations and rewrite each as "name code description".
# Only the first physical line of a declaration is kept.
# Returns:  non-zero when no declaration was found (grep status).
#######################################
normalize_entities() {
    grep -E '^<!ENTITY [a-zA-Z0-9_]' \
        | sed \
            -e 's/<!ENTITY //' \
            -e 's/[[:space:]][[:space:]]*/ /g' \
            -e 's/-->$//' \
            -e 's/\([^ ]*\) CDATA "&#\([^\"]*\);" -- \(.*\)/\1 \2 \3/'
}

#######################################
# Print the 32-bit LCG hash of a string:
#   hash = ((hash * a) mod m + char + c) mod m   for each character.
# Arguments: $1 - entity name (expected to be plain ASCII)
# Outputs:   decimal hash on stdout (0 for the empty string)
#######################################
entity_hash() {
    local name=$1
    local -r a=1664525 c=1013904223 m=$(( 1 << 32 ))
    local hash=0 i d
    for (( i = 0; i < ${#name}; i++ )); do
        # A leading quote makes printf yield the character's numeric value,
        # avoiding one echo|hexdump fork per character.
        printf -v d '%d' "'${name:i:1}"
        hash=$(( ((hash * a) % m + d + c) % m ))
    done
    printf '%s\n' "${hash}"
}

#######################################
# Read "name code description" lines from stdin and print one C `case`
# block per line.
# Outputs:  C source on stdout
#######################################
emit_cases() {
    local line ent rest code hash
    # read -r keeps backslashes intact; the default IFS still trims the
    # trailing blank left behind by the removal of "-->".
    while read -r line || [[ -n "${line}" ]]; do
        [[ -n "${line}" ]] || continue
        ent=${line%% *}
        # Second space-separated field (whole line when there is no space,
        # like `cut -f2 -d' '`).
        rest=${line#* }
        code=${rest%% *}
        hash=$(entity_hash "${ent}")
        printf '    /* %s */\n' "${line}"
        printf '  case %s:\n' "${hash}"
        printf '    if (len == %d /* && strncmp(ent, "%s") == 0 */) {\n' \
            "${#ent}" "${ent}"
        printf '      return %s;\n' "${code}"
        printf '    }\n'
        printf '    break;\n'
    done
}

#######################################
# Print the generated file's leading comment and the opening of
# decode_entity().
# Globals:  dest (read)
#######################################
write_header() {
    cat <<EOF
/*
  -- ${dest} --
  FILE GENERATED BY $0, DO NOT MODIFY

  We compute the LCG hash
  (see <http://en.wikipedia.org/wiki/Linear_congruential_generator>)
  for each entity. We should in theory check using strncmp() that we
  actually have the correct entity, but this is actually statistically
  not needed.

  We may want to do better, but we expect the hash function to be uniform, and
  let the compiler be smart enough to optimize the switch (for example by
  checking in log2() intervals)
  
  This code has been generated using the evil $0 script.
*/

static int decode_entity(const unsigned int hash, const size_t len) {
  switch(hash) {
EOF
}

#######################################
# Print the closing of decode_entity(), including the "unknown" fallback.
#######################################
write_footer() {
    cat <<EOF
  }
  /* unknown */
  return -1;
}
EOF
}

#######################################
# Build the case list first, and only overwrite ${dest} once it is known to
# be non-empty, so a failed download never clobbers a good header.
# Globals:  dest (read)
# Returns:  0 on success, 1 when no entity could be generated.
#######################################
main() {
    local cases
    if ! cases=$(fetch_entities | normalize_entities | emit_cases) \
        || [[ -z "${cases}" ]]; then
        printf '%s: could not generate any entity; %s left untouched\n' \
            "$0" "${dest}" >&2
        return 1
    fi
    {
        write_header
        printf '%s\n' "${cases}"
        write_footer
    } > "${dest}"
}

main
