Skip to content

Instantly share code, notes, and snippets.

@submachine
Created May 17, 2019 12:57
Show Gist options
  • Save submachine/52b25cd150d9ecb0c9262217b38dc90b to your computer and use it in GitHub Desktop.
Save submachine/52b25cd150d9ecb0c9262217b38dc90b to your computer and use it in GitHub Desktop.
Test iconv with every possible input combination of two bytes for every supported character set
#!/bin/bash
# bash (not plain sh) is required: the script uses arrays, `+=', `&>' and
# `echo -en'.  Pathname expansion is switched off with `set -f' rather than
# on the shebang line so that it still applies when run as `bash <script>'.
set -f
# Run iconv(1) tests with every possible input combination of two bytes.
# Copyright (C) 2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Contributed by Arjun Shankar <arjun@redhat.com>, 2019.
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
# The gconv-modules file is supplied along with glibc and contains a list of
# all supported character set conversions; typically, each character set has
# two supported conversions, one from $charset to INTERNAL representation,
# and one from INTERNAL to $charset. The file also lists aliases for
# character sets on separate lines, which are just different names for the
# same charset.
# Locate the installed gconv-modules file: the lib64 directory is preferred,
# with lib as the fallback.  Without a module list there is nothing to test,
# so bail out; the diagnostic goes to stderr to keep stdout for test results.
modulelist_file=""
for gconv_dir in "/usr/lib64/gconv" "/usr/lib/gconv"; do
  if [ -f "$gconv_dir/gconv-modules" ]; then
    modulelist_file="$gconv_dir/gconv-modules"
    break
  fi
done
if [ -z "$modulelist_file" ]; then
  echo "No module list found installed on system" >&2
  exit 1
fi
# Print the sorted, de-duplicated character set names found in a
# gconv-modules style file.
# Arguments: $1 - path of the module list file
# Outputs:   one charset name per line on stdout
list_charsets ()
{
  # Only 'module' lines are conversions (other lines are aliases or comments).
  # Lines whose second column starts with INTERNAL are the 'INTERNAL->charset'
  # direction and are dropped, so each charset is taken from its
  # 'charset->INTERNAL' line.  The first "//" is stripped from the name.
  # Some charsets have several direct conversions to other charsets for
  # efficiency; `sort -u' drops those duplicates.
  awk '$1 == "module" && NF >= 2 && $2 !~ /^INTERNAL/ {
         name = $2
         sub(/\/\//, "", name)
         print name
       }' < "$1" |
    sort -u
}
charset_list="$(list_charsets "$modulelist_file")"
# Cache of command lines already known to break iconv.  Each entry has the
# form "twobyte;c;to_cs"; the entries are retried against every charset
# before the exhaustive two-byte search is started.
declare -a failarray=()
# Judge the most recent iconv run from its exit status in the global $ret.
# Returns 0 when the run counts as a pass, 1 when it counts as a failure.
is_test_pass ()
{
  # `timeout' reports 124 when it had to stop an iconv that was hanging.
  if [ "$ret" -eq "124" ]; then
    return 1
  fi
  # A normal iconv run, whether it succeeds or reports an error, should
  # never exit with a status above 127.
  if [ "$ret" -gt "127" ]; then
    return 1
  fi
  return 0
}
# Print a one-line verdict for $charset on stdout, derived from $ret.
# Sets the global $result to HANG, SEGFAULT, UNEXPECTED or OK.  For anything
# other than OK the line also carries a command that reproduces the failure,
# built from $twobyte, $c and $to_cs.
log_result ()
{
  case "$ret" in
    124|137)  # timeout/hang
      result="HANG"
      ;;
    139)      # segfault
      result="SEGFAULT"
      ;;
    *)
      if [ "$ret" -gt "127" ]; then  # unexpected error
        result="UNEXPECTED"
      else
        result="OK"
      fi
      ;;
  esac
  if [ "$result" = "OK" ]; then
    printf '%s: %s\n' "$result" "$charset"
  else
    printf '%s: %s; echo -en "%s" | iconv %s -f %s -t "%s"\n' \
      "$result" "$charset" "$twobyte" "$c" "$charset" "$to_cs"
  fi
}
# Run one iconv conversion under `timeout' and store its exit status in $ret.
# Inputs (globals): $twobyte - the input as "\xNN\xNN" escape text,
#                   $c       - optional iconv flag ("" or "-c"),
#                   $charset - source character set,
#                   $to_cs   - target character set, including any suffixes.
# All iconv output is discarded; only the exit status matters.
execute_test ()
{
  # $c is intentionally left unquoted so that an empty value adds no argument.
  printf '%b' "$twobyte" |
    timeout -k 4 3 iconv $c -f "$charset" -t "$to_cs" >/dev/null 2>&1
  ret=$?
}
# Replay every cached failing command line against the current $charset.
# Globals:  reads failarray; sets twobyte, c, to_cs (and ret, through
#           execute_test).
# Returns:  1 as soon as a replayed command fails, leaving those globals
#           describing the failure for log_result; 0 if every one passes.
replay_cached_failures ()
{
  local failcommand
  for failcommand in "${failarray[@]}"; do
    # Split "twobyte;c;to_cs" in the current shell.  Piping into `read' would
    # run it in a subshell and throw the assignments away, and without -r the
    # backslashes of the \xNN escapes in $twobyte would be stripped.
    IFS=";" read -r twobyte c to_cs <<< "$failcommand"
    execute_test
    if ! is_test_pass; then
      return 1
    fi
  done
  return 0
}

# Exhaustively try all 2-byte input combinations against the current
# $charset, with every combination of the "ignore"/"transliterate" options.
# Globals:  sets twobyte, c, to_cs (and ret, through execute_test); appends
#           the first failing command line to failarray.
# Returns:  1 on the first failure (search stops there); 0 if all runs pass.
search_two_byte_inputs ()
{
  local b1 b2 i t
  for ((b1 = 0; b1 <= 255; b1++)); do
    for ((b2 = 0; b2 <= 255; b2++)); do
      # Escape text of the form "\xNN\xNN"; execute_test turns it into bytes.
      printf -v twobyte '\\x%02x\\x%02x' "$b1" "$b2"
      for c in "" "-c"; do               # "ignore" passed as an option
        for i in "" "//IGNORE"; do       # "ignore" passed as a suffix
          for t in "" "//TRANSLIT"; do   # "transliterate" passed as a suffix
            if [ -n "$i" ] && [ -n "$t" ]; then
              # With both suffixes present, test both orders, starting
              # with "//IGNORE//TRANSLIT" ...
              to_cs="UTF-8$i$t"
              execute_test
              if is_test_pass; then
                # ... and then "//TRANSLIT//IGNORE".
                to_cs="UTF-8$t$i"
                execute_test
              fi
            else
              # Otherwise there is only one spelling to test.
              to_cs="UTF-8$t$i"
              execute_test
            fi
            if ! is_test_pass; then
              # Cache the failed command line to speed up later charsets,
              # and stop testing this charset.
              failarray+=("$twobyte;$c;$to_cs")
              return 1
            fi
          done
        done
      done
    done
  done
  return 0
}

# Main test loop: for each charset, first replay the cached failures found
# with earlier charsets, and only run the exhaustive search when all of them
# pass.  Exactly one result line is logged per charset.
run_charset_tests ()
{
  # $charset_list is deliberately unquoted: it is split into one name per word.
  for charset in $charset_list; do
    if replay_cached_failures; then
      search_two_byte_inputs
    fi
    log_result
  done
}
run_charset_tests
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment