From 00aaac1fc3981c7f2e4ea4d88fb81088068f93c4 Mon Sep 17 00:00:00 2001
From: Leonid Plyushch <leonid.plyushch@gmail.com>
Date: Sun, 19 Jan 2020 19:47:26 +0200
Subject: [PATCH] new package: html2text

Requested in https://github.com/termux/termux-packages/issues/4757.
---
 packages/html2text/build.sh                   |   7 +
 .../html2text-1.3.2a_Makefile.in.patch        |  25 +
 .../html2text-1.3.2a_configure.patch          | 147 ++++
 .../patch-utf8-html2text-1.3.2a.patch         | 706 ++++++++++++++++++
 4 files changed, 885 insertions(+)
 create mode 100644 packages/html2text/build.sh
 create mode 100644 packages/html2text/html2text-1.3.2a_Makefile.in.patch
 create mode 100644 packages/html2text/html2text-1.3.2a_configure.patch
 create mode 100644 packages/html2text/patch-utf8-html2text-1.3.2a.patch
diff --git a/packages/html2text/build.sh b/packages/html2text/build.sh
new file mode 100644
index 000000000..60fe86bf7
--- /dev/null
+++ b/packages/html2text/build.sh
@@ -0,0 +1,7 @@
+TERMUX_PKG_HOMEPAGE=http://www.mbayer.de/html2text/
+TERMUX_PKG_DESCRIPTION="Utility that converts HTML documents into plain text"
+TERMUX_PKG_LICENSE="GPL-2.0"
+TERMUX_PKG_VERSION=1.3.2
+TERMUX_PKG_SRCURL=http://www.mbayer.de/html2text/downloads/html2text-${TERMUX_PKG_VERSION}a.tar.gz
+TERMUX_PKG_SHA256=000b39d5d910b867ff7e087177b470a1e26e2819920dcffd5991c33f6d480392
+TERMUX_PKG_BUILD_IN_SRC=true
diff --git a/packages/html2text/html2text-1.3.2a_Makefile.in.patch b/packages/html2text/html2text-1.3.2a_Makefile.in.patch
new file mode 100644
index 000000000..45d959050
--- /dev/null
+++ b/packages/html2text/html2text-1.3.2a_Makefile.in.patch
@@ -0,0 +1,25 @@
+diff -uNr html2text-1.3.2a/Makefile.in html2text-1.3.2a.mod/Makefile.in
+--- html2text-1.3.2a/Makefile.in	2004-01-14 15:47:02.000000000 +0200
++++ html2text-1.3.2a.mod/Makefile.in	2020-01-19 19:44:43.131479673 +0200
+@@ -29,9 +29,9 @@
+ YFLAGS   =
+ 
+ INSTALLER = install
+-BINDIR    = /usr/local/bin
+-MANDIR    = /usr/local/man
+-DOCDIR    = /usr/share/doc/html2text
++BINDIR    = @TERMUX_PREFIX@/bin
++MANDIR    = @TERMUX_PREFIX@/share/man
++DOCDIR    = @TERMUX_PREFIX@/share/doc/html2text
+ 
+ CXX                 = @CXX@
+ BOOL_DEFINITION     = @BOOL_DEFINITION@
+@@ -91,7 +91,7 @@
+ # This is mostly thought for RPM builts and users that don't read the documentation.
+ 
+ install :
+-	$(INSTALLER) -s -m 755 html2text $(BINDIR);
++	$(INSTALLER) -m 755 html2text $(BINDIR);
+ 	$(INSTALLER) -m 644 html2text.1.gz $(MANDIR)/man1;
+ 	$(INSTALLER) -m 644 html2textrc.5.gz $(MANDIR)/man5;
+ 	$(INSTALLER) -d -m 755 $(DOCDIR);
diff --git a/packages/html2text/html2text-1.3.2a_configure.patch b/packages/html2text/html2text-1.3.2a_configure.patch
new file mode 100644
index 000000000..38f19dbc1
--- /dev/null
+++ b/packages/html2text/html2text-1.3.2a_configure.patch
@@ -0,0 +1,147 @@
+diff -uNr html2text-1.3.2a/configure html2text-1.3.2a.mod/configure
+--- html2text-1.3.2a/configure	2004-01-12 17:47:18.000000000 +0200
++++ html2text-1.3.2a.mod/configure	2020-01-19 19:43:44.205959803 +0200
+@@ -31,24 +31,6 @@
+ #
+ 
+ $echo 'Checking C++ compiler... \c';
+-cat <<EOF >$tmp_file.C;
+-#include <iostream>
+-int main(int, char **) {
+-  std::cout << "hello" << std::endl;
+-  return 0;
+-}
+-EOF
+-CXX=unknown;
+-for i in "CC" "g++" "cc" "$CC"; do
+-  if $i -c $tmp_file.C 2>/dev/null; then
+-    CXX="$i";
+-    break;
+-  fi;
+-done;
+-if test "$CXX" = unknown; then
+-  $echo "Error: Could not find a working C++ compiler.";
+-  exit 1;
+-fi;
+ $echo "use \"$CXX\"";
+ 
+ #
+@@ -57,7 +39,7 @@
+ 
+ $echo 'Checking <sys/poll.h>... \c';
+ SYS_POLL_MISSING=unknown;
+-cat <<EOF >$tmp_file.C;
++cat <<EOF >$tmp_file.cc;
+ #ifdef SYS_POLL_MISSING /* { */
+ struct pollfd { int fd; short events; short revents; };
+ extern "C" int poll(struct pollfd *ufds, unsigned int nfds, int timeout);
+@@ -76,7 +58,7 @@
+ }
+ EOF
+ for i in "" -DSYS_POLL_MISSING; do
+-  if $CXX $tmp_file.C $i -o $tmp_file 2>/dev/null; then
++  if $CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null; then
+     SYS_POLL_MISSING="$i";
+     break;
+   fi;
+@@ -97,7 +79,7 @@
+ 
+ $echo 'Checking for socket libraries... \c';
+ SOCKET_LIBRARIES=unknown;
+-cat >$tmp_file.C <<EOF;
++cat >$tmp_file.cc <<EOF;
+ extern "C" int socket();
+ extern "C" void gethostbyname();
+ int main() {
+@@ -107,7 +89,7 @@
+ }
+ EOF
+ for i in "" "-lbsocket" "-lbsocket -lnsl" "-lsocket" "-lsocket -lnsl"; do
+-  if $CXX $tmp_file.C $i -o $tmp_file 2>/dev/null; then
++  if $CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null; then
+     SOCKET_LIBRARIES="$i";
+     break;
+   fi;
+@@ -128,7 +110,7 @@
+ 
+ $echo 'Checking "bool"... \c';
+ BOOL_DEFINITION=unknown;
+-cat <<EOF >$tmp_file.C;
++cat <<EOF >$tmp_file.cc;
+ #ifdef BOOL_DEFINITION
+ BOOL_DEFINITION
+ #endif
+@@ -144,7 +126,7 @@
+   '-DBOOL_DEFINITION="typedef unsigned char bool;const bool false=0,true=1;"' \
+   '-DBOOL_DEFINITION="enum bool{false,true};"'; \
+ do
+-  if eval "$CXX $tmp_file.C $i -o $tmp_file 2>/dev/null"; then
++  if eval "$CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null"; then
+     BOOL_DEFINITION="$i";
+     break;
+   fi;
+@@ -165,7 +147,7 @@
+ 
+ $echo 'Checking "explicit"... \c';
+ EXPLICIT=unknown;
+-cat <<EOF >$tmp_file.C;
++cat <<EOF >$tmp_file.cc;
+ struct C {
+   explicit C(int) {}
+ };
+@@ -175,7 +157,7 @@
+   '' \
+   '-Dexplicit='; \
+ do
+-  if eval "$CXX $tmp_file.C $i -o $tmp_file 2>/dev/null"; then
++  if eval "$CXX $tmp_file.cc $i -o $tmp_file 2>/dev/null"; then
+     EXPLICIT="$i";
+     break;
+   fi;
+@@ -195,7 +177,7 @@
+ #
+ 
+ $echo 'Checking Standard C++ library... \c';
+-cat <<EOF >$tmp_file.C;
++cat <<EOF >$tmp_file.cc;
+ #include <string>
+ #include <list>
+ #include <memory>
+@@ -207,7 +189,7 @@
+ using namespace std;
+ void func() { map<string, string> x; }
+ EOF
+-if $CXX -c $tmp_file.C 2>/dev/null; then
++if $CXX -c $tmp_file.cc 2>/dev/null; then
+   LIBSTDCXX_INCLUDES="";
+   LIBSTDCXX_LIBS="";
+   $echo 'works; no need to make "./libstd"';
+@@ -223,7 +205,7 @@
+ #
+ AUTO_PTR_BROKEN="";
+ $echo 'Checking "auto_ptr"... \c';
+-cat <<EOF >$tmp_file.C;
++cat <<EOF >$tmp_file.cc;
+ #include <memory>
+ #include <string>
+ #include <list>
+@@ -243,7 +225,7 @@
+   return 0;
+ }  
+ EOF
+-if eval "$CXX -c $LIBSTDCXX_INCLUDES $EXPLICIT $BOOL_DEFINITION $tmp_file.C" 2>/dev/null; then
++if eval "$CXX -c $LIBSTDCXX_INCLUDES $EXPLICIT $BOOL_DEFINITION $tmp_file.cc" 2>/dev/null; then
+   $echo 'defined in <memory>, good';
+ else
+   $echo 'not defined or not working, use "./libstd/include/auto_ptr.h"';
+@@ -255,8 +237,8 @@
+ #
+ MAKEDEPEND_INCLUDES="";
+ $echo 'Checking "makedepend" includes... \c';
+-echo "#include <iostream>" >$tmp_file.C;
+-MAKEDEPEND_INCLUDES=`$CXX -E $tmp_file.C 2>/dev/null |
++echo "#include <iostream>" >$tmp_file.cc;
++MAKEDEPEND_INCLUDES=`$CXX -E $tmp_file.cc 2>/dev/null |
+ sed -n \
+   -e 's/^#line .*"\(\/.*\)\/.*".*/-I\1/p' \
+   -e 's/^# [1-9][0-9]* "\(\/.*\)\/.*".*/-I\1/p' |
diff --git a/packages/html2text/patch-utf8-html2text-1.3.2a.patch b/packages/html2text/patch-utf8-html2text-1.3.2a.patch
new file mode 100644
index 000000000..442d1871e
--- /dev/null
+++ b/packages/html2text/patch-utf8-html2text-1.3.2a.patch
@@ -0,0 +1,706 @@
+diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C
+--- html2text-1.3.2a/Area.C	2003-11-23 12:05:29.000000000 +0100
++++ html2text-1.3.2a-patched/Area.C	2005-05-13 22:19:59.862137688 +0200
+@@ -36,10 +36,13 @@
+ #include <iostream>
+ 
+ #include "Area.h"
++#include "html.h"
+ #include "string.h"
+ 
+ #define LATIN1_nbsp 160
+ 
++extern int use_encoding;
++
+ /* ------------------------------------------------------------------------- */
+ 
+ #define malloc_array(type, size)\
+@@ -81,6 +84,27 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++/*           utf_length() and utf_width()       
++ *
++ *     Very simplified algorithm of calculating length of UTF-8
++ *   string. No check for errors. Counting only ASCII bytes and
++ *   leading bytes of UTF-8 multibyte sequences. All bytes like
++ *   10xxxxxx are dropped. If USE_UTF8 is false then returns
++ *   usual length.               --YS
++ */
++
++unsigned int
++Line::utf_length(size_type f, size_type t) const
++{
++  size_type m = (t < length_ ? t : length_); /* clamp end to the line */
++  if (f >= m) return 0; /* empty/inverted range: m - f would wrap if unsigned */
++  size_type r = m - f;
++  if(USE_UTF8)
++      for (size_type i = f; i < m; i++) /* same type as the bounds */
++        if((cells_[i].character & 0xc0) == 0x80) r--;
++  return r;
++}
++
+ void
+ Line::resize(size_type l)
+ {
+@@ -236,6 +260,23 @@
+   return *this;
+ }
+ 
++unsigned int
++Area::utf_width()
++{ /* Widest row in characters; plain width_ unless USE_UTF8. */
++  if(!USE_UTF8) return width_;
++  size_type r = 0;
++  for (size_type yy = 0; yy < height_; yy++) {
++    size_type r1 = 0; /* characters counted in this row so far */
++    for (int i = width_ - 1; i >= 0; i--) { /* scan right to left */
++      /* Skip blanks until a character is counted; cast: isspace(negative) is UB. */
++      if(!r1 && isspace((unsigned char) cells_[yy][i].character)) continue;
++      if((cells_[yy][i].character & 0xc0) != 0x80) r1++; /* not 10xxxxxx */
++    }
++    if(r < r1) r = r1;
++  }
++  return r;
++}
++
+ void
+ Area::resize(size_type w, size_type h)
+ {
+@@ -439,7 +480,7 @@
+       char c = p->character;
+       char a = p->attribute;
+ 
+-      if (c == (char) LATIN1_nbsp) c = ' ';
++      if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
+ 
+       if (a == Cell::NONE) {
+         os << c;
+Nur in html2text-1.3.2a-patched/: Area.C.orig.
+diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h
+--- html2text-1.3.2a/Area.h	2003-11-23 12:05:29.000000000 +0100
++++ html2text-1.3.2a-patched/Area.h	2005-05-13 22:19:59.863137536 +0200
+@@ -81,6 +81,8 @@
+   Cell       &operator[](size_type x)       { return cells_[x]; }
+   const Cell *cells() const { return cells_; }
+ 
++  unsigned int utf_length(size_type f, size_type t) const;
++
+   void resize(size_type l);
+   void enlarge(size_type l) { if (l > length_) resize(l); }
+ 
+@@ -134,6 +136,8 @@
+   Cell       *operator[](size_type y)       { return cells_[y]; }
+   const Area &operator>>=(size_type rs);
+ 
++  unsigned int utf_width();
++
+   void resize(size_type w, size_type h);
+   void enlarge(size_type w, size_type h);
+ 
+Nur in html2text-1.3.2a-patched/: Area.h.orig.
+diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C
+--- html2text-1.3.2a/format.C	2003-11-23 12:05:29.000000000 +0100
++++ html2text-1.3.2a-patched/format.C	2005-05-13 22:19:59.865137232 +0200
+@@ -1210,6 +1210,7 @@
+     }
+ 
+     Line::size_type to = from + 1;
++    int to_from;
+ 
+     Line::size_type lbp = (Line::size_type) -1; // "Last break position".
+ 
+@@ -1238,18 +1239,20 @@
+         to++;
+       }
+ 
+-      if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }
++      if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 
++                    { to = lbp; break; }
+     }
+ 
++    to_from = line.utf_length(from,to);
+     /*
+      * Copy the "from...to" range from the "line" to the bottom of the "res"
+      * Area.
+      */
+     Area::size_type x = 0;
+     Area::size_type len = to - from;
+-    if (halign == Area::LEFT || len >= w) { ;                   } else
+-    if (halign == Area::CENTER)           { x += (w - len) / 2; } else
+-    if (halign == Area::RIGHT)            { x += w - len;       }
++    if (halign == Area::LEFT || to_from >= w) { ;                   } else
++    if (halign == Area::CENTER)           { x += (w - to_from) / 2; } else
++    if (halign == Area::RIGHT)            { x += w - to_from;       }
+     res->insert(line.cells() + from, len, x, res->height());
+ 
+     /*
+Nur in html2text-1.3.2a-patched/: format.C.orig.
+diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C
+--- html2text-1.3.2a/html2text.C	2003-11-23 12:05:29.000000000 +0100
++++ html2text-1.3.2a-patched/html2text.C	2005-05-13 22:19:59.868136776 +0200
+@@ -148,9 +148,10 @@
+   -o <file>      Redirect output into <file>\n\
+   -nobs          Do not use backspaces for boldface and underlining\n\
+   -ascii         Use plain ASCII for output instead of ISO-8859-1\n\
++  -utf8          Assume both terminal and input stream are in UTF-8 mode\n\
+ ";
+ 
+-int use_iso8859 = 1;
++int use_encoding = ISO8859;
+ 
+ int
+ main(int argc, char **argv)
+@@ -199,7 +200,8 @@
+     if (!strcmp(arg, "-width"        )) { width = atoi(argv[++i]);       } else
+     if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];  } else
+     if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else
+-    if (!strcmp(arg, "-ascii"        )) { use_iso8859 = false;           } else
++    if (!strcmp(arg, "-ascii"        )) { use_encoding = ASCII;          } else
++    if (!strcmp(arg, "-utf8"         )) { use_encoding = UTF8;           } else
+     {
+       std::cerr
+ 	<< "Unrecognized command line option \""
+Nur in html2text-1.3.2a-patched/: html2text.C.orig.
+diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h
+--- html2text-1.3.2a/html.h	2001-10-04 22:03:54.000000000 +0200
++++ html2text-1.3.2a-patched/html.h	2005-05-13 22:19:59.866137080 +0200
+@@ -61,6 +61,11 @@
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++enum {ASCII, ISO8859, UTF8};
++#define USE_ISO8859 (use_encoding == ISO8859)
++#define USE_ASCII (use_encoding == ASCII)
++#define USE_UTF8 (use_encoding == UTF8)
++
+ #define LATIN1_nbsp   160
+ #define LATIN1_iexcl  161
+ #define LATIN1_cent   162
+diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C
+--- html2text-1.3.2a/sgml.C	2003-11-23 12:09:11.000000000 +0100
++++ html2text-1.3.2a-patched/sgml.C	2005-05-13 22:19:59.870136472 +0200
+@@ -62,261 +62,280 @@
+   char name[8];
+   int  iso8859code;
+   char *asciistr;
++  unsigned long unicode;
+ } entities[] = {
+-  { "AElig",   LATIN1_AElig,  "AE"         },
+-  { "AMP",     0,             "&"          },
+-  { "Aacute",  LATIN1_Aacute, "A'"         },
+-  { "Acirc",   LATIN1_Acirc,  "A^"         },
+-  { "Agrave",  LATIN1_Agrave, "A`"         },
+-  { "Alpha",   0,             "A"          },
+-  { "Aring",   LATIN1_Aring,  "AA"         },
+-  { "Atilde",  LATIN1_Atilde, "A~"         },
+-  { "Auml",    LATIN1_Auml,   "A\""        },
+-  { "Beta",    0,             "B"          },
+-  { "Ccedil",  LATIN1_Ccedil, "C,"         },
+-  { "Chi",     0,             "H"          },
+-  { "Dagger",  0,             "++"         },
+-  { "Delta",   0,             "D"          },
+-  { "ETH",     LATIN1_ETH,    "D-"         },
+-  { "Eacute",  LATIN1_Eacute, "E'"         },
+-  { "Ecirc",   LATIN1_Ecirc,  "E^"         },
+-  { "Egrave",  LATIN1_Egrave, "E`"         },
+-  { "Epsilon", 0,             "E"          },
+-  { "Eta",     0,             "E"          },
+-  { "Euml",    LATIN1_Euml,   "E\""        },
+-  { "GT",      0,             ">"          },
+-  { "Gamma",   0,             "G"          },
+-  { "Iacute",  LATIN1_Iacute, "I'"         },
+-  { "Icirc",   LATIN1_Icirc,  "I^"         },
+-  { "Igrave",  LATIN1_Igrave, "I`"         },
+-  { "Iota",    0,             "I"          },
+-  { "Iuml",    LATIN1_Iuml,   "I\""        },
+-  { "Kappa",   0,             "K"          },
+-  { "LT",      0,             "<"          },
+-  { "Lambda",  0,             "L"          },
+-  { "Mu",      0,             "M"          },
+-  { "Ntilde",  LATIN1_Ntilde, "N~"         },
+-  { "Nu",      0,             "N"          },
+-  { "OElig",   0,             "OE"         },
+-  { "Oacute",  LATIN1_Oacute, "O'"         },
+-  { "Ocirc",   LATIN1_Ocirc,  "O^"         },
+-  { "Ograve",  LATIN1_Ograve, "O`"         },
+-  { "Omega",   0,             "O"          },
+-  { "Omicron", 0,             "O"          },
+-  { "Oslash",  LATIN1_Oslash, "O/"         },
+-  { "Otilde",  LATIN1_Otilde, "O~"         },
+-  { "Ouml",    LATIN1_Ouml,   "O\""        },
+-  { "Phi",     0,             "F"          },
+-  { "Pi",      0,             "P"          },
+-  { "Prime",   0,             "''"         },
+-  { "Psi",     0,             "PS"         },
+-  { "QUOT",    0,             "\""         },
+-  { "Rho",     0,             "R"          },
+-  { "Scaron",  0,             "S"          },
+-  { "Sigma",   0,             "S"          },
+-  { "THORN",   LATIN1_THORN,  "TH"         },
+-  { "Tau",     0,             "T"          },
+-  { "Theta",   0,             "TH"         },
+-  { "Uacute",  LATIN1_Uacute, "U'"         },
+-  { "Ucirc",   LATIN1_Ucirc,  "U^"         },
+-  { "Ugrave",  LATIN1_Ugrave, "U`"         },
+-  { "Upsilon", 0,             "U"          },
+-  { "Uuml",    LATIN1_Uuml,   "U\""        },
+-  { "Xi",      0,             "X"          },
+-  { "Yacute",  LATIN1_Yacute, "Y'"         },
+-  { "Yuml",    0,             "Y\""        },
+-  { "Zeta",    0,             "Z"          },
+-  { "aacute",  LATIN1_aacute, "a'"         },
+-  { "acirc",   LATIN1_acirc,  "a^"         },
+-  { "acute",   LATIN1_acute,  "'"          },
+-  { "aelig",   LATIN1_aelig,  "ae"         },
+-  { "agrave",  LATIN1_agrave, "a`"         },
++  { "AElig",   LATIN1_AElig,  "AE",  0x00c6},
++  { "AMP",     0,             "&",   0x0026},
++  { "Aacute",  LATIN1_Aacute, "A'",  0x00c1},
++  { "Acirc",   LATIN1_Acirc,  "A^",  0x00c2},
++  { "Agrave",  LATIN1_Agrave, "A`",  0x00c0},
++  { "Alpha",   0,             "A",   0x0391},
++  { "Aring",   LATIN1_Aring,  "AA",  0x00c5},
++  { "Atilde",  LATIN1_Atilde, "A~",  0x00c3},
++  { "Auml",    LATIN1_Auml,   "A\"", 0x00c4},
++  { "Beta",    0,             "B",   0x0392},
++  { "Ccedil",  LATIN1_Ccedil, "C,",  0x00c7},
++  { "Chi",     0,             "H",   0x03a7},
++  { "Dagger",  0,             "++",  0x2021}, // U+2021 double dagger; U+2020 is "dagger"
++  { "Delta",   0,             "D",   0x0394},
++  { "ETH",     LATIN1_ETH,    "D-",  0x00d0},
++  { "Eacute",  LATIN1_Eacute, "E'",  0x00c9},
++  { "Ecirc",   LATIN1_Ecirc,  "E^",  0x00ca},
++  { "Egrave",  LATIN1_Egrave, "E`",  0x00c8},
++  { "Epsilon", 0,             "E",   0x0395},
++  { "Eta",     0,             "E",   0x0397},
++  { "Euml",    LATIN1_Euml,   "E\"", 0x00cb},
++  { "GT",      0,             ">",   0x003e},
++  { "Gamma",   0,             "G",   0x0393},
++  { "Iacute",  LATIN1_Iacute, "I'",  0x00cd},
++  { "Icirc",   LATIN1_Icirc,  "I^",  0x00ce},
++  { "Igrave",  LATIN1_Igrave, "I`",  0x00cc},
++  { "Iota",    0,             "I",   0x0399},
++  { "Iuml",    LATIN1_Iuml,   "I\"", 0x00cf},
++  { "Kappa",   0,             "K",   0x039a},
++  { "LT",      0,             "<",   0x003c},
++  { "Lambda",  0,             "L",   0x039b},
++  { "Mu",      0,             "M",   0x039c},
++  { "Ntilde",  LATIN1_Ntilde, "N~",  0x00d1},
++  { "Nu",      0,             "N",   0x039d},
++  { "OElig",   0,             "OE",  0x0152},
++  { "Oacute",  LATIN1_Oacute, "O'",  0x00d3},
++  { "Ocirc",   LATIN1_Ocirc,  "O^",  0x00d4},
++  { "Ograve",  LATIN1_Ograve, "O`",  0x00d2},
++  { "Omega",   0,             "O",   0x03a9},
++  { "Omicron", 0,             "O",   0x039f},
++  { "Oslash",  LATIN1_Oslash, "O/",  0x00d8},
++  { "Otilde",  LATIN1_Otilde, "O~",  0x00d5},
++  { "Ouml",    LATIN1_Ouml,   "O\"", 0x00d6},
++  { "Phi",     0,             "F",   0x03a6},
++  { "Pi",      0,             "P",   0x03a0},
++  { "Prime",   0,             "''",        },
++  { "Psi",     0,             "PS",  0x03a8},
++  { "QUOT",    0,             "\"",        },
++  { "Rho",     0,             "R",   0x03a1},
++  { "Scaron",  0,             "S",   0x0160}, // U+0160 capital; U+0161 is "scaron"
++  { "Sigma",   0,             "S",   0x03a3},
++  { "THORN",   LATIN1_THORN,  "TH",  0x00de},
++  { "Tau",     0,             "T",   0x03a4},
++  { "Theta",   0,             "TH",  0x0398},
++  { "Uacute",  LATIN1_Uacute, "U'",  0x00da},
++  { "Ucirc",   LATIN1_Ucirc,  "U^",  0x00db},
++  { "Ugrave",  LATIN1_Ugrave, "U`",  0x00d9},
++  { "Upsilon", 0,             "U",   0x03a5},
++  { "Uuml",    LATIN1_Uuml,   "U\"", 0x00dc},
++  { "Xi",      0,             "X",   0x039e},
++  { "Yacute",  LATIN1_Yacute, "Y'",  0x00dd},
++  { "Yuml",    0,             "Y\"", 0x0178},
++  { "Zeta",    0,             "Z",   0x0396},
++  { "aacute",  LATIN1_aacute, "a'",  0x00e1},
++  { "acirc",   LATIN1_acirc,  "a^",  0x00e2},
++  { "acute",   LATIN1_acute,  "'",   0x00b4},
++  { "aelig",   LATIN1_aelig,  "ae",  0x00e6},
++  { "agrave",  LATIN1_agrave, "a`",  0x00e0},
+   { "alefsym", 0,             "Aleph"      },
+-  { "alpha",   0,             "a"          },
++  { "alpha",   0,             "a",   0x03b1},
+   { "amp",     0,             "&"          },
+   { "and",     0,             "AND"        },
+   { "ang",     0,             "-V"         },
+   { "apos",    0,             "'"          },
+-  { "aring",   LATIN1_aring,  "aa"         },
+-  { "asymp",   0,             "~="         },
+-  { "atilde",  LATIN1_atilde, "a~"         },
+-  { "auml",    LATIN1_auml,   "a\""        },
++  { "aring",   LATIN1_aring,  "aa",  0x00e5},
++  { "asymp",   0,             "~=",  0x2248},
++  { "atilde",  LATIN1_atilde, "a~",  0x00e3},
++  { "auml",    LATIN1_auml,   "a\"", 0x00e4}, // U+00E4; U+00E5 is "aring"
+   { "bdquo",   0,             "\""         },
+-  { "beta",    0,             "b"          },
+-  { "brvbar",  LATIN1_brvbar, "|"          },
+-  { "bull",    0,             " o "        },
++  { "beta",    0,             "b",   0x03b2},
++  { "brvbar",  LATIN1_brvbar, "|",   0x00a6},
++  { "bull",    0,             " o ", 0x2022},
+   { "cap",     0,             "(U"         },
+-  { "ccedil",  LATIN1_ccedil, "c,"         },
+-  { "cedil",   LATIN1_cedil,  ","          },
+-  { "cent",    LATIN1_cent,   "-c-"        },
+-  { "chi",     0,             "h"          },
+-  { "circ",    0,             "^"          },
++  { "ccedil",  LATIN1_ccedil, "c,",  0x00e7},
++  { "cedil",   LATIN1_cedil,  ",",   0x00b8},
++  { "cent",    LATIN1_cent,   "-c-", 0x00a2},
++  { "chi",     0,             "h",   0x03c7},
++  { "circ",    0,             "^",   0x005e},
+ //  { "clubs",   0,             "[clubs]"    },
+   { "cong",    0,             "?="         },
+-  { "copy",    LATIN1_copy,   "(c)"        },
++  { "copy",    LATIN1_copy,   "(c)", 0x00a9},
+   { "crarr",   0,             "<-'"        },
+   { "cup",     0,             ")U"         },
+-  { "curren",  LATIN1_curren, "CUR"        },
++  { "curren",  LATIN1_curren, "CUR", 0x00a4},
+   { "dArr",    0,             "vv"         },
+-  { "dagger",  0,             "+"          },
++  { "dagger",  0,             "+",   0x2020},
+   { "darr",    0,             "v"          },
+-  { "deg",     LATIN1_deg,    "DEG"        },
+-  { "delta",   0,             "d"          },
++  { "deg",     LATIN1_deg,    "DEG", 0x00b0},
++  { "delta",   0,             "d",   0x03b4},
+ //  { "diams",   0,             "[diamonds]" },
+-  { "divide",  LATIN1_divide, "/"          },
+-  { "eacute",  LATIN1_eacute, "e'"         },
+-  { "ecirc",   LATIN1_ecirc,  "e^"         },
+-  { "egrave",  LATIN1_egrave, "e`"         },
++  { "divide",  LATIN1_divide, "/",   0x00f7},
++  { "eacute",  LATIN1_eacute, "e'",  0x00e9},
++  { "ecirc",   LATIN1_ecirc,  "e^",  0x00ea},
++  { "egrave",  LATIN1_egrave, "e`",  0x00e8},
+   { "empty",   0,             "{}"         },
+-  { "epsilon", 0,             "e"          },
+-  { "equiv",   0,             "=="         },
+-  { "eta",     0,             "e"          },
+-  { "eth",     LATIN1_eth,    "d-"         },
+-  { "euml",    LATIN1_euml,   "e\""        },
+-  { "euro",    0,             "EUR"        },
++  { "epsilon", 0,             "e",   0x03b5},
++  { "equiv",   0,             "==",  0x2261},
++  { "eta",     0,             "e",   0x03b7},
++  { "eth",     LATIN1_eth,    "d-",  0x00f0},
++  { "euml",    LATIN1_euml,   "e\"", 0x00eb},
++  { "euro",    0,             "EUR", 0x20ac},
+   { "exist",   0,             "TE"         },
+   { "fnof",    0,             "f"          },
+   { "forall",  0,             "FA"         },
+-  { "frac12",  LATIN1_frac12, " 1/2"       },
+-  { "frac14",  LATIN1_frac14, " 1/4"       },
+-  { "frac34",  LATIN1_frac34, " 3/4"       },
++  { "frac12",  LATIN1_frac12, " 1/2",0x00bd},
++  { "frac14",  LATIN1_frac14, " 1/4",0x00bc},
++  { "frac34",  LATIN1_frac34, " 3/4",0x00be},
+   { "frasl",   0,             "/"          },
+-  { "gamma",   0,             "g"          },
+-  { "ge",      0,             ">="         },
+-  { "gt",      0,             ">"          },
++  { "gamma",   0,             "g",   0x03b3},
++  { "ge",      0,             ">=",  0x2265},
++  { "gt",      0,             ">",   0x003e},
+   { "hArr",    0,             "<=>"        },
+   { "harr",    0,             "<->"        },
+ //  { "hearts",  0,             "[hearts]"   },
+-  { "hellip",  0,             "..."        },
+-  { "iacute",  LATIN1_iacute, "i'"         },
+-  { "icirc",   LATIN1_icirc,  "i^"         },
+-  { "iexcl",   LATIN1_iexcl,  "!"          },
+-  { "igrave",  LATIN1_igrave, "i`"         },
++  { "hellip",  0,             "...", 0x2026},
++  { "iacute",  LATIN1_iacute, "i'",  0x00ed},
++  { "icirc",   LATIN1_icirc,  "i^",  0x00ee},
++  { "iexcl",   LATIN1_iexcl,  "!",   0x00a1},
++  { "igrave",  LATIN1_igrave, "i`",  0x00ec},
+   { "image",   0,             "Im"         },
+-  { "infin",   0,             "oo"         },
+-  { "int",     0,             "INT"        },
+-  { "iota",    0,             "i"          },
+-  { "iquest",  LATIN1_iquest, "?"          },
++  { "infin",   0,             "oo",  0x221e},
++  { "int",     0,             "INT", 0x222b},
++  { "iota",    0,             "i",   0x03b9},
++  { "iquest",  LATIN1_iquest, "?",   0x00bf},
+   { "isin",    0,             "(-"         },
+-  { "iuml",    LATIN1_iuml,   "i\""        },
+-  { "kappa",   0,             "k"          },
++  { "iuml",    LATIN1_iuml,   "i\"", 0x00ef},
++  { "kappa",   0,             "k",   0x03ba},
+   { "lArr",    0,             "<="         },
+-  { "lambda",  0,             "l"          },
++  { "lambda",  0,             "l",   0x03bb},
+   { "lang",    0,             "</"         },
+   { "laquo",   LATIN1_laquo,  "<<"         },
+-  { "larr",    0,             "<-"         },
++  { "larr",    0,             "<-",  0x2190},
+ //  { "lceil",   0,             "<|"         },
+   { "ldquo",   0,             "\""         },
+-  { "le",      0,             "<="         },
++  { "le",      0,             "<=",  0x2264},
+ //  { "lfloor",  0,             "|<"         },
+   { "lowast",  0,             "*"          },
+   { "loz",     0,             "<>"         },
+   { "lsaquo",  0,             "<"          },
+   { "lsquo",   0,             "`"          },
+-  { "lt",      0,             "<"          },
+-  { "macr",    LATIN1_macr,   "-"          },
++  { "lt",      0,             "<",   0x003c},
++  { "macr",    LATIN1_macr,   "-",   0x00af},
+   { "mdash",   0,             "--"         },
+-  { "micro",   LATIN1_micro,  "my"         },
+-  { "middot",  LATIN1_middot, "."          },
+-  { "minus",   0,             "-"          },
+-  { "mu",      0,             "m"          },
++  { "micro",   LATIN1_micro,  "my",  0x00b5},
++  { "middot",  LATIN1_middot, ".",   0x00b7},
++  { "minus",   0,             "-",   0x2212},
++  { "mu",      0,             "m",   0x03bc},
+   { "nabla",   0,             "Nabla"      },
+-  { "nbsp",    LATIN1_nbsp,   " "          },
++  { "nbsp",    LATIN1_nbsp,   " ",   0x00a0},
+   { "ndash",   0,             "-"          },
+-  { "ne",      0,             "!="         },
++  { "ne",      0,             "!=",  0x2260},
+   { "ni",      0,             "-)"         },
+   { "not",     LATIN1_not,    "NOT"        },
+   { "notin",   0,             "!(-"        },
+   { "nsub",    0,             "!(C"        },
+-  { "ntilde",  LATIN1_ntilde, "n~"         },
+-  { "nu",      0,             "n"          },
+-  { "oacute",  LATIN1_oacute, "o'"         },
+-  { "ocirc",   LATIN1_ocirc,  "o^"         },
++  { "ntilde",  LATIN1_ntilde, "n~",  0x00f1},
++  { "nu",      0,             "n",   0x03bd},
++  { "oacute",  LATIN1_oacute, "o'",  0x00f3},
++  { "ocirc",   LATIN1_ocirc,  "o^",  0x00f4},
+   { "oelig",   0,             "oe"         },
+-  { "ograve",  LATIN1_ograve, "o`"         },
++  { "ograve",  LATIN1_ograve, "o`",  0x00f2},
+   { "oline",   LATIN1_macr,   "-"          },
+-  { "omega",   0,             "o"          },
+-  { "omicron", 0,             "o"          },
++  { "omega",   0,             "o",   0x03c9},
++  { "omicron", 0,             "o",   0x03bf},
+   { "oplus",   0,             "(+)"        },
+   { "or",      0,             "OR"         },
+-  { "ordf",    LATIN1_ordf,   "-a"         },
+-  { "ordm",    LATIN1_ordm,   "-o"         },
+-  { "oslash",  LATIN1_oslash, "o/"         },
+-  { "otilde",  LATIN1_otilde, "o~"         },
++  { "ordf",    LATIN1_ordf,   "-a",  0x00aa},
++  { "ordm",    LATIN1_ordm,   "-o",  0x00ba},
++  { "oslash",  LATIN1_oslash, "o/",  0x00f8},
++  { "otilde",  LATIN1_otilde, "o~",  0x00f5},
+   { "otimes",  0,             "(x)"        },
+-  { "ouml",    LATIN1_ouml,   "o\""        },
+-  { "para",    LATIN1_para,   "P:"         },
+-  { "part",    0,             "PART"       },
+-  { "permil",  0,             " 0/00"      },
++  { "ouml",    LATIN1_ouml,   "o\"", 0x00f6},
++  { "para",    LATIN1_para,   "P:",  0x00b6},
++  { "part",    0,             "PART",0x2202},
++  { "permil",  0,             " 0/00",0x2030},
+   { "perp",    0,             "-T"         },
+-  { "phi",     0,             "f"          },
+-  { "pi",      0,             "p"          },
++  { "phi",     0,             "f",   0x03c6},
++  { "pi",      0,             "p",   0x03c0},
+   { "piv",     0,             "Pi"         },
+-  { "plusmn",  LATIN1_plusmn, "+/-"        },
+-  { "pound",   LATIN1_pound,  "-L-"        },
++  { "plusmn",  LATIN1_plusmn, "+/-", 0x00b1},
++  { "pound",   LATIN1_pound,  "-L-", 0x00a3},
+   { "prime",   0,             "'"          },
+-  { "prod",    0,             "PROD"       },
++  { "prod",    0,             "PROD",0x220f},
+   { "prop",    0,             "0("         },
+-  { "psi",     0,             "ps"         },
++  { "psi",     0,             "ps",  0x03c8},
+   { "quot",    0,             "\""         },
+   { "rArr",    0,             "=>"         },
+-  { "radic",   0,             "SQRT"       },
++  { "radic",   0,             "SQRT",0x221a},
+   { "rang",    0,             "/>"         },
+   { "raquo",   LATIN1_raquo,  ">>"         },
+-  { "rarr",    0,             "->"         },
++  { "rarr",    0,             "->",  0x2192},
+ //  { "rceil",   0,             ">|"         },
+   { "rdquo",   0,             "\""         },
+   { "real",    0,             "Re"         },
+-  { "reg",     LATIN1_reg,    "(R)"        },
++  { "reg",     LATIN1_reg,    "(R)", 0x00ae},
+ //  { "rfloor",  0,             "|>"         },
+-  { "rho",     0,             "r"          },
++  { "rho",     0,             "r",   0x03c1},
+   { "rsaquo",  0,             ">"          },
+   { "rsquo",   0,             "'"          },
+   { "sbquo",   0,             "'"          },
+-  { "scaron",  0,             "s"          },
++  { "scaron",  0,             "s",   0x0161},
+   { "sdot",    0,             "DOT"        },
+-  { "sect",    LATIN1_sect,   "S:"         },
++  { "sect",    LATIN1_sect,   "S:",  0x00a7},
+   { "shy",     LATIN1_shy,    ""           },
+-  { "sigma",   0,             "s"          },
+-  { "sigmaf",  0,             "s"          },
++  { "sigma",   0,             "s",   0x03c3},
++  { "sigmaf",  0,             "s",   0x03c2},
+   { "sim",     0,             "~"          },
+ //  { "spades",  0,             "[spades]"   },
+   { "sub",     0,             "(C"         },
+   { "sube",    0,             "(_"         },
+-  { "sum",     0,             "SUM"        },
++  { "sum",     0,             "SUM", 0x2211},
+   { "sup",     0,             ")C"         },
+-  { "sup1",    LATIN1_sup1,   "^1"         },
+-  { "sup2",    LATIN1_sup2,   "^2"         },
+-  { "sup3",    LATIN1_sup3,   "^3"         },
++  { "sup1",    LATIN1_sup1,   "^1",  0x00b9},
++  { "sup2",    LATIN1_sup2,   "^2",  0x00b2},
++  { "sup3",    LATIN1_sup3,   "^3",  0x00b3},
+   { "supe",    0,             ")_"         },
+-  { "szlig",   LATIN1_szlig,  "ss"         },
+-  { "tau",     0,             "t"          },
++  { "szlig",   LATIN1_szlig,  "ss",  0x00df},
++  { "tau",     0,             "t",   0x03c4},
+   { "there4",  0,             ".:"         },
+-  { "theta",   0,             "th"         },
+-  { "thorn",   LATIN1_thorn,  "th"         },
+-  { "tilde",   0,             "~"          },
+-  { "times",   LATIN1_times,  "x"          },
+-  { "trade",   0,             "[TM]"       },
++  { "theta",   0,             "th",  0x03b8},
++  { "thorn",   LATIN1_thorn,  "th",  0x00fe},
++  { "tilde",   0,             "~",   0x02dc},
++  { "times",   LATIN1_times,  "x",   0x00d7},
++  { "trade",   0,             "[TM]",0x2122},
+   { "uArr",    0,             "^^"         },
+-  { "uacute",  LATIN1_uacute, "u'"         },
++  { "uacute",  LATIN1_uacute, "u'",  0x00fa},
+   { "uarr",    0,             "^"          },
+-  { "ucirc",   LATIN1_ucirc,  "u^"         },
+-  { "ugrave",  LATIN1_ugrave, "u`"         },
+-  { "uml",     LATIN1_uml,    "\""         },
+-  { "upsilon", 0,             "u"          },
+-  { "uuml",    LATIN1_uuml,   "u\""        },
++  { "ucirc",   LATIN1_ucirc,  "u^",  0x00fb},
++  { "ugrave",  LATIN1_ugrave, "u`",  0x00f9},
++  { "uml",     LATIN1_uml,    "\"",  0x00a8},
++  { "upsilon", 0,             "u",   0x03c5},
++  { "uuml",    LATIN1_uuml,   "u\"", 0x00fc},
+   { "weierp",  0,             "P"          },
+-  { "xi",      0,             "x"          },
+-  { "yacute",  LATIN1_yacute, "y'"         },
+-  { "yen",     LATIN1_yen,    "YEN"        },
+-  { "yuml",    LATIN1_yuml,   "y\""        },
+-  { "zeta",    0,             "z"          },
++  { "xi",      0,             "x",   0x03be},
++  { "yacute",  LATIN1_yacute, "y'",  0x00fd},
++  { "yen",     LATIN1_yen,    "YEN", 0x00a5},
++  { "yuml",    LATIN1_yuml,   "y\"", 0x00ff},
++  { "zeta",    0,             "z",   0x03b6},
+ };
+ 
+-extern int use_iso8859;
++extern int use_encoding;
+ 
+ /* ------------------------------------------------------------------------- */
+ 
++char ubuf[5]; /* mkutf() result: up to 4 UTF-8 bytes plus the NUL terminator */
++
++/* Encode code point x as NUL-terminated UTF-8 in the shared buffer ubuf
++ * (overwritten by every call, so copy the result before calling again).
++ * Surrogates and values above U+10FFFF are emitted as U+FFFD. */
++char *mkutf(unsigned long x)
++{
++  static const unsigned char lead[] = { 0x00, 0xc0, 0xe0, 0xf0 };
++  int n, i; /* n: continuation bytes following the lead byte */
++  memset(ubuf, 0, sizeof ubuf);
++  if(x > 0x10ffff || (x >= 0xd800 && x <= 0xdfff)) x = 0xfffd;
++  n = (x < 0x80) ? 0 : (x < 0x800) ? 1 : (x < 0x10000) ? 2 : 3;
++  ubuf[0] = (lead[n] | (x >> (6 * n)));
++  for(i = 1; i <= n; i++) /* six payload bits per byte, high bits first */
++     ubuf[i] = (0x80 | ((x >> (6 * (n - i))) & 0x3f));
++  return ubuf;
++}
++
+ void
+ replace_sgml_entities(string *s)
+ {
+@@ -330,9 +349,9 @@
+      */
+     while (j < l && s->at(j) != '&') ++j;
+     /*
+-     * We could convert high-bit chars to "&#233;" here if use_iso8859
+-     * is off, then let them be translated or not.  Is the purpose of
+-     * !use_iso8859 to allow SGML entities to be seen, or to strongly
++     * We could convert high-bit chars to "&#233;" here if USE_ASCII
++     * is on, then let them be translated or not.  Is the purpose of
++     * USE_ASCII to allow SGML entities to be seen, or to strongly
+      * filter against high-ASCII chars that might blow up a terminal
+      * that doesn't speak ISO8859?  For the moment, "allow SGML entities
+      * to be seen" -- no filtering here.
+@@ -370,7 +389,11 @@
+           if (!isdigit(c)) break;
+           x = 10 * x + c - '0';
+         }
+-        if (use_iso8859 || (x < 128)) {
++        if (USE_UTF8) {
++          s->replace(beg, j - beg, mkutf(x));
++          j = beg + 1;
++        }
++        else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {
+         s->replace(beg, j - beg, 1, (char) x);
+         j = beg + 1;
+         } else {
+@@ -408,13 +431,17 @@
+         (int (*)(const void *, const void *)) strcmp
+       );
+       if (entity != NULL) {
+-        if (use_iso8859 && entity->iso8859code) {
++        if (USE_ISO8859 && entity->iso8859code) {
+           s->replace(beg, j - beg, 1, (char) entity->iso8859code);
+           j = beg + 1;
+-        } else if (entity->asciistr) {
++        } else if (USE_ASCII && entity->asciistr) {
+           s->replace(beg, j - beg, entity->asciistr);
+         j = beg + 1;
+         } /* else don't replace it at all, we don't have a translation */
++        else if(USE_UTF8 && entity->unicode) {
++        s->replace(beg, j - beg, mkutf(entity->unicode));
++        j = beg + 1;
++        }
+       }
+     } else {
+       ;                         /* EXTENSION: Allow literal '&' sometimes. */
+diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C
+--- html2text-1.3.2a/table.C	2002-07-22 13:32:50.000000000 +0200
++++ html2text-1.3.2a-patched/table.C	2005-05-13 22:19:59.871136320 +0200
+@@ -175,7 +175,7 @@
+           - (*number_of_columns_return - 1) * (column_spacing + 0),
+           Area::LEFT // Yields better results than "p->halign"!
+         ));
+-	p->width = tmp.get() ? tmp->width() : 0;
++	p->width = tmp.get() ? tmp->utf_width() : 0;
+       }
+       p->minimized = false;
+ 
+@@ -308,7 +308,7 @@
+ 	left_of_column + old_column_width - 1,
+ 	Area::LEFT // Yields better results than "lc.halign"!
+       ));
+-      w = tmp->width();
++      w = tmp->utf_width();
+       if (w >= left_of_column + old_column_width) lc.minimized = true;
+     }
+     if (w > left_of_column + new_column_width) {