From e0d80d391fc86c103bb1bb6e74ee7e8602f57603 Mon Sep 17 00:00:00 2001
From: Kesara Rathnayake <kesara@fq.nz>
Date: Wed, 12 Jul 2023 00:23:50 +1200
Subject: [PATCH] feat: Allow Unicode everywhere

Fixes #960
---
 xml2rfc/util/unicode.py     |  5 ---
 xml2rfc/writers/base.py     | 64 +------------------------------------
 xml2rfc/writers/preptool.py | 31 +++---------------
 xml2rfc/writers/v2v3.py     | 54 -------------------------------
 4 files changed, 6 insertions(+), 148 deletions(-)

diff --git a/xml2rfc/util/unicode.py b/xml2rfc/util/unicode.py
index b89afdddb..6071abc58 100644
--- a/xml2rfc/util/unicode.py
+++ b/xml2rfc/util/unicode.py
@@ -35,7 +35,6 @@
     'street',
     'title',
     'u',
-    't',
 ])
 
 # Attribute values should not contain unicode, with some exceptions
@@ -85,10 +84,6 @@
     'title',
     ])
 
-bare_unicode_tags_with_notice = set([
-    't',
-])
-
 def is_svg(e):
     '''
     Returns true if an element is a SVG element
diff --git a/xml2rfc/writers/base.py b/xml2rfc/writers/base.py
index 19274e11c..8821605ca 100644
--- a/xml2rfc/writers/base.py
+++ b/xml2rfc/writers/base.py
@@ -25,14 +25,9 @@
     pass
 
 from xml2rfc import strings, log
-from xml2rfc.uniscripts import is_script
 from xml2rfc.util.date import extract_date, augment_date, format_date, get_expiry_date
 from xml2rfc.util.name import short_author_ascii_name_parts, full_author_name_expansion, short_author_name_parts
-from xml2rfc.util.unicode import (
-        punctuation, unicode_replacements, unicode_content_tags,
-        bare_unicode_tags, bare_unicode_tags_with_notice,
-        bare_latin_tags, unicode_attributes, downcode, downcode_punctuation,
-        is_svg)
+from xml2rfc.util.unicode import is_svg
 from xml2rfc.utils import namespaces, find_duplicate_ids, slugify
 
 
@@ -2039,63 +2034,6 @@ def page_bottom_center(self):
             text = 'Expires %s' % format_date(*parts, legacy=self.options.legacy_date_format)
         return text
 
-    def downcode_punctuation(self):
-        self.downcode(replacements=punctuation)
-        self.downcode_reference_punctuation()
-
-    def downcode(self, replacements=unicode_replacements):
-        """
-
-        Traverses an lxml.etree and replaces unicode characters with the proper
-        equivalents specified in rfc2629-xhtml.ent, resulting in no non-ascii
-        characters except in elements that explicitly permit non-ascii content.
-
-        """
-        for e in self.tree.iter():
-            if is_svg(e):
-                continue
-            if e.text:
-                if not e.tag in unicode_content_tags:
-                    try:
-                        e.text.encode('ascii')
-                    except UnicodeEncodeError:
-                        e.text = downcode(e.text, replacements=replacements)
-                elif e.tag in bare_unicode_tags:
-                    pass
-                elif e.tag in bare_unicode_tags_with_notice:
-                    pass
-                elif e.tag in bare_latin_tags and is_script(e.text, 'Latin'):
-                    pass
-                elif not e.get('ascii'):
-                    try:
-                        e.text.encode('ascii')
-                    except UnicodeEncodeError:
-                        e.text = downcode(e.text, replacements=replacements)
-            if e.tail:
-                if not e.getparent().tag in unicode_content_tags:
-                    try:
-                        e.tail.encode('ascii')
-                    except UnicodeEncodeError:
-                        e.tail = downcode(e.tail, replacements=replacements)
-
-    def downcode_attributes(self, replacements=unicode_replacements):
-        for e in self.tree.iter():
-            for key in e.attrib.keys():
-                if not (e.tag, key) in unicode_attributes:
-                    try:
-                        e.get(key).encode('ascii')
-                    except UnicodeEncodeError:
-                        e.set(key, downcode(e.get(key), replacements=replacements))
-
-    def downcode_reference_punctuation(self):
-        for r in self.tree.xpath('.//references'):
-            for e in r.iter():
-                for key in e.attrib.keys():
-                    try:
-                        e.get(key).encode('ascii')
-                    except UnicodeEncodeError:
-                        e.set(key, downcode_punctuation(e.get(key)))
-
     def pretty_print_prep(self, e, p):
         ind = self.options.indent
         ## The actual printing is done in self.write()
diff --git a/xml2rfc/writers/preptool.py b/xml2rfc/writers/preptool.py
index 54e8de7d6..809446fa8 100644
--- a/xml2rfc/writers/preptool.py
+++ b/xml2rfc/writers/preptool.py
@@ -41,9 +41,8 @@
 from xml2rfc.util.name import full_author_name_expansion
 from xml2rfc.util.num import ol_style_formatter
 from xml2rfc.util.unicode import (
-        unicode_content_tags, unicode_attributes, bare_unicode_tags,
-        bare_unicode_tags_with_notice, expand_unicode_element, isascii,
-        downcode, latinscript_attributes, is_svg)
+        unicode_content_tags, unicode_attributes, expand_unicode_element,
+        isascii, latinscript_attributes, is_svg)
 from xml2rfc.utils import build_dataurl, namespaces, sdict, clean_text
 from xml2rfc.writers.base import default_options, BaseV3Writer, RfcWriterError
 
@@ -503,33 +502,13 @@ def check_attribute_values(self, e, p):
         #
 
     def check_ascii_text(self, e, p):
-        self.downcode_punctuation()
         for c in self.root.iter():
             if is_svg(c):
                 continue
-            p = c.getparent()
-            if c.text and not isascii(c.text):
+            if c.text and not isascii(c.text) and c.tag not in unicode_content_tags:
                 show = c.text.encode('ascii', errors='replace')
-                if c.tag in unicode_content_tags:
-                    if c.tag in bare_unicode_tags_with_notice:
-                        if self.options.warn_bare_unicode:
-                            self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show))
-                    elif not c.get('ascii') and not c.tag in bare_unicode_tags and not is_script(c.text, 'Latin'):
-                        self.err(c, 'Found non-ascii content without matching ascii attribute in <%s>: %s' % (c.tag, show))
-                else:
-                    self.err(c, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: %s' % (c.tag, show))
-                    c.text = downcode(c.text)
-            if c.tail and not isascii(c.tail):
-                show = c.tail.encode('ascii', errors='replace')
-                if p.tag in unicode_content_tags:
-                    if p.tag in bare_unicode_tags_with_notice:
-                        if self.options.warn_bare_unicode:
-                            self.warn(p, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (p.tag, show))
-                    elif not p.get('ascii') and not p.tag in bare_unicode_tags:
-                        self.err(p, 'Found non-unicode content without matching ascii attribute in <%s>: %s' % (p.tag, show))
-                else:
-                    self.err(p, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: %s' % (p.tag, show))
-                    c.tail = downcode(c.tail)
+                if self.options.warn_bare_unicode:
+                    self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show))
 
     def normalize_text_items(self, e, p):
         """
diff --git a/xml2rfc/writers/v2v3.py b/xml2rfc/writers/v2v3.py
index 3e7f0dc9b..2a17016fd 100644
--- a/xml2rfc/writers/v2v3.py
+++ b/xml2rfc/writers/v2v3.py
@@ -13,7 +13,6 @@
 
 import xml2rfc
 from xml2rfc import log
-from xml2rfc.util.unicode import unicode_content_tags, unicode_replacements, isascii, is_svg
 from xml2rfc.utils import hastext, isempty, sdict, slugify, iscomment
 from xml2rfc.writers.base import default_options, BaseV3Writer
 
@@ -316,7 +315,6 @@ def convert2to3(self):
             './/*[self::artwork or self::dl or self::figure or self::ol or self::sourcecode or self::t or self::ul]',
             '//*[@*="yes" or @*="no"]',      # convert old attribute false/true
             '.;pretty_print_prep()',
-            '.;wrap_non_ascii()',
         ]
 
         # Remove any DOCTYPE declaration
@@ -1148,55 +1146,3 @@ def attribute_yes_no(self, e, p):
 #             if c.tail != None:
 #                 if c.tail.strip() == '':
 #                     c.tail = None
-
-    def wrap_non_ascii(self, e, p):
-        self.downcode(replacements=unicode_replacements)
-        self.downcode_punctuation()
-        for e in self.tree.iter():
-            if is_svg(e):
-                continue
-            def uwrap(text, line): 
-                words = []
-                elements = []
-                ascii = None
-                for word in re.split('(\s+)', text, flags=re.U):
-                    if isascii(word):
-                        words.append(word)
-                    else:
-                        u = self.element('u', line=line)
-                        u.text = word
-                        if ascii is None:
-                            ascii = ''.join(words)
-                            words = []
-                            elements.append(u)
-                        else:
-                            elements[-1].tail = ''.join(words)
-                            words = []
-                            elements.append(u)
-                if words:
-                    if ascii is None:
-                        ascii = ''.join(words)
-                    else:
-                        elements[-1].tail = ''.join(words)
-                return ascii, elements
-            if e.text and e.tag not in unicode_content_tags:
-                    try:
-                        e.text.encode('ascii')
-                    except UnicodeEncodeError:
-                        e.text, elements = uwrap(e.text, e.sourceline)
-                        if len(e):
-                            c = e[0]
-                            for u in elements:
-                                c.addprevious(u)
-                        else:
-                            for u in elements:
-                                e.append(u)
-            if e.tail and e.getparent().tag not in unicode_content_tags:
-                    try:
-                        e.tail.encode('ascii')
-                    except UnicodeEncodeError:
-                        e.tail, elements = uwrap(e.tail, e.sourceline)
-                        for u in elements:
-                            e.addnext(u)
-                            e = u
-