From e0d80d391fc86c103bb1bb6e74ee7e8602f57603 Mon Sep 17 00:00:00 2001 From: Kesara Rathnayake Date: Wed, 12 Jul 2023 00:23:50 +1200 Subject: [PATCH] feat: Allow Unicode everywhere Fixes #960 --- xml2rfc/util/unicode.py | 5 --- xml2rfc/writers/base.py | 64 +------------------------------------ xml2rfc/writers/preptool.py | 31 +++--------------- xml2rfc/writers/v2v3.py | 54 ------------------------------- 4 files changed, 6 insertions(+), 148 deletions(-) diff --git a/xml2rfc/util/unicode.py b/xml2rfc/util/unicode.py index b89afdddb..6071abc58 100644 --- a/xml2rfc/util/unicode.py +++ b/xml2rfc/util/unicode.py @@ -35,7 +35,6 @@ 'street', 'title', 'u', - 't', ]) # Attribute values should not contain unicode, with some exceptions @@ -85,10 +84,6 @@ 'title', ]) -bare_unicode_tags_with_notice = set([ - 't', -]) - def is_svg(e): ''' Returns true if an element is a SVG element diff --git a/xml2rfc/writers/base.py b/xml2rfc/writers/base.py index 19274e11c..8821605ca 100644 --- a/xml2rfc/writers/base.py +++ b/xml2rfc/writers/base.py @@ -25,14 +25,9 @@ pass from xml2rfc import strings, log -from xml2rfc.uniscripts import is_script from xml2rfc.util.date import extract_date, augment_date, format_date, get_expiry_date from xml2rfc.util.name import short_author_ascii_name_parts, full_author_name_expansion, short_author_name_parts -from xml2rfc.util.unicode import ( - punctuation, unicode_replacements, unicode_content_tags, - bare_unicode_tags, bare_unicode_tags_with_notice, - bare_latin_tags, unicode_attributes, downcode, downcode_punctuation, - is_svg) +from xml2rfc.util.unicode import is_svg from xml2rfc.utils import namespaces, find_duplicate_ids, slugify @@ -2039,63 +2034,6 @@ def page_bottom_center(self): text = 'Expires %s' % format_date(*parts, legacy=self.options.legacy_date_format) return text - def downcode_punctuation(self): - self.downcode(replacements=punctuation) - self.downcode_reference_punctuation() - - def downcode(self, replacements=unicode_replacements): - 
""" - - Traverses an lxml.etree and replaces unicode characters with the proper - equivalents specified in rfc2629-xhtml.ent, resulting in no non-ascii - characters except in elements that explicitly permit non-ascii content. - - """ - for e in self.tree.iter(): - if is_svg(e): - continue - if e.text: - if not e.tag in unicode_content_tags: - try: - e.text.encode('ascii') - except UnicodeEncodeError: - e.text = downcode(e.text, replacements=replacements) - elif e.tag in bare_unicode_tags: - pass - elif e.tag in bare_unicode_tags_with_notice: - pass - elif e.tag in bare_latin_tags and is_script(e.text, 'Latin'): - pass - elif not e.get('ascii'): - try: - e.text.encode('ascii') - except UnicodeEncodeError: - e.text = downcode(e.text, replacements=replacements) - if e.tail: - if not e.getparent().tag in unicode_content_tags: - try: - e.tail.encode('ascii') - except UnicodeEncodeError: - e.tail = downcode(e.tail, replacements=replacements) - - def downcode_attributes(self, replacements=unicode_replacements): - for e in self.tree.iter(): - for key in e.attrib.keys(): - if not (e.tag, key) in unicode_attributes: - try: - e.get(key).encode('ascii') - except UnicodeEncodeError: - e.set(key, downcode(e.get(key), replacements=replacements)) - - def downcode_reference_punctuation(self): - for r in self.tree.xpath('.//references'): - for e in r.iter(): - for key in e.attrib.keys(): - try: - e.get(key).encode('ascii') - except UnicodeEncodeError: - e.set(key, downcode_punctuation(e.get(key))) - def pretty_print_prep(self, e, p): ind = self.options.indent ## The actual printing is done in self.write() diff --git a/xml2rfc/writers/preptool.py b/xml2rfc/writers/preptool.py index 54e8de7d6..809446fa8 100644 --- a/xml2rfc/writers/preptool.py +++ b/xml2rfc/writers/preptool.py @@ -41,9 +41,8 @@ from xml2rfc.util.name import full_author_name_expansion from xml2rfc.util.num import ol_style_formatter from xml2rfc.util.unicode import ( - unicode_content_tags, unicode_attributes, 
bare_unicode_tags, - bare_unicode_tags_with_notice, expand_unicode_element, isascii, - downcode, latinscript_attributes, is_svg) + unicode_content_tags, unicode_attributes, expand_unicode_element, + isascii, latinscript_attributes, is_svg) from xml2rfc.utils import build_dataurl, namespaces, sdict, clean_text from xml2rfc.writers.base import default_options, BaseV3Writer, RfcWriterError @@ -503,33 +502,13 @@ def check_attribute_values(self, e, p): # def check_ascii_text(self, e, p): - self.downcode_punctuation() for c in self.root.iter(): if is_svg(c): continue - p = c.getparent() - if c.text and not isascii(c.text): + if c.text and not isascii(c.text) and c.tag not in unicode_content_tags: show = c.text.encode('ascii', errors='replace') - if c.tag in unicode_content_tags: - if c.tag in bare_unicode_tags_with_notice: - if self.options.warn_bare_unicode: - self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show)) - elif not c.get('ascii') and not c.tag in bare_unicode_tags and not is_script(c.text, 'Latin'): - self.err(c, 'Found non-ascii content without matching ascii attribute in <%s>: %s' % (c.tag, show)) - else: - self.err(c, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: %s' % (c.tag, show)) - c.text = downcode(c.text) - if c.tail and not isascii(c.tail): - show = c.tail.encode('ascii', errors='replace') - if p.tag in unicode_content_tags: - if p.tag in bare_unicode_tags_with_notice: - if self.options.warn_bare_unicode: - self.warn(p, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (p.tag, show)) - elif not p.get('ascii') and not p.tag in bare_unicode_tags: - self.err(p, 'Found non-unicode content without matching ascii attribute in <%s>: %s' % (p.tag, show)) - else: - self.err(p, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: 
%s' % (p.tag, show)) - c.tail = downcode(c.tail) + if self.options.warn_bare_unicode: + self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show)) def normalize_text_items(self, e, p): """ diff --git a/xml2rfc/writers/v2v3.py b/xml2rfc/writers/v2v3.py index 3e7f0dc9b..2a17016fd 100644 --- a/xml2rfc/writers/v2v3.py +++ b/xml2rfc/writers/v2v3.py @@ -13,7 +13,6 @@ import xml2rfc from xml2rfc import log -from xml2rfc.util.unicode import unicode_content_tags, unicode_replacements, isascii, is_svg from xml2rfc.utils import hastext, isempty, sdict, slugify, iscomment from xml2rfc.writers.base import default_options, BaseV3Writer @@ -316,7 +315,6 @@ def convert2to3(self): './/*[self::artwork or self::dl or self::figure or self::ol or self::sourcecode or self::t or self::ul]', '//*[@*="yes" or @*="no"]', # convert old attribute false/true '.;pretty_print_prep()', - '.;wrap_non_ascii()', ] # Remove any DOCTYPE declaration @@ -1148,55 +1146,3 @@ def attribute_yes_no(self, e, p): # if c.tail != None: # if c.tail.strip() == '': # c.tail = None - - def wrap_non_ascii(self, e, p): - self.downcode(replacements=unicode_replacements) - self.downcode_punctuation() - for e in self.tree.iter(): - if is_svg(e): - continue - def uwrap(text, line): - words = [] - elements = [] - ascii = None - for word in re.split('(\s+)', text, flags=re.U): - if isascii(word): - words.append(word) - else: - u = self.element('u', line=line) - u.text = word - if ascii is None: - ascii = ''.join(words) - words = [] - elements.append(u) - else: - elements[-1].tail = ''.join(words) - words = [] - elements.append(u) - if words: - if ascii is None: - ascii = ''.join(words) - else: - elements[-1].tail = ''.join(words) - return ascii, elements - if e.text and e.tag not in unicode_content_tags: - try: - e.text.encode('ascii') - except UnicodeEncodeError: - e.text, elements = uwrap(e.text, e.sourceline) - if len(e): - c = e[0] - for 
u in elements: - c.addprevious(u) - else: - for u in elements: - e.append(u) - if e.tail and e.getparent().tag not in unicode_content_tags: - try: - e.tail.encode('ascii') - except UnicodeEncodeError: - e.tail, elements = uwrap(e.tail, e.sourceline) - for u in elements: - e.addnext(u) - e = u -