Skip to content

Commit

Permalink
feat: Allow Unicode in everywhere
Browse files Browse the repository at this point in the history
  • Loading branch information
kesara committed Jul 27, 2023
1 parent a5d5532 commit e0d80d3
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 148 deletions.
5 changes: 0 additions & 5 deletions xml2rfc/util/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
'street',
'title',
'u',
't',
])

# Attribute values should not contain unicode, with some exceptions
Expand Down Expand Up @@ -85,10 +84,6 @@
'title',
])

bare_unicode_tags_with_notice = set([
't',
])

def is_svg(e):
'''
Returns true if an element is a SVG element
Expand Down
64 changes: 1 addition & 63 deletions xml2rfc/writers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,9 @@
pass

from xml2rfc import strings, log
from xml2rfc.uniscripts import is_script
from xml2rfc.util.date import extract_date, augment_date, format_date, get_expiry_date
from xml2rfc.util.name import short_author_ascii_name_parts, full_author_name_expansion, short_author_name_parts
from xml2rfc.util.unicode import (
punctuation, unicode_replacements, unicode_content_tags,
bare_unicode_tags, bare_unicode_tags_with_notice,
bare_latin_tags, unicode_attributes, downcode, downcode_punctuation,
is_svg)
from xml2rfc.util.unicode import is_svg
from xml2rfc.utils import namespaces, find_duplicate_ids, slugify


Expand Down Expand Up @@ -2039,63 +2034,6 @@ def page_bottom_center(self):
text = 'Expires %s' % format_date(*parts, legacy=self.options.legacy_date_format)
return text

def downcode_punctuation(self):
    """
    Downcode non-ascii punctuation across the document.

    First runs the general downcode() pass using only the `punctuation`
    replacement table, then downcodes punctuation in the attribute values
    found under <references> elements.
    """
    punctuation_table = punctuation
    self.downcode(replacements=punctuation_table)
    self.downcode_reference_punctuation()

def downcode(self, replacements=unicode_replacements):
    """
    Traverses an lxml.etree and replaces unicode characters with the proper
    equivalents specified in rfc2629-xhtml.ent, resulting in no non-ascii
    characters except in elements that explicitly permit non-ascii content.

    SVG elements are skipped entirely.  Element text is left untouched when
    the tag is in unicode_content_tags and any one of the following holds:
    the tag is in bare_unicode_tags, the tag is in
    bare_unicode_tags_with_notice, the tag is in bare_latin_tags and the
    text is Latin script, or the element carries a non-empty 'ascii'
    attribute.  Tail text is converted unless the parent's tag is in
    unicode_content_tags.
    """
    def has_non_ascii(value):
        # True when the string cannot be represented in ASCII.
        try:
            value.encode('ascii')
        except UnicodeEncodeError:
            return True
        return False

    for elem in self.tree.iter():
        if is_svg(elem):
            continue

        text = elem.text
        if text:
            tag = elem.tag
            # Decide whether this element's text has to end up as ASCII;
            # the order of the checks mirrors the precedence of the tag sets.
            if tag not in unicode_content_tags:
                convert = True
            elif tag in bare_unicode_tags or tag in bare_unicode_tags_with_notice:
                convert = False
            elif tag in bare_latin_tags and is_script(text, 'Latin'):
                convert = False
            else:
                convert = not elem.get('ascii')
            if convert and has_non_ascii(text):
                elem.text = downcode(text, replacements=replacements)

        tail = elem.tail
        # Tail text belongs to the parent's content, so the parent's tag
        # decides whether unicode is permitted there.
        if tail and elem.getparent().tag not in unicode_content_tags:
            if has_non_ascii(tail):
                elem.tail = downcode(tail, replacements=replacements)

def downcode_attributes(self, replacements=unicode_replacements):
    """
    Downcode non-ascii attribute values throughout self.tree, in place.

    Every attribute of every element is checked, except those whose
    (tag, attribute name) pair is listed in unicode_attributes.  Values
    that cannot be encoded as ASCII are replaced by the result of
    downcode() using the given replacement table.
    """
    for elem in self.tree.iter():
        tag = elem.tag
        for name in elem.attrib.keys():
            if (tag, name) in unicode_attributes:
                # This attribute is allowed to hold unicode.
                continue
            value = elem.get(name)
            try:
                value.encode('ascii')
            except UnicodeEncodeError:
                elem.set(name, downcode(value, replacements=replacements))

def downcode_reference_punctuation(self):
    """
    Downcode punctuation in attribute values beneath <references>.

    For each <references> element found in self.tree, every attribute of
    the element and of all its descendants is inspected; values that are
    not pure ASCII are passed through the module-level
    downcode_punctuation() function and written back.
    """
    for references in self.tree.xpath('.//references'):
        for elem in references.iter():
            for name in elem.attrib.keys():
                value = elem.get(name)
                try:
                    value.encode('ascii')
                except UnicodeEncodeError:
                    elem.set(name, downcode_punctuation(value))

def pretty_print_prep(self, e, p):
ind = self.options.indent
## The actual printing is done in self.write()
Expand Down
31 changes: 5 additions & 26 deletions xml2rfc/writers/preptool.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@
from xml2rfc.util.name import full_author_name_expansion
from xml2rfc.util.num import ol_style_formatter
from xml2rfc.util.unicode import (
unicode_content_tags, unicode_attributes, bare_unicode_tags,
bare_unicode_tags_with_notice, expand_unicode_element, isascii,
downcode, latinscript_attributes, is_svg)
unicode_content_tags, unicode_attributes, expand_unicode_element,
isascii, latinscript_attributes, is_svg)
from xml2rfc.utils import build_dataurl, namespaces, sdict, clean_text
from xml2rfc.writers.base import default_options, BaseV3Writer, RfcWriterError

Expand Down Expand Up @@ -503,33 +502,13 @@ def check_attribute_values(self, e, p):
#

def check_ascii_text(self, e, p):
self.downcode_punctuation()
for c in self.root.iter():
if is_svg(c):
continue
p = c.getparent()
if c.text and not isascii(c.text):
if c.text and not isascii(c.text) and c.tag not in unicode_content_tags:
show = c.text.encode('ascii', errors='replace')
if c.tag in unicode_content_tags:
if c.tag in bare_unicode_tags_with_notice:
if self.options.warn_bare_unicode:
self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show))
elif not c.get('ascii') and not c.tag in bare_unicode_tags and not is_script(c.text, 'Latin'):
self.err(c, 'Found non-ascii content without matching ascii attribute in <%s>: %s' % (c.tag, show))
else:
self.err(c, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: %s' % (c.tag, show))
c.text = downcode(c.text)
if c.tail and not isascii(c.tail):
show = c.tail.encode('ascii', errors='replace')
if p.tag in unicode_content_tags:
if p.tag in bare_unicode_tags_with_notice:
if self.options.warn_bare_unicode:
self.warn(p, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (p.tag, show))
elif not p.get('ascii') and not p.tag in bare_unicode_tags:
self.err(p, 'Found non-unicode content without matching ascii attribute in <%s>: %s' % (p.tag, show))
else:
self.err(p, 'Found non-ascii characters outside of elements that can have non-ascii content, in <%s>: %s' % (p.tag, show))
c.tail = downcode(c.tail)
if self.options.warn_bare_unicode:
self.warn(c, 'Found non-ascii characters in an element that should be inspected to ensure they are intentional, in <%s>: %s' % (c.tag, show))

def normalize_text_items(self, e, p):
"""
Expand Down
54 changes: 0 additions & 54 deletions xml2rfc/writers/v2v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import xml2rfc
from xml2rfc import log
from xml2rfc.util.unicode import unicode_content_tags, unicode_replacements, isascii, is_svg
from xml2rfc.utils import hastext, isempty, sdict, slugify, iscomment
from xml2rfc.writers.base import default_options, BaseV3Writer

Expand Down Expand Up @@ -316,7 +315,6 @@ def convert2to3(self):
'.//*[self::artwork or self::dl or self::figure or self::ol or self::sourcecode or self::t or self::ul]',
'//*[@*="yes" or @*="no"]', # convert old attribute false/true
'.;pretty_print_prep()',
'.;wrap_non_ascii()',
]

# Remove any DOCTYPE declaration
Expand Down Expand Up @@ -1148,55 +1146,3 @@ def attribute_yes_no(self, e, p):
# if c.tail != None:
# if c.tail.strip() == '':
# c.tail = None

def wrap_non_ascii(self, e, p):
    """
    Downcode the tree, then wrap any remaining non-ascii words in <u>.

    The whole of self.tree is processed; the `e` and `p` arguments are
    accepted for signature compatibility with the other conversion steps
    but are not read.

    After running downcode() with unicode_replacements and
    downcode_punctuation(), every element outside SVG is visited.  Text
    (for tags not in unicode_content_tags) and tail text (for parents not
    in unicode_content_tags) that still cannot be encoded as ASCII is
    split on whitespace; each non-ascii word is moved into a new <u>
    element and the ascii pieces are kept as the surrounding text/tails.
    """
    def uwrap(text, line):
        # Split `text` into (leading ascii string, list of <u> elements).
        # Each <u> holds one non-ascii word; the ascii run that follows it
        # becomes that element's tail.
        words = []
        elements = []
        leading = None
        # The capturing group keeps the whitespace runs as separate items,
        # so joining the pieces reproduces the original spacing.  A raw
        # string is used so '\s' is not an invalid string escape.
        for word in re.split(r'(\s+)', text, flags=re.U):
            if isascii(word):
                words.append(word)
                continue
            u = self.element('u', line=line)
            u.text = word
            pending = ''.join(words)
            words = []
            if leading is None:
                # Everything seen before the first non-ascii word.
                leading = pending
            else:
                elements[-1].tail = pending
            elements.append(u)
        if words:
            trailing = ''.join(words)
            if leading is None:
                leading = trailing
            else:
                elements[-1].tail = trailing
        return leading, elements

    self.downcode(replacements=unicode_replacements)
    self.downcode_punctuation()
    for elem in self.tree.iter():
        if is_svg(elem):
            continue
        if elem.text and elem.tag not in unicode_content_tags:
            try:
                elem.text.encode('ascii')
            except UnicodeEncodeError:
                elem.text, elements = uwrap(elem.text, elem.sourceline)
                if len(elem):
                    # Existing children: place the new <u> elements ahead
                    # of the first child, in order.
                    first_child = elem[0]
                    for u in elements:
                        first_child.addprevious(u)
                else:
                    for u in elements:
                        elem.append(u)
        if elem.tail and elem.getparent().tag not in unicode_content_tags:
            try:
                elem.tail.encode('ascii')
            except UnicodeEncodeError:
                elem.tail, elements = uwrap(elem.tail, elem.sourceline)
                # Chain the insertions: each <u> is added after the
                # previously inserted one so they stay in document order.
                anchor = elem
                for u in elements:
                    anchor.addnext(u)
                    anchor = u

0 comments on commit e0d80d3

Please sign in to comment.