Use general category to determine zero width combining characters

Previously, the canonical combining class was used to determine which characters are zero width combining characters. This had two problems:

- it classified spacing marks (category Mc) as zero width
- it classified enclosing marks (category Me) as normal characters

Fix by using the general category to generate a table of combining characters that have zero width. Characters with a general category of Mn or Me are included in this table. Characters with a general category of Mc are not included, and so use the default width of 1.
jquast · Sep 2, 2015 · eaec972 · eaec972
1 parent 1b82a34
commit eaec972
Show file tree

Hide file tree

Showing 8 changed files with 472 additions and 104 deletions.
diff --git a/README.rst b/README.rst
@@ -38,9 +38,9 @@ than the most current Unicode Standard release files, which this project
 aims to track.
 
 The most current release of this API is based from Unicode Standard release
-*7.0.0*, dated *2014-02-28, 23:15:00 GMT [KW, LI]* for table generated by
-file ``EastAsianWidth-7.0.0.txt`` and *2014-02-07, 18:42:08 GMT [MD]* for
-``DerivedCombiningClass-7.0.0.txt``.
+*8.0.0*, dated *2015-02-10, 21:00:00 GMT [KW, LI]* for table generated by
+file ``EastAsianWidth-8.0.0.txt`` and *2015-02-13, 13:47:11 GMT [MD]* for
+``DerivedGeneralCategory-8.0.0.txt``.
 
 Installation
 ------------
@@ -140,12 +140,14 @@ Updating Tables
 The command ``python setup.py update`` will fetch the following resources:
 
 - http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-- http://www.unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
+- http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt
 
-And generate the table files `wcwidth/table_wide.py`_ and `wcwidth/table_comb.py`_.
+And generate the table files `wcwidth/table_wide.py`_,
+`wcwidth/table_comb.py`_, and `wcwidth/table_zero.py`_.
 
 .. _`wcwidth/table_wide.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_wide.py
 .. _`wcwidth/table_comb.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_comb.py
+.. _`wcwidth/table_zero.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_zero.py
 
 wcwidth.c
 ---------

diff --git a/bin/wcwidth-browser.py b/bin/wcwidth-browser.py
@@ -37,7 +37,7 @@
 import signal
 
 # local
-from wcwidth import wcwidth, table_comb
+from wcwidth.wcwidth import _bisearch, wcwidth, COMBINING
 
 # 3rd-party
 from blessed import Terminal
@@ -116,6 +116,7 @@ def __init__(self, width=2):
         self.characters = (unichr(idx)
                            for idx in xrange(LIMIT_UCS)
                            if wcwidth(unichr(idx)) == width
+                           and not _bisearch(idx, COMBINING)
                            )
 
     def __iter__(self):
@@ -152,7 +153,7 @@ def __init__(self, width=1):
         """
         self.characters = []
         letters_o = (u'o' * width)
-        for boundaries in table_comb.NONZERO_COMBINING:
+        for boundaries in COMBINING:
             for val in [_val for _val in
                         range(boundaries[0], boundaries[1] + 1)
                         if _val <= LIMIT_UCS]:

diff --git a/bin/wcwidth-combining-comparator.py b/bin/wcwidth-combining-comparator.py
@@ -18,7 +18,7 @@
 import sys
 
 # local imports
-from wcwidth.wcwidth import _bisearch, NONZERO_COMBINING
+from wcwidth.wcwidth import _bisearch, COMBINING
 
 
 def report_comb_msg(ucs, comb_py, comb_wc):
@@ -66,8 +66,8 @@ def report_comb_msg(ucs, comb_py, comb_wc):
 
 
 def _is_equal_combining(ucs):
-    comb_py = bool(unicodedata.combining(ucs))
-    comb_wc = bool(_bisearch(ord(ucs), NONZERO_COMBINING))
+    comb_py = bool(unicodedata.category(ucs) in ['Mc', 'Me', 'Mn'])
+    comb_wc = bool(_bisearch(ord(ucs), COMBINING))
     assert comb_py == comb_wc, report_comb_msg(ucs, comb_py, comb_wc)
 
 

diff --git a/setup.py b/setup.py
@@ -47,9 +47,10 @@ class SetupUpdate(setuptools.Command):
     EAW_OUT = os.path.join(HERE, 'wcwidth', 'table_wide.py')
 
     UCD_URL = ('http://www.unicode.org/Public/UNIDATA/extracted/'
-               'DerivedCombiningClass.txt')
-    UCD_IN = os.path.join(HERE, 'data', 'DerivedCombiningClass.txt')
+               'DerivedGeneralCategory.txt')
+    UCD_IN = os.path.join(HERE, 'data', 'DerivedGeneralCategory.txt')
     CMB_OUT = os.path.join(HERE, 'wcwidth', 'table_comb.py')
+    ZERO_OUT = os.path.join(HERE, 'wcwidth', 'table_zero.py')
 
     def initialize_options(self):
         """Override builtin method: no options are available."""
@@ -60,10 +61,11 @@ def finalize_options(self):
         pass
 
     def run(self):
-        """Execute command: update east-asian and combining tables."""
+        """Execute command: update east-asian, combining and zero width tables."""
         assert os.getenv('VIRTUAL_ENV'), 'You should be in a virtualenv'
         self.do_east_asian_width()
         self.do_combining()
+        self.do_zero_width()
 
     def do_east_asian_width(self):
         """Fetch and update east-asian tables."""
@@ -75,9 +77,16 @@ def do_east_asian_width(self):
     def do_combining(self):
         """Fetch and update combining tables."""
         self._do_retrieve(self.UCD_URL, self.UCD_IN)
-        (version, date, values) = self._do_combining_parse(self.UCD_IN)
+        (version, date, values) = self._do_category_parse(self.UCD_IN, ('Mc', 'Me', 'Mn',))
         table = self._make_table(values)
-        self._do_write(self.CMB_OUT, 'NONZERO_COMBINING', version, date, table)
+        self._do_write(self.CMB_OUT, 'COMBINING', version, date, table)
+
+    def do_zero_width(self):
+        """Fetch and update zero width tables."""
+        self._do_retrieve(self.UCD_URL, self.UCD_IN)
+        (version, date, values) = self._do_category_parse(self.UCD_IN, ('Me', 'Mn',))
+        table = self._make_table(values)
+        self._do_write(self.ZERO_OUT, 'ZERO_WIDTH', version, date, table)
 
     @staticmethod
     def _make_table(values):
@@ -143,8 +152,8 @@ def _do_east_asian_width_parse(fname,
         return version, date, sorted(values)
 
     @staticmethod
-    def _do_combining_parse(fname, exclude_values=(0,)):
-        """Parse unicode combining tables."""
+    def _do_category_parse(fname, categories):
+        """Parse unicode category tables."""
         version, date, values = None, None, []
         print("parsing {} ..".format(fname))
         for line in open(fname, 'rb'):
@@ -159,8 +168,8 @@ def _do_combining_parse(fname, exclude_values=(0,)):
                 continue
             addrs, details = uline.split(';', 1)
             addrs, details = addrs.rstrip(), details.lstrip()
-            if not any(details.startswith('{} #'.format(value))
-                       for value in exclude_values):
+            if any(details.startswith('{} #'.format(value))
+                       for value in categories):
                 start, stop = addrs, addrs
                 if '..' in addrs:
                     start, stop = addrs.split('..')