Skip to content

Commit

Permalink
Use general category to determine zero width combining characters
Browse files Browse the repository at this point in the history
Previously, the canonical combining class was used to determine
which characters are zero width combining characters. This had
two problems:
- it classified spacing marks (category Mc) as zero width
- it classified enclosing marks (category Me) as normal characters

Fix by using the general category to generate a table of combining
characters that have zero width. Characters with a general category
of Mn or Me are included in this table. Characters with a general
category of Mc are not included, and so use the default width of 1.
  • Loading branch information
philipc committed Sep 2, 2015
1 parent 1b82a34 commit eaec972
Show file tree
Hide file tree
Showing 8 changed files with 472 additions and 104 deletions.
12 changes: 7 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ than the most current Unicode Standard release files, which this project
aims to track.

The most current release of this API is based on Unicode Standard release
*7.0.0*, dated *2014-02-28, 23:15:00 GMT [KW, LI]* for table generated by
file ``EastAsianWidth-7.0.0.txt`` and *2014-02-07, 18:42:08 GMT [MD]* for
``DerivedCombiningClass-7.0.0.txt``.
*8.0.0*, dated *2015-02-10, 21:00:00 GMT [KW, LI]* for table generated by
file ``EastAsianWidth-8.0.0.txt`` and *2015-02-13, 13:47:11 GMT [MD]* for
``DerivedGeneralCategory-8.0.0.txt``.

Installation
------------
Expand Down Expand Up @@ -140,12 +140,14 @@ Updating Tables
The command ``python setup.py update`` will fetch the following resources:

- http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
- http://www.unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
- http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt

And generate the table files `wcwidth/table_wide.py`_ and `wcwidth/table_comb.py`_.
And generate the table files `wcwidth/table_wide.py`_,
`wcwidth/table_comb.py`_, and `wcwidth/table_zero.py`_.

.. _`wcwidth/table_wide.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_wide.py
.. _`wcwidth/table_comb.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_comb.py
.. _`wcwidth/table_zero.py`: https://github.com/jquast/wcwidth/tree/master/wcwidth/table_zero.py

wcwidth.c
---------
Expand Down
5 changes: 3 additions & 2 deletions bin/wcwidth-browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
import signal

# local
from wcwidth import wcwidth, table_comb
from wcwidth.wcwidth import _bisearch, wcwidth, COMBINING

# 3rd-party
from blessed import Terminal
Expand Down Expand Up @@ -116,6 +116,7 @@ def __init__(self, width=2):
self.characters = (unichr(idx)
for idx in xrange(LIMIT_UCS)
if wcwidth(unichr(idx)) == width
and not _bisearch(idx, COMBINING)
)

def __iter__(self):
Expand Down Expand Up @@ -152,7 +153,7 @@ def __init__(self, width=1):
"""
self.characters = []
letters_o = (u'o' * width)
for boundaries in table_comb.NONZERO_COMBINING:
for boundaries in COMBINING:
for val in [_val for _val in
range(boundaries[0], boundaries[1] + 1)
if _val <= LIMIT_UCS]:
Expand Down
6 changes: 3 additions & 3 deletions bin/wcwidth-combining-comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import sys

# local imports
from wcwidth.wcwidth import _bisearch, NONZERO_COMBINING
from wcwidth.wcwidth import _bisearch, COMBINING


def report_comb_msg(ucs, comb_py, comb_wc):
Expand Down Expand Up @@ -66,8 +66,8 @@ def report_comb_msg(ucs, comb_py, comb_wc):


def _is_equal_combining(ucs):
comb_py = bool(unicodedata.combining(ucs))
comb_wc = bool(_bisearch(ord(ucs), NONZERO_COMBINING))
comb_py = bool(unicodedata.category(ucs) in ['Mc', 'Me', 'Mn'])
comb_wc = bool(_bisearch(ord(ucs), COMBINING))
assert comb_py == comb_wc, report_comb_msg(ucs, comb_py, comb_wc)


Expand Down
27 changes: 18 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ class SetupUpdate(setuptools.Command):
EAW_OUT = os.path.join(HERE, 'wcwidth', 'table_wide.py')

UCD_URL = ('http://www.unicode.org/Public/UNIDATA/extracted/'
'DerivedCombiningClass.txt')
UCD_IN = os.path.join(HERE, 'data', 'DerivedCombiningClass.txt')
'DerivedGeneralCategory.txt')
UCD_IN = os.path.join(HERE, 'data', 'DerivedGeneralCategory.txt')
CMB_OUT = os.path.join(HERE, 'wcwidth', 'table_comb.py')
ZERO_OUT = os.path.join(HERE, 'wcwidth', 'table_zero.py')

def initialize_options(self):
"""Override builtin method: no options are available."""
Expand All @@ -60,10 +61,11 @@ def finalize_options(self):
pass

def run(self):
"""Execute command: update east-asian and combining tables."""
"""Execute command: update east-asian, combining and zero width tables."""
assert os.getenv('VIRTUAL_ENV'), 'You should be in a virtualenv'
self.do_east_asian_width()
self.do_combining()
self.do_zero_width()

def do_east_asian_width(self):
"""Fetch and update east-asian tables."""
Expand All @@ -75,9 +77,16 @@ def do_east_asian_width(self):
def do_combining(self):
"""Fetch and update combining tables."""
self._do_retrieve(self.UCD_URL, self.UCD_IN)
(version, date, values) = self._do_combining_parse(self.UCD_IN)
(version, date, values) = self._do_category_parse(self.UCD_IN, ('Mc', 'Me', 'Mn',))
table = self._make_table(values)
self._do_write(self.CMB_OUT, 'NONZERO_COMBINING', version, date, table)
self._do_write(self.CMB_OUT, 'COMBINING', version, date, table)

def do_zero_width(self):
"""Fetch and update zero width tables."""
self._do_retrieve(self.UCD_URL, self.UCD_IN)
(version, date, values) = self._do_category_parse(self.UCD_IN, ('Me', 'Mn',))
table = self._make_table(values)
self._do_write(self.ZERO_OUT, 'ZERO_WIDTH', version, date, table)

@staticmethod
def _make_table(values):
Expand Down Expand Up @@ -143,8 +152,8 @@ def _do_east_asian_width_parse(fname,
return version, date, sorted(values)

@staticmethod
def _do_combining_parse(fname, exclude_values=(0,)):
"""Parse unicode combining tables."""
def _do_category_parse(fname, categories):
"""Parse unicode category tables."""
version, date, values = None, None, []
print("parsing {} ..".format(fname))
for line in open(fname, 'rb'):
Expand All @@ -159,8 +168,8 @@ def _do_combining_parse(fname, exclude_values=(0,)):
continue
addrs, details = uline.split(';', 1)
addrs, details = addrs.rstrip(), details.lstrip()
if not any(details.startswith('{} #'.format(value))
for value in exclude_values):
if any(details.startswith('{} #'.format(value))
for value in categories):
start, stop = addrs, addrs
if '..' in addrs:
start, stop = addrs.split('..')
Expand Down
Loading

0 comments on commit eaec972

Please sign in to comment.