Skip to content

Commit

Permalink
Auto merge of #365 - froydnj:uts-data-slimming, r=SimonSapin
Browse files Browse the repository at this point in the history
more idna data slimming

We can do a better job of packing the uts46 data:

* We can merge identically-mapped entries that don't have an associated string slice.  This saves ~10% space.
* We can make slices smaller and pack them into `Mapping` better, which saves 25% space on 64-bit platforms.  I think it might save half that on 32-bit platforms, but I didn't check.

Together these are good for ~42KB of space savings on a 64-bit platform.

<!-- Reviewable:start -->
---
This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/rust-url/365)
<!-- Reviewable:end -->
  • Loading branch information
bors-servo authored Jun 20, 2017
2 parents d19d5d0 + c018150 commit 37557e4
Show file tree
Hide file tree
Showing 3 changed files with 6,015 additions and 6,700 deletions.
60 changes: 57 additions & 3 deletions idna/src/make_uts46_mapping_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,15 @@ def strtab_slice(s):
return c

def rust_slice(s):
    """Render a string-table slice as a Rust `StringTableSlice` literal.

    `s` is a `(byte_start, byte_len)` pair.  The start offset is emitted as
    separate low and high bytes so that every field of the Rust struct fits
    in a `u8`.

    Raises AssertionError if the length, or the high byte of the start
    offset, does not fit in a single byte.
    """
    start = s[0]
    length = s[1]
    start_lo = start & 0xff
    start_hi = start >> 8
    # Each value is stored in a `u8` on the Rust side; fail loudly here rather
    # than generate a table that silently truncates.
    assert 0 <= length <= 255
    assert 0 <= start_hi <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)

ranges = []

for line in txt:
# remove comments
Expand All @@ -66,12 +74,58 @@ def rust_slice(s):
if not last:
last = first
mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
unicode_str = None
if len(fields) > 2:
if fields[2].strip():
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
mapping += rust_slice(strtab_slice(unicode_str))
elif mapping == "Deviation":
mapping += rust_slice(strtab_slice(''))
unicode_str = u''
ranges.append((first, last, mapping, unicode_str))

def mergeable_key(r):
    """Return the grouping key for one `(first, last, mapping, unicode_str)` range.

    Ranges whose mapping carries associated data use the whole tuple as their
    key, so they only ever group with an identical entry.  Ranges with a
    data-free mapping use just the mapping name, so adjacent ranges with the
    same mapping compare equal and can be merged.

    Raises AssertionError if the mapping name is not one of the known kinds.
    """
    mapping = r[2]
    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
    assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
    return mapping

# Collapse each run of adjacent ranges that share a data-free mapping into a
# single range covering the whole run; everything else is kept as-is.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    # Only the first member's string is carried over to the merged range
    # below, so any string on another member would be silently dropped.
    for member in group:
        assert member[3] is None
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged range spans from the first member's start to the last
    # member's end and keeps the (shared) mapping of the group.
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))

# Emit one Rust `Range { from, to, mapping }` table entry per optimized range.
for (first, last, mapping, unicode_str) in optimized_ranges:
# Only ranges that carry a string get a StringTableSlice appended to the
# mapping name.  `None` means "no associated string"; an empty string is not
# `None`, so it still gets a slice.
if unicode_str is not None:
mapping += rust_slice(strtab_slice(unicode_str))
print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),
escape_char(char(last)),
mapping))
Expand Down
13 changes: 9 additions & 4 deletions idna/src/uts46.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,22 @@ include!("uts46_mapping_table.rs");

/// Position and length, in bytes, of one entry inside `STRING_TABLE`.
///
/// The start offset is 16 bits wide but is stored as two `u8` halves; see
/// `decode_slice` for how they are recombined.
#[derive(Debug)]
struct StringTableSlice {
    // Store these as separate fields so the structure will have an
    // alignment of 1 and thus pack better into the Mapping enum, below.
    /// Low byte of the starting byte offset.
    byte_start_lo: u8,
    /// High byte of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the slice in bytes.
    byte_len: u8,
}

/// Returns the portion of `STRING_TABLE` that `slice` refers to.
///
/// The 16-bit start offset is rebuilt from its low and high bytes, then
/// `byte_len` bytes are taken starting at that offset.
///
/// # Panics
///
/// Panics if the resulting range is not a valid index into `STRING_TABLE`.
fn decode_slice(slice: &StringTableSlice) -> &'static str {
    let lo = usize::from(slice.byte_start_lo);
    let hi = usize::from(slice.byte_start_hi);
    let start = (hi << 8) | lo;
    let len = usize::from(slice.byte_len);
    &STRING_TABLE[start..(start + len)]
}

#[repr(u16)]
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
Valid,
Expand Down
Loading

0 comments on commit 37557e4

Please sign in to comment.