diff --git a/tokenization/FST.js b/tokenization/FST.js
new file mode 100644
index 00000000..c9e5f12d
--- /dev/null
+++ b/tokenization/FST.js
@@ -0,0 +1,150 @@
+/**
+ * FST - Finite-state transducer data structure
+ *
+ * A graph structure which is very efficient for querying prefix (or suffix) matches.
+ *
+ * see: https://www.elastic.co/blog/you-complete-me
+ */
+
+const Graph = require('./Graph')
+const ETX = String.fromCharCode(3) // end-of-text
+
+class FST {
+  constructor () {
+    this.head = new Graph()
+    this.tail = new Graph()
+  }
+
+  // add a new token to the index
+  add (token) {
+    if (this.has(token)) { return }
+    this._index(this._split(token))
+  }
+
+  // remove token from index
+  delete (token) {
+    this._deindex(this._split(token))
+  }
+
+  // index contains token
+  has (token) {
+    let node = this._walk(this.head, '>', this._split(token))
+    return !!node && !!node.length(`>${ETX}`)
+  }
+
+  // index contains a token with this prefix
+  hasPrefix (prefix) {
+    let node = this._walk(this.head, '>', this._split(prefix))
+    return !!node && (!node.findOne(`>${ETX}`) || node.findOne('_meta')['>count'] > 1)
+  }
+
+  // index contains a token with this suffix
+  hasSuffix (suffix) {
+    let node = this._walk(this.tail, '<', this._split(suffix).reverse())
+    return !!node && (!node.findOne(`<${ETX}`) || node.findOne('_meta')['<count'] > 1)
+  }
+
+  // split token in to characters
+  _split (token) { return (token || '').split('') }
+
+  // walk the graph & return the last node
+  _walk (parent, direction, chars, create, each) {
+    let DIR = (direction === '<') ? '<' : '>'
+    let END = (DIR === '<') ? this.head : this.tail
+    let LAST = (chars.length - 1)
+    for (let i = 0; i < chars.length; i++) {
+      let child = parent.findOne(`${DIR}${chars[i]}`)
+      if (create === true) {
+        // create new graph node
+        if (!child) {
+          child = new Graph()
+          child.add('_meta', { '>count': 0, '<count': 0 })
+          parent.add(`${DIR}${chars[i]}`, child)
+        }
+        // record another token passing through this node
+        child.findOne('_meta')[`${DIR}count`]++
+        // terminate the token on its final character
+        if (i === LAST) { child.add(`${DIR}${ETX}`, END) }
+      }
+      if (!child) { return null }
+      if (typeof each === 'function') { each(child, parent, chars[i]) }
+      parent = child
+    }
+    return parent
+  }
+
+  /**
+   * print a textual representation of the graph (for debugging)
+   *
+   * example:
+   * [3] >e>x>a
+   * [2] >e>x>a>m
+   * [2] >e>x>a>m>p
+   * [2] >e>x>a>m>p>l
+   * [2] >e>x>a>m>p>l>e
+   * [2] >e>x>a>m>p>l>e>s
+   * [3] >e>x>c
+   * [1] >e>x>c>e
+   * [1] >e>x>c>e>s
+   * [1] >e>x>c>e>s>s
+   */
+  print (node, direction) {
+    this._recurse(node, direction, (path, count) => {
+      console.error(`[${count}] ${path}`)
+    })
+  }
+
+  _recurse (node, direction, each, prefix) {
+    if (!direction) { direction = '>' }
+    if (!prefix) { prefix = '' }
+    for (let key in node.edges) {
+      if (key[0] !== direction) { continue }
+      if (!node.length(key)) { continue }
+      if (node !== this.head && node !== this.tail) {
+        let count = node.findOne('_meta')[`${direction}count`]
+        each(prefix, count)
+      } else if (prefix.length > 0) { return }
+      this._recurse(node.findOne(key), direction, each, prefix + key)
+    }
+  }
+
+  // walk the graph & add characters to graph
+  _index (chars) {
+    this._walk(this.head, '>', chars, true)
+    this._walk(this.tail, '<', chars.reverse(), true)
+  }
+
+  // walk the graph & remove characters from graph
+  _deindex (chars) {
+    // left-to-right
+    let node = this._walk(this.head, '>', chars)
+    if (node && node.remove(`>${ETX}`, this.tail)) {
+      this._walk(this.head, '>', chars, false, (child, parent, char) => {
+        let meta = child.findOne('_meta')
+        if (meta && --meta['>count'] < 1) {
+          parent.remove(`>${char}`, child)
+        }
+      })
+    }
+
+    // right-to-left
+    let reversed = chars.slice().reverse()
+    node = this._walk(this.tail, '<', reversed)
+    if (node && node.remove(`<${ETX}`, this.head)) {
+      this._walk(this.tail, '<', reversed, false, (child, parent, char) => {
+        let meta = child.findOne('_meta')
+        if (meta && --meta['<count'] < 1) {
+          parent.remove(`<${char}`, child)
+        }
+      })
+    }
+  }
+}
+
+module.exports = FST
diff --git a/tokenization/FST.test.js b/tokenization/FST.test.js
new file mode 100644
--- /dev/null
+++ b/tokenization/FST.test.js
+const FST = require('./FST')
+const ETX = String.fromCharCode(3) // end-of-text
+
+module.exports.tests = {}
+
+module.exports.tests.constructor = (test) => {
+  test('constructor', (t) => {
+    let fst = new FST()
+    t.deepEquals(fst.head.constructor.name, 'Graph')
+    t.deepEquals(fst.tail.constructor.name, 'Graph')
+    t.deepEquals(Object.keys(fst.head.edges).length, 0)
+    t.deepEquals(Object.keys(fst.tail.edges).length, 0)
+    t.end()
+  })
+}
+
+module.exports.tests.graph = (test) => {
+  test('graph', (t) => {
+    let fst = new FST()
+    fst.add('example')
+
+    // left-to-right
+    t.true(
+      fst.head
+        .findOne('>e')
+        .findOne('>x')
+        .findOne('>a')
+        .findOne('>m')
+        .findOne('>p')
+        .findOne('>l')
+        .findOne('>e')
+        .findOne(`>${ETX}`)
+    )
+
+    // right-to-left
+    t.true(
+      fst.tail
+        .findOne('<e')
+        .findOne('<l')
+        .findOne('<p')
+        .findOne('<m')
+        .findOne('<a')
+        .findOne('<x')
+        .findOne('<e')
+        .findOne(`<${ETX}`)
+    )
+
+    t.end()
+  })
+}
+
+module.exports.tests.has = (test) => {
+  test('has', (t) => {
+    let fst = new FST()
+
+    fst.add('example')
+    fst.add('exam')
+    t.true(fst.has('example'))
+    t.true(fst.has('exam'))
+
+    t.end()
+  })
+}
+
+module.exports.tests._meta = (test) => {
+  test('_meta', (t) => {
+    let fst = new FST()
+
+    // add term
+    fst.add('example')
+
+    // meta-data
+    t.equals(Object.keys(fst.head.edges).length, 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('e')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('ex')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exa')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exam')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('examp')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exampl')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('example')).findOne('_meta')['>count'], 1)
+
+    // duplicate term
+    fst.add('example')
+
+    // meta-data
+    t.equals(Object.keys(fst.head.edges).length, 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('e')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('ex')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exa')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exam')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('examp')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exampl')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('example')).findOne('_meta')['>count'], 1)
+
+    // add term
+    fst.add('excess')
+
+    t.equals(Object.keys(fst.head.edges).length, 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('e')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('ex')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('exa')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exam')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('examp')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exampl')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('example')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exc')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exce')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exces')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('excess')).findOne('_meta')['>count'], 1)
+
+    // add term
+    fst.add('examples')
+
+    // meta-data
+    t.equals(Object.keys(fst.head.edges).length, 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('e')).findOne('_meta')['>count'], 3)
+    t.equals(fst._walk(fst.head, '>', fst._split('ex')).findOne('_meta')['>count'], 3)
+    t.equals(fst._walk(fst.head, '>', fst._split('exa')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('exam')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('examp')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('exampl')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('example')).findOne('_meta')['>count'], 2)
+    t.equals(fst._walk(fst.head, '>', fst._split('examples')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exc')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exce')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('exces')).findOne('_meta')['>count'], 1)
+    t.equals(fst._walk(fst.head, '>', fst._split('excess')).findOne('_meta')['>count'], 1)
+
+    t.end()
+  })
+}
+
+module.exports.tests.prefix = (test) => {
+  test('prefix', (t) => {
+    let fst = new FST()
+
+    fst.add('example')
+    t.true(fst.hasPrefix('e'))
+    t.true(fst.hasPrefix('ex'))
+    t.true(fst.hasPrefix('exa'))
+    t.true(fst.hasPrefix('exam'))
+    t.true(fst.hasPrefix('examp'))
+    t.true(fst.hasPrefix('exampl'))
+    t.false(fst.hasPrefix('example'))
+
+    fst.add('example')
+    t.true(fst.hasPrefix('e'))
+    t.true(fst.hasPrefix('ex'))
+    t.true(fst.hasPrefix('exa'))
+    t.true(fst.hasPrefix('exam'))
+    t.true(fst.hasPrefix('examp'))
+    t.true(fst.hasPrefix('exampl'))
+    t.false(fst.hasPrefix('example'))
+
+    fst.add('example@')
+    t.true(fst.hasPrefix('e'))
+    t.true(fst.hasPrefix('ex'))
+    t.true(fst.hasPrefix('exa'))
+    t.true(fst.hasPrefix('exam'))
+    t.true(fst.hasPrefix('examp'))
+    t.true(fst.hasPrefix('exampl'))
+    t.true(fst.hasPrefix('example'))
+    t.false(fst.hasPrefix('example@'))
+
+    t.end()
+  })
+}
+
+module.exports.tests.suffix = (test) => {
+  test('suffix', (t) => {
+    let fst = new FST()
+
+    fst.add('example')
+    t.true(fst.hasSuffix('e'))
+    t.true(fst.hasSuffix('le'))
+    t.true(fst.hasSuffix('ple'))
+    t.true(fst.hasSuffix('mple'))
+    t.true(fst.hasSuffix('ample'))
+    t.true(fst.hasSuffix('xample'))
+    t.false(fst.hasSuffix('example'))
+
+    fst.add('example')
+    t.true(fst.hasSuffix('e'))
+    t.true(fst.hasSuffix('le'))
+    t.true(fst.hasSuffix('ple'))
+    t.true(fst.hasSuffix('mple'))
+    t.true(fst.hasSuffix('ample'))
+    t.true(fst.hasSuffix('xample'))
+    t.false(fst.hasSuffix('example'))
+
+    fst.add('@example')
+    t.true(fst.hasSuffix('e'))
+    t.true(fst.hasSuffix('le'))
+    t.true(fst.hasSuffix('ple'))
+    t.true(fst.hasSuffix('mple'))
+    t.true(fst.hasSuffix('ample'))
+    t.true(fst.hasSuffix('xample'))
+    t.true(fst.hasSuffix('example'))
+    t.false(fst.hasSuffix('@example'))
+
+    t.end()
+  })
+}
+
+module.exports.tests.delete = (test) => {
+  test('delete', (t) => {
+    let fst = new FST()
+    t.deepEquals(Object.keys(fst.head.edges), [])
+    t.deepEquals(Object.keys(fst.tail.edges), [])
+
+    fst.add('example')
+    t.deepEquals(Object.keys(fst.head.edges), ['>e'])
+    t.deepEquals(Object.keys(fst.tail.edges), ['<e'])
+
+    fst.add('exam')
+    t.deepEquals(Object.keys(fst.head.edges), ['>e'])
+    t.deepEquals(Object.keys(fst.tail.edges), ['<e', '<m'])
+
+    fst.delete('exam')
+    t.deepEquals(Object.keys(fst.head.edges), ['>e'])
+    t.deepEquals(Object.keys(fst.tail.edges), ['<e'])
+
+    fst.delete('example')
+    t.deepEquals(Object.keys(fst.head.edges), [])
+    t.deepEquals(Object.keys(fst.tail.edges), [])
+
+    t.end()
+  })
+}
+
+module.exports.all = (tape, common) => {
+  function test (name, testFunction) {
+    return tape(`FST: ${name}`, testFunction)
+  }
+
+  for (var testCase in module.exports.tests) {
+    module.exports.tests[testCase](test, common)
+  }
+}
diff --git a/tokenization/Graph.js b/tokenization/Graph.js
index 66cc24e3..3ca548e9 100644
--- a/tokenization/Graph.js
+++ b/tokenization/Graph.js
@@ -16,6 +16,10 @@ class Graph {
     if (!this.edges[relationship]) { return false }
     let len = this.edges[relationship].length
     this.edges[relationship] = this.edges[relationship].filter(n => n !== node)
+    if (!this.edges[relationship].length) {
+      delete this.edges[relationship]
+      return true
+    }
     return this.edges[relationship].length !== len
   }
 
diff --git a/tokenization/Graph.test.js b/tokenization/Graph.test.js
index 223b16dd..c9a1b50a 100644
--- a/tokenization/Graph.test.js
+++ b/tokenization/Graph.test.js
@@ -57,7 +57,7 @@ module.exports.tests.remove = (test) => {
 
     // remove node from 'bar'
    let ok3 = graph.remove('bar', node1)
-    t.equal(graph.edges.bar.length, 0)
+    t.false(graph.edges.bar)
     t.true(ok3)
 
     // remove node from 'baz'
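
A minimal usage sketch of the FST class added above, for context while reviewing; it is not part of the diff, the require path is an assumption, and the expected results follow from the hasPrefix/hasSuffix semantics exercised in FST.test.js.

const FST = require('./tokenization/FST')

// build a small index
const fst = new FST()
fst.add('example')
fst.add('examples')
fst.add('excess')

// exact membership checks the end-of-text (ETX) terminator edge
fst.has('example')      // true
fst.has('exam')         // false

// hasPrefix is true when at least one longer token passes through the node
fst.hasPrefix('exam')   // true  ('example' and 'examples' extend it)
fst.hasPrefix('excess') // false (no longer token continues past it)

// hasSuffix walks the reversed (tail) graph in the same way
fst.hasSuffix('cess')   // true  ('excess')
fst.hasSuffix('less')   // false

// delete removes the terminator and prunes nodes whose counters drop to zero
fst.delete('examples')
fst.has('examples')     // false
fst.has('example')      // true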