Skip to content

Commit 8244319

Browse files
committed
sqlite: initial commit
1 parent 007ffdd commit 8244319

16 files changed

+713
-105
lines changed

.jshintrc

+1-1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
"quotmark": "single",
1717
"undef": true,
1818
"unused": false,
19-
"maxparams": 4,
19+
"maxparams": 5,
2020
"maxdepth": 4,
2121
"maxlen": 140
2222
}

Placeholder.js

+2-4
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,10 @@
11

22
var _ = require('lodash'),
3-
TokenGraph = require('./lib/TokenGraph'),
43
DocStore = require('./lib/DocStore');
54

65
// constructor
7-
function Placeholder(){
8-
this.graph = new TokenGraph();
9-
this.store = new DocStore();
6+
function Placeholder( options ){
7+
this.store = new DocStore( options );
108
}
119

1210
// load prototype methods from modules

cmd/build.sh

+1-1
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd );
44

55
PLACEHOLDER_DATA=${PLACEHOLDER_DATA:-"./data"};
66

7-
rm -f ${PLACEHOLDER_DATA}/graph.json ${PLACEHOLDER_DATA}/store.sqlite3;
7+
rm -f ${PLACEHOLDER_DATA}/store.sqlite3;
88

99
cat ${PLACEHOLDER_DATA}/wof.extract | node ${DIR}/load.js
1010

cmd/load.js

+5-5
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,11 @@ process.stdin.pipe( split() )
1515
.pipe( through.obj( function insert( row, _, next ){
1616
ph.insertWofRecord( row, next );
1717
}, function flush( next ){
18-
ph.printStatistics();
19-
console.error('sorting...');
20-
ph.graph.sort(); // sort all arrays
21-
console.error('vacuuming sqlite db...');
22-
ph.store.db.run('VACUUM;');
18+
// ph.printStatistics();
19+
console.error('pre commit...');
20+
ph.store.preCommit();
21+
// console.error('sorting...');
22+
// ph.graph.sort(); // sort all arrays
2323
console.error('saving...');
2424
ph.save();
2525
next();

lib/DocStore.js

+83-3
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,14 @@ var codec = {
1414
// connect to and configure sqlite3 database
1515
function DocStore(){}
1616

17-
DocStore.prototype.open = function( path ){
18-
this.db = new sqlite3.Database( path );
17+
DocStore.prototype.open = function( path, options ){
18+
19+
if( options && true === options.readonly ){
20+
this.db = new sqlite3.Database( path, sqlite3.OPEN_READONLY );
21+
} else {
22+
this.db = new sqlite3.Database( path );
23+
}
24+
1925
this.configure();
2026
};
2127

@@ -37,8 +43,62 @@ DocStore.prototype.configure = function(){
3743
/**
 * Drop every table owned by the store and recreate an empty schema.
 *
 * All statements are queued inside a single `db.serialize()` call so that
 * they execute in the order written: drop, vacuum, then create.
 *
 * Tables created:
 *   docs     - one JSON document per id
 *   lineage  - (id, pid) pairs linking a document to a parent id
 *   tokens   - (id, lang, tag, token) rows, one per name token
 *   fulltext - FTS5 virtual table over a single `token` column
 *
 * Takes no arguments and returns nothing; the statements are only queued
 * on `this.db`, and no completion callback is exposed.
 */
DocStore.prototype.reset = function(){
  var db = this.db;

  // FTS5 prefix indexes are requested for every prefix length from 1 to 13.
  var MAX_PREFIX_LENGTH = 13;
  var prefixes = [];
  for( var len = 1; len <= MAX_PREFIX_LENGTH; len++ ){
    prefixes.push( 'prefix=' + len );
  }

  // '_' is declared a token character; preCommit() replaces spaces with '_'
  // when it copies tokens into this table.
  var tokenizer = 'tokenize="' + [
    'unicode61',
    'remove_diacritics 0',
    'tokenchars \'_\''
  ].join(' ') + '"';

  // note: the fts5 options content='' and detail=none are intentionally
  // left disabled.
  var options = [ 'token', tokenizer ]
    .concat( prefixes, [ 'columnsize=0' ] )
    .join(', ');

  db.serialize(function(){
    db.run('DROP TABLE IF EXISTS docs;');
    db.run('DROP TABLE IF EXISTS lineage;');
    db.run('DROP TABLE IF EXISTS tokens;');
    db.run('DROP TABLE IF EXISTS fulltext;');
    db.run('VACUUM;');

    db.run('CREATE TABLE docs( id INTEGER PRIMARY KEY, json TEXT );');
    db.run('CREATE TABLE lineage( id INTEGER, pid INTEGER );');

    // the text columns are declared TEXT rather than STRING: SQLite assigns
    // NUMERIC affinity to the unrecognised type name 'STRING', which would
    // store numeric-looking tokens (eg. '007') as numbers.
    db.run('CREATE TABLE tokens( id INTEGER, lang TEXT, tag TEXT, token TEXT );');

    db.run('CREATE VIRTUAL TABLE fulltext USING fts5(' + options + ');');
  });
};
83+
84+
DocStore.prototype.preCommit = function(){
85+
this.db.serialize(function(){
86+
87+
console.error('create indices...');
88+
this.db.run('CREATE INDEX IF NOT EXISTS lineage_cover_idx ON lineage(id, pid);');
89+
this.db.run('CREATE INDEX IF NOT EXISTS tokens_cover_idx ON tokens(id, lang, tag);');
90+
91+
// this is quite large, could use the fulltext index instead?
92+
this.db.run('CREATE INDEX IF NOT EXISTS tokens_token_idx ON tokens(token);');
93+
94+
console.error('create fulltext table...');
95+
this.db.run('INSERT INTO fulltext(rowid, token) SELECT rowid, REPLACE(token," ","_") FROM tokens;');
96+
97+
console.error('optimize fulltext table...');
98+
this.db.run('INSERT INTO fulltext(fulltext) VALUES(\'optimize\');');
99+
100+
console.error('vacuuming sqlite db...');
40101
this.db.run('VACUUM;');
41-
this.db.get('CREATE TABLE docs( id INTEGER PRIMARY KEY, json TEXT );');
42102
}.bind(this));
43103
};
44104

@@ -49,6 +109,26 @@ DocStore.prototype.set = function( id, doc, cb ){
49109
);
50110
};
51111

112+
/**
 * Insert one lineage row per parent id for a document, using a single
 * multi-row INSERT statement.
 *
 * @param {number|string} id   document id; must be an integer (or a string
 *                             of digits), otherwise `cb` receives an Error
 * @param {Array} pids         parent ids; entries which are not integers
 *                             (eg. NaN from a failed parseInt) are skipped
 * @param {function} cb        called with no arguments when there is nothing
 *                             to insert, otherwise passed to `db.run` as its
 *                             completion callback
 */
DocStore.prototype.setLineage = function( id, pids, cb ){
  if( !Array.isArray( pids ) || !pids.length ){ return cb(); }

  // values are written into the SQL text, so only genuine integers may be
  // used; anything else maps to null.
  var toInteger = function( value ){
    if( 'string' === typeof value && /^-?\d+$/.test( value ) ){
      value = parseInt( value, 10 );
    }
    return Number.isSafeInteger( value ) ? value : null;
  };

  var docId = toInteger( id );
  if( null === docId ){
    return cb( new Error( 'setLineage: invalid document id: ' + id ) );
  }

  // skip invalid parent ids rather than letting one bad value fail the
  // whole statement and lose the valid rows with it.
  var rows = pids
    .map( pid => toInteger( pid ) )
    .filter( pid => null !== pid )
    .map( pid => '(' + docId + ',' + pid + ')' );

  if( !rows.length ){ return cb(); }

  this.db.run(
    'INSERT INTO lineage ( id, pid ) VALUES ' + rows.join(',') + ';',
    cb
  );
};
121+
122+
/**
 * Insert one row per token for a document, using a single multi-row
 * INSERT statement.
 *
 * @param {number|string} id   document id; must be an integer (or a string
 *                             of digits), otherwise `cb` receives an Error
 * @param {Array} tokens       objects with `lang`, `tag` and `body`
 *                             properties; `body` is written to the `token`
 *                             column
 * @param {function} cb        called with no arguments when there is nothing
 *                             to insert, otherwise passed to `db.run` as its
 *                             completion callback
 */
DocStore.prototype.setTokens = function( id, tokens, cb ){
  if( !Array.isArray( tokens ) || !tokens.length ){ return cb(); }

  var docId = id;
  if( 'string' === typeof docId && /^-?\d+$/.test( docId ) ){
    docId = parseInt( docId, 10 );
  }
  if( !Number.isSafeInteger( docId ) ){
    return cb( new Error( 'setTokens: invalid document id: ' + id ) );
  }

  // render a value as a SQL string literal: single-quoted, with embedded
  // single quotes doubled. double-quoted text is an identifier in SQL and is
  // only treated as a string by SQLite as a legacy fallback; it also gave no
  // protection against quotes inside the value.
  var literal = function( value ){
    return '\'' + String( value ).replace( /'/g, '\'\'' ) + '\'';
  };

  var rows = tokens.map( token => {
    return '(' + [
      docId,
      literal( token.lang ),
      literal( token.tag ),
      literal( token.body )
    ].join(',') + ')';
  });

  this.db.run(
    'INSERT INTO tokens ( id, lang, tag, token ) VALUES ' + rows.join(',') + ';',
    cb
  );
};
131+
52132
DocStore.prototype.get = function( id, cb ){
53133
this.db.get(
54134
'SELECT json FROM docs WHERE id = $id LIMIT 1;',

package.json

+3-2
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131
},
3232
"homepage": "https://github.com/pelias/placeholder#readme",
3333
"dependencies": {
34+
"async": "^2.5.0",
3435
"express": "^4.15.2",
3536
"lodash": "^4.17.4",
3637
"lower-case": "^1.1.4",
@@ -40,7 +41,7 @@
4041
"require-dir": "^0.3.1",
4142
"sorted-intersect": "^0.1.4",
4243
"split2": "^2.1.1",
43-
"sqlite3": "^3.1.8",
44+
"sqlite3": "^3.1.13",
4445
"through2": "^2.0.3"
4546
},
4647
"devDependencies": {
@@ -50,7 +51,7 @@
5051
"precommit-hook": "^3.0.0",
5152
"semantic-release": "^8.2.0",
5253
"tap-spec": "^4.1.1",
53-
"tape": "^4.6.3"
54+
"tape": "^4.8.0"
5455
},
5556
"pre-commit": [
5657
"lint",

prototype/io.js

+17-22
Original file line number | Diff line number | Diff line change
@@ -4,44 +4,39 @@ var fs = require('fs'),
44
path = require('path');
55

66
var dataDir = process.env.PLACEHOLDER_DATA || path.join( __dirname, '../data/');
7-
var graphPath = path.join( dataDir, 'graph.json' );
87
var storePath = path.join( dataDir, 'store.sqlite3' );
98

9+
// WIP
10+
var SqlDatabase = require('../wip/SqlDatabase');
11+
1012
// load data from disk
1113
module.exports.load = function( opts ){
1214
this.store.open( storePath );
1315
if( opts && opts.reset === true ){
1416
this.store.reset();
15-
} else {
16-
var graph = require( graphPath );
17-
this.graph.nodes = graph.nodes;
18-
this.graph.edges = graph.edges;
1917
}
18+
19+
// sql
20+
var db = new SqlDatabase( this.store.db );
21+
22+
// both
23+
var tokenize = require('../wip/test_tokenize').tokenize.bind({ db: db });
24+
var query = require('../wip/query').query.bind( null, db, tokenize );
25+
26+
// WIP
27+
this.wip = {
28+
db: db,
29+
tokenize: tokenize,
30+
query: query
31+
};
2032
};
2133

2234
// save data to disk
2335
module.exports.save = function( path ){
24-
fs.writeFileSync( graphPath, JSON.stringify( this.graph ) );
2536
this.close();
2637
};
2738

2839
// gracefully close connections
2940
module.exports.close = function(){
3041
this.store.close();
3142
};
32-
33-
// deserialize data
34-
// module.exports.import = function( data ){
35-
// this.graph.nodes = data.nodes;
36-
// this.graph.edges = data.edges;
37-
// this.store.docs = data.docs;
38-
// };
39-
//
40-
// // serialize data
41-
// module.exports.export = function( path ){
42-
// return {
43-
// docs: this.store.docs,
44-
// nodes: this.graph.nodes,
45-
// edges: this.graph.edges
46-
// };
47-
// };

prototype/wof.js

+68-18
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,9 @@ function insertWofRecord( wof, next ){
3030
lat: wof['lbl:latitude'] || wof['geom:latitude'],
3131
lon: wof['lbl:longitude'] ||wof['geom:longitude']
3232
},
33-
names: {}
33+
names: {},
34+
tokens: [],
35+
parentIds: []
3436
};
3537

3638
// --- cast strings to numeric types ---
@@ -44,44 +46,41 @@ function insertWofRecord( wof, next ){
4446

4547
// --- tokens ---
4648

47-
// convenience function with $id bound as first argument
48-
var addToken = this.graph.addToken.bind( this.graph, id );
49-
5049
// disable adding tokens to the index for the 'empire' placetype.
5150
// this ensures empire records are not retrieved via search.
5251
if( 'empire' !== doc.placetype ){
5352

5453
// add 'wof:label'
55-
analysis.normalize( wof['wof:label'] ).forEach( addToken );
54+
doc.tokens.push({ lang: 'und', tag: 'label', body: wof['wof:label'] });
5655

5756
// add 'wof:name'
58-
analysis.normalize( wof['wof:name'] ).forEach( addToken );
57+
doc.tokens.push({ lang: 'und', tag: 'label', body: wof['wof:name'] });
5958

6059
// add 'wof:abbreviation'
61-
analysis.normalize( wof['wof:abbreviation'] ).forEach( addToken );
60+
doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:abbreviation'] });
6261

6362
// add 'ne:abbrev'
64-
// analysis.normalize( wof['ne:abbrev'] ).forEach( addToken );
63+
// doc.tokens.push({ lang: 'und', body: wof['ne:abbrev'] });
6564

6665
// fields specific to countries & dependencies
6766
if( 'country' === doc.placetype || 'dependency' === doc.placetype ) {
6867
if( wof['iso:country'] && wof['iso:country'] !== 'XX' ){
6968

7069
// add 'ne:iso_a2'
71-
analysis.normalize( wof['ne:iso_a2'] ).forEach( addToken );
70+
doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a2'] });
7271

7372
// add 'ne:iso_a3'
74-
analysis.normalize( wof['ne:iso_a3'] ).forEach( addToken );
73+
doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a3'] });
7574

7675
// add 'wof:country'
7776
// warning: eg. FR for 'French Guiana'
78-
// analysis.normalize( wof['wof:country'] ).forEach( addToken );
77+
// doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country'] });
7978

8079
// add 'iso:country'
81-
analysis.normalize( wof['iso:country'] ).forEach( addToken );
80+
doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['iso:country'] });
8281

8382
// add 'wof:country_alpha3'
84-
analysis.normalize( wof['wof:country_alpha3'] ).forEach( addToken );
83+
doc.tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country_alpha3'] });
8584
}
8685
}
8786

@@ -97,7 +96,11 @@ function insertWofRecord( wof, next ){
9796

9897
// index each alternative name
9998
for( var n in wof[ attr ] ){
100-
analysis.normalize( wof[ attr ][ n ] ).forEach( addToken );
99+
doc.tokens.push({
100+
lang: match[1],
101+
tag: match[2],
102+
body: wof[ attr ][ n ]
103+
});
101104
}
102105

103106
// doc - only store 'preferred' strings
@@ -121,7 +124,7 @@ function insertWofRecord( wof, next ){
121124
parentId = wof['wof:parent_id'];
122125
if( 'string' === typeof parentId ){ parentId = parseInt( parentId, 10 ); }
123126
if( !isNaN( parentId ) && parentId !== id && parentId > 0 ){
124-
this.graph.setEdge( parentId, id ); // is child of
127+
doc.parentIds.push( parentId ); // is child of
125128
}
126129
}
127130

@@ -131,15 +134,62 @@ function insertWofRecord( wof, next ){
131134
var pid = wof['wof:hierarchy'][h][i];
132135
if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); }
133136
if( pid === id || pid <= 0 || pid === parentId ){ continue; }
134-
// this.graph.setEdge( id, pid, 'p' ); // has parent
135-
this.graph.setEdge( pid, id ); // is child of
137+
// doc.parentIds.push( id, pid, 'p' ); // has parent
138+
doc.parentIds.push( pid ); // is child of
136139
}
137140
}
138141

142+
// ---- consume aggregates
143+
144+
// normalize tokens
145+
doc.tokens = doc.tokens.reduce(( res, token ) => {
146+
analysis.normalize( token.body ).forEach( norm => {
147+
res.push({ lang: token.lang, tag: token.tag, body: norm });
148+
});
149+
return res;
150+
}, []);
151+
152+
// deduplicate tokens
153+
var seen = {};
154+
doc.tokens = doc.tokens.filter( token => {
155+
return seen.hasOwnProperty( token.body ) ? false : ( seen[ token.body ] = true );
156+
});
157+
158+
// store tokens in graph
159+
// doc.tokens.forEach(token => {
160+
// this.graph.addToken( doc.id, token );
161+
// }, this);
162+
163+
// deduplicate parent ids
164+
doc.parentIds = doc.parentIds.filter(( pid, pos ) => {
165+
return doc.parentIds.indexOf( pid ) === pos;
166+
});
167+
168+
// store parent ids
169+
// doc.parentIds.forEach(pid => {
170+
// this.graph.setEdge( pid, doc.id );
171+
// }, this);
172+
139173
// --- store ---
140174
// add doc to store
141-
this.store.set( id, doc, next );
142175

176+
var tokens = doc.tokens;
177+
var parentIds = doc.parentIds;
178+
179+
// --- delete fields
180+
delete doc.tokens;
181+
delete doc.parentIds;
182+
183+
this.store.set( id, doc, ( err ) => {
184+
if( err ){ console.error( err ); }
185+
this.store.setTokens( id, tokens, ( err ) => {
186+
if( err ){ console.error( err ); }
187+
this.store.setLineage( id, parentIds, ( err ) => {
188+
if( err ){ console.error( err ); }
189+
next();
190+
});
191+
});
192+
});
143193
}
144194

145195
// check if value is a valid number

0 commit comments

Comments
 (0)