Skip to content

Commit fb158c6

Browse files
committed
Add Python and JSX generators
These generators were originally written by Yoshiki Shibukawa; I have fixed some compilation errors and rebased them against the current snowball code.
1 parent 1354a7d commit fb158c6

20 files changed

+4755
-11
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@
33
/snowball
44
/src_c
55
/stemwords
6+
/dist
7+
/jsx_out
8+
/python_out

GNUmakefile

Lines changed: 127 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
c_src_dir = src_c
44
java_src_main_dir = java/org/tartarus/snowball
55
java_src_dir = $(java_src_main_dir)/ext
6+
python_output_dir = python_out
7+
python_runtime_dir = snowballstemmer
8+
python_sample_dir = sample
9+
jsx_output_dir = jsx_out
10+
jsx_runtime_src_dir = jsx
11+
jsx_runtime_dir = lib
12+
jsx_sample_dir = sample
613

714
libstemmer_algorithms = danish dutch english finnish french german hungarian \
815
italian \
@@ -23,13 +30,17 @@ COMPILER_SOURCES = compiler/space.c \
2330
compiler/analyser.c \
2431
compiler/generator.c \
2532
compiler/driver.c \
26-
compiler/generator_java.c
33+
compiler/generator_java.c \
34+
compiler/generator_python.c \
35+
compiler/generator_jsx.c
36+
2737
COMPILER_HEADERS = compiler/header.h \
2838
compiler/syswords.h \
2939
compiler/syswords2.h
3040

3141
RUNTIME_SOURCES = runtime/api.c \
3242
runtime/utilities.c
43+
3344
RUNTIME_HEADERS = runtime/api.h \
3445
runtime/header.h
3546

@@ -38,13 +49,33 @@ JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
3849
java/org/tartarus/snowball/SnowballStemmer.java \
3950
java/org/tartarus/snowball/TestApp.java
4051

52+
JSX_RUNTIME_SOURCES = jsx/among.jsx \
53+
jsx/base-stemmer.jsx \
54+
jsx/stemmer.jsx
55+
56+
JSX_SAMPLE_SOURCES = jsx/testapp.jsx \
57+
jsx/stemwords.jsx
58+
59+
PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \
60+
python/snowballstemmer/among.py
61+
62+
PYTHON_SAMPLE_SOURCES = python/testapp.py \
63+
python/stemwords.py
64+
65+
PYTHON_PACKAGE_FILES = python/MANIFEST.in \
66+
python/setup.py
67+
4168
LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
4269
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
4370
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
4471
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
4572

4673
STEMWORDS_SOURCES = examples/stemwords.c
4774

75+
JSX_STEMWORDS_SOURCE = jsx/stemwords.jsx
76+
77+
PYTHON_STEMWORDS_SOURCE = python/stemwords.py
78+
4879
ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
4980
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
5081
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
@@ -57,6 +88,9 @@ C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
5788
C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
5889
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
5990
JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
91+
PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \
92+
$(python_output_dir)/__init__.py
93+
JSX_SOURCES = $(libstemmer_algorithms:%=$(jsx_output_dir)/%-stemmer.jsx)
6094

6195
COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
6296
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
@@ -83,10 +117,14 @@ clean:
83117
$(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
84118
$(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
85119
$(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
120+
$(PYTHON_SOURCES) \
121+
$(JSX_SOURCES) jsx_stemwords \
86122
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
87123
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
88124
rm -rf dist
89125
rmdir $(c_src_dir) || true
126+
rmdir $(python_output_dir) || true
127+
rmdir $(jsx_output_dir) || true
90128

91129
snowball: $(COMPILER_OBJECTS)
92130
$(CC) -o $@ $^
@@ -113,6 +151,9 @@ libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
113151
stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
114152
$(CC) -o $@ $^
115153

154+
jsx_stemwords: $(JSX_STEMWORDS_SOURCE) $(JSX_SOURCES)
155+
jsx --executable node --output $@ --add-search-path $(jsx_output_dir) --add-search-path $(jsx_runtime_src_dir) $(JSX_STEMWORDS_SOURCE)
156+
116157
algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
117158
cp $^ $@
118159

@@ -154,12 +195,30 @@ $(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
154195
echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
155196
./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
156197

198+
$(python_output_dir)/%_stemmer.py: algorithms/%/stem_Unicode.sbl snowball
199+
@mkdir -p $(python_output_dir)
200+
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
201+
o="$(python_output_dir)/$${l}_stemmer"; \
202+
echo "./snowball $< -py -o $${o} -p \"BaseStemmer\" -eprefix $${l}_ -r ../runtime -n `python -c "print('$${l}'.title())"`Stemmer"; \
203+
./snowball $< -py -o $${o} -p "BaseStemmer" -eprefix $${l}_ -r ../runtime -n `python -c "print('$${l}'.title())"`Stemmer
204+
205+
$(python_output_dir)/__init__.py:
206+
@mkdir -p $(python_output_dir)
207+
python python/create_init.py $(python_output_dir)
208+
209+
$(jsx_output_dir)/%-stemmer.jsx: algorithms/%/stem_Unicode.sbl snowball
210+
@mkdir -p $(jsx_output_dir)
211+
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
212+
o="$(jsx_output_dir)/$${l}-stemmer"; \
213+
echo "./snowball $< -jsx -o $${o} -p \"BaseStemmer\" -eprefix $${l}_ -r ../runtime -n `python -c "print('$${l}'.title())"`Stemmer"; \
214+
./snowball $< -jsx -o $${o} -p "BaseStemmer" -eprefix $${l}_ -r ../runtime -n `python -c "print('$${l}'.title())"`Stemmer
215+
157216
splint: snowball.splint
158217
snowball.splint: $(COMPILER_SOURCES)
159218
splint $^ >$@ -weak
160219

161220
# Make a full source distribution
162-
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
221+
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java dist_libstemmer_jsx dist_libstemmer_python
163222

164223
# Make a distribution of all the sources involved in snowball
165224
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
@@ -250,6 +309,42 @@ dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
250309
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
251310
rm -rf $${dest}
252311

312+
dist_libstemmer_python: $(PYTHON_SOURCES)
313+
destname=snowballstemmer; \
314+
dest=dist/$${destname}; \
315+
rm -rf $${dest} && \
316+
rm -f $${dest}.tgz && \
317+
echo "a1" && \
318+
mkdir -p $${dest} && \
319+
mkdir -p $${dest}/src/$(python_runtime_dir) && \
320+
mkdir -p $${dest}/src/$(python_sample_dir) && \
321+
cp doc/libstemmer_python_README $${dest}/README.rst && \
322+
cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \
323+
cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \
324+
cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \
325+
cp -a $(PYTHON_PACKAGE_FILES) $${dest} && \
326+
(cd $${dest} && python setup.py sdist && cp dist/*.tar.gz ..) && \
327+
rm -rf $${dest}
328+
329+
dist_libstemmer_jsx: $(JSX_SOURCES)
330+
destname=jsxstemmer; \
331+
dest=dist/$${destname}; \
332+
rm -rf $${dest} && \
333+
rm -f $${dest}.tgz && \
334+
mkdir -p $${dest} && \
335+
mkdir -p $${dest}/$(jsx_runtime_dir) && \
336+
mkdir -p $${dest}/$(jsx_sample_dir) && \
337+
cp -a doc/libstemmer_jsx_README $${dest}/README && \
338+
cp -a $(JSX_RUNTIME_SOURCES) $${dest}/$(jsx_runtime_dir) && \
339+
cp -a $(JSX_SAMPLE_SOURCES) $${dest}/$(jsx_sample_dir) && \
340+
cp -a $(JSX_SOURCES) $${dest}/$(jsx_runtime_dir) && \
341+
(cd $${dest} && \
342+
echo "README" >> MANIFEST && \
343+
ls $(jsx_runtime_dir)/*.jsx >> MANIFEST && \
344+
ls $(jsx_sample_dir)/*.jsx >> MANIFEST) && \
345+
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
346+
rm -rf $${dest}
347+
253348
check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
254349

255350
check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
@@ -277,24 +372,48 @@ check_utf8_%: $(STEMMING_DATA)/% stemwords
277372

278373
check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
279374
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
280-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
375+
@iconv -fUTF8 -tISO8859-1 $</voc.txt | \
281376
./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
282-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
377+
@iconv -fUTF8 -tISO8859-1 $</output.txt | \
283378
diff -u - tmp.txt
284379
@rm tmp.txt
285380

286381
check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
287382
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
288-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
383+
@iconv -fUTF8 -tISO8859-2 $</voc.txt | \
289384
./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
290-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
385+
@iconv -fUTF8 -tISO8859-2 $</output.txt | \
291386
diff -u - tmp.txt
292387
@rm tmp.txt
293388

294389
check_koi8r_%: $(STEMMING_DATA)/% stemwords
295390
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
296-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
391+
@iconv -fUTF8 -tKOI8R $</voc.txt | \
297392
./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
298-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
393+
@iconv -fUTF8 -tKOI8R $</output.txt | \
299394
diff -u - tmp.txt
300395
@rm tmp.txt
396+
397+
check_jsx: $(libstemmer_algorithms:%=check_jsx_%)
398+
399+
check_jsx_%: $(STEMMING_DATA)/% jsx_stemwords
400+
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
401+
@./jsx_stemwords -c utf8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
402+
@diff -u $</output.txt tmp.txt
403+
@rm tmp.txt
404+
405+
check_python: check_python_stemwords $(libstemmer_algorithms:%=check_python_%)
406+
407+
check_python_%: $(STEMMING_DATA)/% check_python_stemwords
408+
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
409+
(cd python_check && \
410+
python stemwords.py -c utf8 -l `echo $<|sed 's!.*/!!'` -i ../$</voc.txt -o tmp.txt && \
411+
diff -u ../$</output.txt tmp.txt && \
412+
rm tmp.txt)
413+
414+
check_python_stemwords: $(PYTHON_STEMWORDS_SOURCE) $(PYTHON_SOURCES)
415+
mkdir -p python_check && \
416+
mkdir -p python_check/snowballstemmer && \
417+
cp -a $(PYTHON_RUNTIME_SOURCES) python_check/snowballstemmer && \
418+
cp -a $(PYTHON_SOURCES) python_check/snowballstemmer && \
419+
cp -a $(PYTHON_STEMWORDS_SOURCE) python_check/

compiler/driver.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ static void print_arglist(void) {
2222
" [-j[ava]]\n"
2323
#endif
2424
" [-c++]\n"
25+
#ifndef DISABLE_PYTHON
26+
" [-py[thon]]\n"
27+
#endif
28+
#ifndef DISABLE_JSX
29+
" [-jsx]\n"
30+
#endif
2531
" [-w[idechars]]\n"
2632
" [-u[tf8]]\n"
2733
" [-n[ame] class name]\n"
@@ -94,6 +100,13 @@ static void read_options(struct options * o, int argc, char * argv[]) {
94100
o->name = argv[i++];
95101
continue;
96102
}
103+
#ifndef DISABLE_JSX
104+
if (eq(s, "-jsx")) {
105+
o->make_lang = LANG_JSX;
106+
o->widechars = true;
107+
continue;
108+
}
109+
#endif
97110
#ifndef DISABLE_JAVA
98111
if (eq(s, "-j") || eq(s, "-java")) {
99112
o->make_lang = LANG_JAVA;
@@ -105,6 +118,13 @@ static void read_options(struct options * o, int argc, char * argv[]) {
105118
o->make_lang = LANG_CPLUSPLUS;
106119
continue;
107120
}
121+
#ifndef DISABLE_PYTHON
122+
if (eq(s, "-py") || eq(s, "-python")) {
123+
o->make_lang = LANG_PYTHON;
124+
o->widechars = true;
125+
continue;
126+
}
127+
#endif
108128
if (eq(s, "-w") || eq(s, "-widechars")) {
109129
o->widechars = true;
110130
o->utf8 = false;
@@ -237,6 +257,30 @@ extern int main(int argc, char * argv[]) {
237257
close_generator_java(g);
238258
fclose(o->output_java);
239259
}
260+
#endif
261+
#ifndef DISABLE_PYTHON
262+
if (o->make_lang == LANG_PYTHON) {
263+
symbol * b = add_s_to_b(0, s);
264+
b = add_s_to_b(b, ".py");
265+
o->output_python = get_output(b);
266+
lose_b(b);
267+
g = create_generator_python(a, o);
268+
generate_program_python(g);
269+
close_generator_python(g);
270+
fclose(o->output_python);
271+
}
272+
#endif
273+
#ifndef DISABLE_JSX
274+
if (o->make_lang == LANG_JSX) {
275+
symbol * b = add_s_to_b(0, s);
276+
b = add_s_to_b(b, ".jsx");
277+
o->output_jsx = get_output(b);
278+
lose_b(b);
279+
g = create_generator_jsx(a, o);
280+
generate_program_jsx(g);
281+
close_generator_jsx(g);
282+
fclose(o->output_jsx);
283+
}
240284
#endif
241285
}
242286
close_analyser(a);

0 commit comments

Comments
 (0)