Skip to content

Commit c3455d6

Browse files
committed
Add Python generator
This generator was originally written by Yoshiki Shibukawa; I have fixed some compilation errors and rebased it against the current snowball code.
1 parent 9e0a834 commit c3455d6

File tree

14 files changed

+2282
-14
lines changed

14 files changed

+2282
-14
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,3 +3,5 @@
33
/snowball
44
/src_c
55
/stemwords
6+
/dist
7+
/python_out

.travis.yml

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,9 @@
1-
language: c
2-
compiler: gcc
1+
language: python
2+
python:
3+
- 2.7
4+
- 3.3
5+
- 3.4
36
before_script: git clone https://github.com/snowballstem/snowball-data ../snowball-data
4-
script: make check
7+
script:
8+
- make check
9+
- make check_python python=python

GNUmakefile

Lines changed: 74 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,10 @@
33
c_src_dir = src_c
44
java_src_main_dir = java/org/tartarus/snowball
55
java_src_dir = $(java_src_main_dir)/ext
6+
python ?= python3
7+
python_output_dir = python_out
8+
python_runtime_dir = snowballstemmer
9+
python_sample_dir = sample
610

711
libstemmer_algorithms = danish dutch english finnish french german hungarian \
812
italian \
@@ -23,13 +27,16 @@ COMPILER_SOURCES = compiler/space.c \
2327
compiler/analyser.c \
2428
compiler/generator.c \
2529
compiler/driver.c \
26-
compiler/generator_java.c
30+
compiler/generator_java.c \
31+
compiler/generator_python.c
32+
2733
COMPILER_HEADERS = compiler/header.h \
2834
compiler/syswords.h \
2935
compiler/syswords2.h
3036

3137
RUNTIME_SOURCES = runtime/api.c \
3238
runtime/utilities.c
39+
3340
RUNTIME_HEADERS = runtime/api.h \
3441
runtime/header.h
3542

@@ -38,13 +45,24 @@ JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
3845
java/org/tartarus/snowball/SnowballStemmer.java \
3946
java/org/tartarus/snowball/TestApp.java
4047

48+
PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \
49+
python/snowballstemmer/among.py
50+
51+
PYTHON_SAMPLE_SOURCES = python/testapp.py \
52+
python/stemwords.py
53+
54+
PYTHON_PACKAGE_FILES = python/MANIFEST.in \
55+
python/setup.py
56+
4157
LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
4258
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
4359
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
4460
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
4561

4662
STEMWORDS_SOURCES = examples/stemwords.c
4763

64+
PYTHON_STEMWORDS_SOURCE = python/stemwords.py
65+
4866
ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
4967
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
5068
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
@@ -57,6 +75,8 @@ C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
5775
C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
5876
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
5977
JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
78+
PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \
79+
$(python_output_dir)/__init__.py
6080

6181
COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
6282
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
@@ -83,10 +103,12 @@ clean:
83103
$(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
84104
$(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
85105
$(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
106+
$(PYTHON_SOURCES) \
86107
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
87108
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
88109
rm -rf dist
89110
rmdir $(c_src_dir) || true
111+
rmdir $(python_output_dir) || true
90112

91113
snowball: $(COMPILER_OBJECTS)
92114
$(CC) -o $@ $^
@@ -154,12 +176,23 @@ $(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
154176
echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
155177
./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
156178

179+
$(python_output_dir)/%_stemmer.py: algorithms/%/stem_Unicode.sbl snowball
180+
@mkdir -p $(python_output_dir)
181+
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
182+
o="$(python_output_dir)/$${l}_stemmer"; \
183+
echo "./snowball $< -py -o $${o} -p \"SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n `$(python) -c "print('$${l}'.title())"`Stemmer"; \
184+
./snowball $< -py -o $${o} -p "BaseStemmer" -eprefix $${l}_ -r ../runtime -n `$(python) -c "print('$${l}'.title())"`Stemmer
185+
186+
$(python_output_dir)/__init__.py:
187+
@mkdir -p $(python_output_dir)
188+
$(python) python/create_init.py $(python_output_dir)
189+
157190
splint: snowball.splint
158191
snowball.splint: $(COMPILER_SOURCES)
159192
splint $^ >$@ -weak
160193

161194
# Make a full source distribution
162-
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
195+
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java dist_libstemmer_python
163196

164197
# Make a distribution of all the sources involved in snowball
165198
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
@@ -250,6 +283,23 @@ dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
250283
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
251284
rm -rf $${dest}
252285

286+
dist_libstemmer_python: $(PYTHON_SOURCES)
287+
destname=snowballstemmer; \
288+
dest=dist/$${destname}; \
289+
rm -rf $${dest} && \
290+
rm -f $${dest}.tgz && \
291+
echo "a1" && \
292+
mkdir -p $${dest} && \
293+
mkdir -p $${dest}/src/$(python_runtime_dir) && \
294+
mkdir -p $${dest}/src/$(python_sample_dir) && \
295+
cp doc/libstemmer_python_README $${dest}/README.rst && \
296+
cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \
297+
cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \
298+
cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \
299+
cp -a $(PYTHON_PACKAGE_FILES) $${dest} && \
300+
(cd $${dest} && $(python) setup.py sdist && cp dist/*.tar.gz ..) && \
301+
rm -rf $${dest}
302+
253303
check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
254304

255305
check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
@@ -277,24 +327,40 @@ check_utf8_%: $(STEMMING_DATA)/% stemwords
277327

278328
check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
279329
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
280-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
330+
@iconv -fUTF8 -tISO8859-1 $</voc.txt | \
281331
./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
282-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
332+
@iconv -fUTF8 -tISO8859-1 $</output.txt | \
283333
diff -u - tmp.txt
284334
@rm tmp.txt
285335

286336
check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
287337
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
288-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
338+
@iconv -fUTF8 -tISO8859-2 $</voc.txt | \
289339
./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
290-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
340+
@iconv -fUTF8 -tISO8859-2 $</output.txt | \
291341
diff -u - tmp.txt
292342
@rm tmp.txt
293343

294344
check_koi8r_%: $(STEMMING_DATA)/% stemwords
295345
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
296-
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
346+
@iconv -fUTF8 -tKOI8R $</voc.txt | \
297347
./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
298-
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
348+
@iconv -fUTF8 -tKOI8R $</output.txt | \
299349
diff -u - tmp.txt
300350
@rm tmp.txt
351+
352+
check_python: check_python_stemwords $(libstemmer_algorithms:%=check_python_%)
353+
354+
check_python_%: $(STEMMING_DATA)/%
355+
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
356+
(cd python_check && \
357+
$(python) stemwords.py -c utf8 -l `echo $<|sed 's!.*/!!'` -i ../$</voc.txt -o tmp.txt && \
358+
diff -u ../$</output.txt tmp.txt && \
359+
rm tmp.txt)
360+
361+
check_python_stemwords: $(PYTHON_STEMWORDS_SOURCE) $(PYTHON_SOURCES)
362+
mkdir -p python_check && \
363+
mkdir -p python_check/snowballstemmer && \
364+
cp -a $(PYTHON_RUNTIME_SOURCES) python_check/snowballstemmer && \
365+
cp -a $(PYTHON_SOURCES) python_check/snowballstemmer && \
366+
cp -a $(PYTHON_STEMWORDS_SOURCE) python_check/

compiler/driver.c

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,9 @@ static void print_arglist(void) {
2222
" [-j[ava]]\n"
2323
#endif
2424
" [-c++]\n"
25+
#ifndef DISABLE_PYTHON
26+
" [-py[thon]]\n"
27+
#endif
2528
" [-w[idechars]]\n"
2629
" [-u[tf8]]\n"
2730
" [-n[ame] class name]\n"
@@ -105,6 +108,13 @@ static void read_options(struct options * o, int argc, char * argv[]) {
105108
o->make_lang = LANG_CPLUSPLUS;
106109
continue;
107110
}
111+
#ifndef DISABLE_PYTHON
112+
if (eq(s, "-py") || eq(s, "-python")) {
113+
o->make_lang = LANG_PYTHON;
114+
o->widechars = true;
115+
continue;
116+
}
117+
#endif
108118
if (eq(s, "-w") || eq(s, "-widechars")) {
109119
o->widechars = true;
110120
o->utf8 = false;
@@ -237,6 +247,18 @@ extern int main(int argc, char * argv[]) {
237247
close_generator_java(g);
238248
fclose(o->output_java);
239249
}
250+
#endif
251+
#ifndef DISABLE_PYTHON
252+
if (o->make_lang == LANG_PYTHON) {
253+
symbol * b = add_s_to_b(0, s);
254+
b = add_s_to_b(b, ".py");
255+
o->output_python = get_output(b);
256+
lose_b(b);
257+
g = create_generator_python(a, o);
258+
generate_program_python(g);
259+
close_generator_python(g);
260+
fclose(o->output_python);
261+
}
240262
#endif
241263
}
242264
close_analyser(a);

0 commit comments

Comments
 (0)