Skip to content

Commit 0aeef16

Browse files
committed
Add Python generator
This generator was originally written by Yoshiki Shibukawa; I have fixed some compilation errors and rebased it against the current snowball code.
1 parent 30e10d9 commit 0aeef16

File tree

14 files changed

+2277
-5
lines changed

14 files changed

+2277
-5
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@
33
/snowball
44
/src_c
55
/stemwords
6+
/dist
7+
/python_out

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ before_install:
1313
- git clone --depth=1 -b "$TRAVIS_BRANCH" https://github.com:"${TRAVIS_REPO_SLUG%%/*}"/snowball-data.git || git clone --depth=1 -b "$TRAVIS_BRANCH" https://github.com/snowballstem/snowball-data.git || git clone --depth=1 https://github.com/snowballstem/snowball-data.git
1414
script:
1515
- make && make check STEMMING_DATA=snowball-data
16+
- make check_python python=python2.7

GNUmakefile

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
c_src_dir = src_c
44
java_src_main_dir = java/org/tartarus/snowball
55
java_src_dir = $(java_src_main_dir)/ext
6+
# Interpreter used by the Python-related recipes; it can be overridden on the
# command line, e.g. `make check_python python=python2.7`.
python ?= python3
# Directory that receives the generated Python stemmer modules.
python_output_dir := python_out
# Sub-directories created under src/ of the Python distribution tree.
python_runtime_dir := snowballstemmer
python_sample_dir := sample
610

711
libstemmer_algorithms = danish dutch english finnish french german hungarian \
812
italian \
@@ -23,13 +27,16 @@ COMPILER_SOURCES = compiler/space.c \
2327
compiler/analyser.c \
2428
compiler/generator.c \
2529
compiler/driver.c \
26-
compiler/generator_java.c
30+
compiler/generator_java.c \
31+
compiler/generator_python.c
32+
2733
COMPILER_HEADERS = compiler/header.h \
2834
compiler/syswords.h \
2935
compiler/syswords2.h
3036

3137
RUNTIME_SOURCES = runtime/api.c \
3238
runtime/utilities.c
39+
3340
RUNTIME_HEADERS = runtime/api.h \
3441
runtime/header.h
3542

@@ -38,13 +45,24 @@ JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
3845
java/org/tartarus/snowball/SnowballStemmer.java \
3946
java/org/tartarus/snowball/TestApp.java
4047

48+
# Python runtime modules that are copied next to the generated stemmers.
PYTHON_RUNTIME_SOURCES = $(addprefix python/snowballstemmer/,basestemmer.py among.py)

# Sample programs shipped in the Python distribution.
PYTHON_SAMPLE_SOURCES = $(addprefix python/,testapp.py stemwords.py)

# Packaging files copied to the top of the Python distribution tree.
PYTHON_PACKAGE_FILES = $(addprefix python/,MANIFEST.in setup.py)
56+
4157
LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
4258
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
4359
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
4460
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
4561

4662
STEMWORDS_SOURCES = examples/stemwords.c
4763

64+
# Python stemwords script; check_python_stemwords copies it into python_check/
# and the check_python_% tests run it from there.
PYTHON_STEMWORDS_SOURCE = python/stemwords.py
65+
4866
ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
4967
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
5068
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
@@ -57,6 +75,8 @@ C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
5775
C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
5876
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
5977
JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
78+
# Generated Python files: one <language>_stemmer.py module per libstemmer
# algorithm, plus the package's __init__.py.
PYTHON_SOURCES = $(patsubst %,$(python_output_dir)/%_stemmer.py,$(libstemmer_algorithms)) \
                 $(python_output_dir)/__init__.py
6080

6181
COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
6282
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
@@ -83,10 +103,12 @@ clean:
83103
$(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
84104
$(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
85105
$(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
106+
$(PYTHON_SOURCES) \
86107
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
87108
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
88109
rm -rf dist
89110
rmdir $(c_src_dir) || true
111+
rmdir $(python_output_dir) || true
90112

91113
snowball: $(COMPILER_OBJECTS)
92114
$(CC) -o $@ $^
@@ -154,12 +176,23 @@ $(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
154176
echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
155177
./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
156178

179+
# Generate the Python stemmer module for one algorithm from its Unicode
# Snowball source.
#   l - language name, taken from the algorithm's directory name
#   o - output path without the .py extension
#   n - class name: the title-cased language name followed by "Stemmer"
# The class name is computed once so that the command echoed to the log is
# exactly the command that is run (the echo used to show a different -p value
# than the one actually passed), and a failing $(python) now stops the recipe
# instead of silently producing a class called just "Stemmer".
$(python_output_dir)/%_stemmer.py: algorithms/%/stem_Unicode.sbl snowball
	@mkdir -p $(python_output_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
	o="$(python_output_dir)/$${l}_stemmer"; \
	n=`$(python) -c "print('$${l}'.title())"`Stemmer && \
	echo "./snowball $< -py -o $${o} -p \"BaseStemmer\" -eprefix $${l}_ -r ../runtime -n $${n}" && \
	./snowball $< -py -o $${o} -p "BaseStemmer" -eprefix $${l}_ -r ../runtime -n $${n}
185+
186+
# Generate the package's __init__.py with python/create_init.py.
# The generator script is a prerequisite so that editing it regenerates the
# file.  The stemmer modules are prerequisites as well so that, under
# `make -j`, the script only runs once they exist.
# NOTE(review): this presumes create_init.py inspects the contents of
# $(python_output_dir) - confirm against the script.
$(python_output_dir)/__init__.py: python/create_init.py \
            $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py)
	@mkdir -p $(python_output_dir)
	$(python) python/create_init.py $(python_output_dir)
189+
157190
# `make splint` runs splint in -weak mode over the compiler sources and
# stores the report in snowball.splint.
splint: snowball.splint

snowball.splint: $(COMPILER_SOURCES)
	splint $^ > $@ -weak
160193

161194
# Make a full source distribution
162-
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
195+
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java dist_libstemmer_python
163196

164197
# Make a distribution of all the sources involved in snowball
165198
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
@@ -250,6 +283,23 @@ dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
250283
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
251284
rm -rf $${dest}
252285

286+
# Build the Python source distribution.
# A staging tree is assembled in dist/snowballstemmer (README, generated
# stemmers, runtime modules, samples and packaging files), `setup.py sdist`
# is run inside it, the resulting tarball is copied to dist/ and the staging
# tree is removed again.
# Every file the recipe copies is listed as a prerequisite, so a missing
# input is reported by make before any staging work starts.
dist_libstemmer_python: $(PYTHON_SOURCES) $(PYTHON_RUNTIME_SOURCES) \
            $(PYTHON_SAMPLE_SOURCES) $(PYTHON_PACKAGE_FILES) \
            doc/libstemmer_python_README
	destname=snowballstemmer; \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}.tgz && \
	mkdir -p $${dest} && \
	mkdir -p $${dest}/src/$(python_runtime_dir) && \
	mkdir -p $${dest}/src/$(python_sample_dir) && \
	cp doc/libstemmer_python_README $${dest}/README.rst && \
	cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \
	cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \
	cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \
	cp -a $(PYTHON_PACKAGE_FILES) $${dest} && \
	(cd $${dest} && $(python) setup.py sdist && cp dist/*.tar.gz ..) && \
	rm -rf $${dest}
302+
253303
check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
254304

255305
check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
@@ -293,3 +343,19 @@ check_koi8r_%: $(STEMMING_DATA)/% stemwords
293343
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
294344
diff -u - tmp.txt
295345
@rm tmp.txt
346+
347+
# Run the Python stemmer tests: populate python_check/ once, then check every
# libstemmer algorithm.  Both names are commands rather than files, so they
# are declared phony; a stray file called check_python can then never make
# the target look up to date.  (The check_python_<language> targets are left
# undeclared on purpose: phony targets are not matched against pattern rules.)
.PHONY: check_python check_python_stemwords
check_python: check_python_stemwords $(libstemmer_algorithms:%=check_python_%)
348+
349+
# Check one Python stemmer against the reference vocabulary for language $*:
# stem $(STEMMING_DATA)/$*/voc.txt and diff the result with output.txt.
#  - check_python_stemwords is a prerequisite because it populates
#    python_check/, so `make check_python_<language>` works on its own and
#    `make -j check_python` cannot start a check before the files are copied.
#  - The scratch file name is derived from the language so that parallel
#    checks do not overwrite each other's output.
#  - $(abspath ...) keeps the data paths valid after the `cd`, whether
#    STEMMING_DATA is relative or absolute.
check_python_%: $(STEMMING_DATA)/% check_python_stemwords
	@echo "Checking output of $* stemmer with UTF-8"
	(cd python_check && \
	    $(python) stemwords.py -c utf8 -l $* -i $(abspath $<)/voc.txt -o tmp_$*.txt && \
	    diff -u $(abspath $<)/output.txt tmp_$*.txt && \
	    rm tmp_$*.txt)
355+
356+
# Populate python_check/ for the Python tests: the runtime modules and the
# generated stemmers go into the snowballstemmer package directory, and the
# stemwords script sits next to it.
check_python_stemwords: $(PYTHON_STEMWORDS_SOURCE) $(PYTHON_SOURCES)
	mkdir -p python_check/snowballstemmer && \
	    cp -a $(PYTHON_RUNTIME_SOURCES) python_check/snowballstemmer && \
	    cp -a $(PYTHON_SOURCES) python_check/snowballstemmer && \
	    cp -a $(PYTHON_STEMWORDS_SOURCE) python_check/

compiler/driver.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ static void print_arglist(void) {
2222
" [-j[ava]]\n"
2323
#endif
2424
" [-c++]\n"
25+
#ifndef DISABLE_PYTHON
26+
" [-py[thon]]\n"
27+
#endif
2528
" [-w[idechars]]\n"
2629
" [-u[tf8]]\n"
2730
" [-n[ame] class name]\n"
@@ -105,6 +108,13 @@ static void read_options(struct options * o, int argc, char * argv[]) {
105108
o->make_lang = LANG_CPLUSPLUS;
106109
continue;
107110
}
111+
#ifndef DISABLE_PYTHON
112+
if (eq(s, "-py") || eq(s, "-python")) {
113+
o->make_lang = LANG_PYTHON;
114+
o->widechars = true;
115+
continue;
116+
}
117+
#endif
108118
if (eq(s, "-w") || eq(s, "-widechars")) {
109119
o->widechars = true;
110120
o->utf8 = false;
@@ -237,6 +247,18 @@ extern int main(int argc, char * argv[]) {
237247
close_generator_java(g);
238248
fclose(o->output_java);
239249
}
250+
#endif
251+
#ifndef DISABLE_PYTHON
252+
if (o->make_lang == LANG_PYTHON) {
253+
symbol * b = add_s_to_b(0, s);
254+
b = add_s_to_b(b, ".py");
255+
o->output_python = get_output(b);
256+
lose_b(b);
257+
g = create_generator_python(a, o);
258+
generate_program_python(g);
259+
close_generator_python(g);
260+
fclose(o->output_python);
261+
}
240262
#endif
241263
}
242264
close_analyser(a);

0 commit comments

Comments
 (0)