diff --git a/.gitignore b/.gitignore index 2147da841..8600b76de 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ /snowball /src_c /stemwords +/dist +/python_out diff --git a/.travis.yml b/.travis.yml index 318dcd15c..cedf08118 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,3 +13,4 @@ before_install: - git clone --depth=1 -b "$TRAVIS_BRANCH" https://github.com:"${TRAVIS_REPO_SLUG%%/*}"/snowball-data.git || git clone --depth=1 -b "$TRAVIS_BRANCH" https://github.com/snowballstem/snowball-data.git || git clone --depth=1 https://github.com/snowballstem/snowball-data.git script: - make && make check STEMMING_DATA=snowball-data + - make check_python python=python2.7 STEMMING_DATA=snowball-data diff --git a/GNUmakefile b/GNUmakefile index 8086a0fa7..da24140d5 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -3,6 +3,10 @@ c_src_dir = src_c java_src_main_dir = java/org/tartarus/snowball java_src_dir = $(java_src_main_dir)/ext +python ?= python3 +python_output_dir = python_out +python_runtime_dir = snowballstemmer +python_sample_dir = sample libstemmer_algorithms = danish dutch english finnish french german hungarian \ italian \ @@ -23,7 +27,9 @@ COMPILER_SOURCES = compiler/space.c \ compiler/analyser.c \ compiler/generator.c \ compiler/driver.c \ - compiler/generator_java.c + compiler/generator_java.c \ + compiler/generator_python.c + COMPILER_HEADERS = compiler/header.h \ compiler/syswords.h \ compiler/syswords2.h @@ -38,6 +44,15 @@ JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ java/org/tartarus/snowball/SnowballStemmer.java \ java/org/tartarus/snowball/TestApp.java +PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \ + python/snowballstemmer/among.py + +PYTHON_SAMPLE_SOURCES = python/testapp.py \ + python/stemwords.py + +PYTHON_PACKAGE_FILES = python/MANIFEST.in \ + python/setup.py + LIBSTEMMER_SOURCES = libstemmer/libstemmer.c LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c LIBSTEMMER_HEADERS = include/libstemmer.h 
libstemmer/modules.h libstemmer/modules_utf8.h @@ -45,6 +60,8 @@ LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer STEMWORDS_SOURCES = examples/stemwords.c +PYTHON_STEMWORDS_SOURCE = python/stemwords.py + ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl) C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ @@ -57,6 +74,8 @@ C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) +PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \ + $(python_output_dir)/__init__.py COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) @@ -83,10 +102,12 @@ clean: $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ + $(PYTHON_SOURCES) \ libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c rm -rf dist rmdir $(c_src_dir) || true + rmdir $(python_output_dir) || true snowball: $(COMPILER_OBJECTS) $(CC) -o $@ $^ @@ -154,12 +175,23 @@ $(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \ ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer +$(python_output_dir)/%_stemmer.py: algorithms/%/stem_Unicode.sbl snowball + @mkdir -p $(python_output_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ + o="$(python_output_dir)/$${l}_stemmer"; \ + echo "./snowball $< -py -o $${o} -p \"SnowballStemmer\" -eprefix $${l}_ 
-r ../runtime -n `$(python) -c "print('$${l}'.title())"`Stemmer"; \ + ./snowball $< -py -o $${o} -p "BaseStemmer" -eprefix $${l}_ -r ../runtime -n `$(python) -c "print('$${l}'.title())"`Stemmer + +$(python_output_dir)/__init__.py: + @mkdir -p $(python_output_dir) + $(python) python/create_init.py $(python_output_dir) + splint: snowball.splint snowball.splint: $(COMPILER_SOURCES) splint $^ >$@ -weak # Make a full source distribution -dist: dist_snowball dist_libstemmer_c dist_libstemmer_java +dist: dist_snowball dist_libstemmer_c dist_libstemmer_java dist_libstemmer_python # Make a distribution of all the sources involved in snowball dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \ @@ -250,6 +282,23 @@ dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ (cd dist && tar zcf $${destname}.tgz $${destname}) && \ rm -rf $${dest} +dist_libstemmer_python: $(PYTHON_SOURCES) + destname=snowballstemmer; \ + dest=dist/$${destname}; \ + rm -rf $${dest} && \ + rm -f $${dest}.tgz && \ + echo "a1" && \ + mkdir -p $${dest} && \ + mkdir -p $${dest}/src/$(python_runtime_dir) && \ + mkdir -p $${dest}/src/$(python_sample_dir) && \ + cp doc/libstemmer_python_README $${dest}/README.rst && \ + cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \ + cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \ + cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \ + cp -a $(PYTHON_PACKAGE_FILES) $${dest} && \ + (cd $${dest} && $(python) setup.py sdist && cp dist/*.tar.gz ..) 
&& \ + rm -rf $${dest} + check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r check_utf8: $(libstemmer_algorithms:%=check_utf8_%) @@ -293,3 +342,19 @@ check_koi8r_%: $(STEMMING_DATA)/% stemwords @python -c 'print(open("$make_lang = LANG_CPLUSPLUS; continue; } +#ifndef DISABLE_PYTHON + if (eq(s, "-py") || eq(s, "-python")) { + o->make_lang = LANG_PYTHON; + o->widechars = true; + continue; + } +#endif if (eq(s, "-w") || eq(s, "-widechars")) { o->widechars = true; o->utf8 = false; @@ -237,6 +247,18 @@ extern int main(int argc, char * argv[]) { close_generator_java(g); fclose(o->output_java); } +#endif +#ifndef DISABLE_PYTHON + if (o->make_lang == LANG_PYTHON) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".py"); + o->output_python = get_output(b); + lose_b(b); + g = create_generator_python(a, o); + generate_program_python(g); + close_generator_python(g); + fclose(o->output_python); + } #endif } close_analyser(a); diff --git a/compiler/generator_python.c b/compiler/generator_python.c new file mode 100644 index 000000000..3b5c5907e --- /dev/null +++ b/compiler/generator_python.c @@ -0,0 +1,1458 @@ + +#include /* for exit */ +#include /* for strlen */ +#include /* for fprintf etc */ +#include "header.h" + +/* prototypes */ + +static void generate(struct generator * g, struct node * p); +static void w(struct generator * g, const char * s); +static void writef(struct generator * g, const char * s, struct node * p); + + +enum special_labels { + x_return = -1 +}; + +static int new_label(struct generator * g) { + + int next_label = g->next_label++; + g->max_label = (next_label > g->max_label) ? 
next_label : g->max_label; + return next_label; +} + +static struct str * vars_newname(struct generator * g) { + + struct str * output; + g->var_number ++; + output = str_new(); + str_append_string(output, "v_"); + str_append_int(output, g->var_number); + return output; +} + +/* Output routines */ +static void output_str(FILE * outfile, struct str * str) { + + char * s = b_to_s(str_data(str)); + fprintf(outfile, "%s", s); + free(s); +} + +/* Write routines for simple entities */ + +static void write_char(struct generator * g, int ch) { + + str_append_ch(g->outbuf, ch); +} + +static void write_newline(struct generator * g) { + + str_append_string(g->outbuf, "\n"); +} + +static void write_string(struct generator * g, const char * s) { + str_append_string(g->outbuf, s); +} + +static void write_b(struct generator * g, symbol * b) { + + str_append_b(g->outbuf, b); +} + +static void write_str(struct generator * g, struct str * str) { + + str_append(g->outbuf, str); +} + +static void write_int(struct generator * g, int i) { + + str_append_int(g->outbuf, i); +} + + +/* Write routines for items from the syntax tree */ + +static void write_varname(struct generator * g, struct name * p) { + + int ch = "SBIrxg"[p->type]; + if (p->type != t_external) + { + write_char(g, ch); + write_char(g, '_'); + } + str_append_b(g->outbuf, p->b); +} + +static void write_varref(struct generator * g, struct name * p) { + + /* In python, references look just the same */ + write_varname(g, p); +} + +static void write_hexdigit(struct generator * g, int n) { + + write_char(g, n < 10 ? 
n + '0' : n - 10 + 'A'); +} + +static void write_hex(struct generator * g, int ch) { + + write_string(g, "\\u"); + { + int i; + for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); + } +} + +static void write_literal_string(struct generator * g, symbol * p) { + + int i; + write_string(g, "u\""); + for (i = 0; i < SIZE(p); i++) { + int ch = p[i]; + if (32 <= ch && ch <= 127) { + if (ch == '\"' || ch == '\\') write_string(g, "\\"); + write_char(g, ch); + } else { + write_hex(g, ch); + } + } + write_string(g, "\""); +} + +static void write_margin(struct generator * g) { + + int i; + for (i = 0; i < g->margin; i++) write_string(g, " "); +} + +static void write_comment(struct generator * g, struct node * p) { + + write_margin(g); + write_string(g, "# "); + write_string(g, (char *) name_of_token(p->type)); + if (p->name != 0) { + write_string(g, " "); + str_append_b(g->outbuf, p->name->b); + } + write_string(g, ", line "); + write_int(g, p->line_number); + write_newline(g); +} + +static void write_block_start(struct generator * g) { + + w(g, "~+~N"); +} + +static void write_block_end(struct generator * g) /* block end */ { + + w(g, "~-"); +} + +static void write_savecursor(struct generator * g, struct node * p, + struct str * savevar) { + + g->B[0] = str_data(savevar); + g->S[1] = ""; + if (p->mode != m_forward) g->S[1] = "self.limit - "; + writef(g, "~M~B0 = ~S1self.cursor~N" , p); +} + +static void restore_string(struct node * p, struct str * out, struct str * savevar) { + + str_clear(out); + str_append_string(out, "self.cursor = "); + if (p->mode != m_forward) str_append_string(out, "self.limit - "); + str_append(out, savevar); +} + +static void write_restorecursor(struct generator * g, struct node * p, + struct str * savevar) { + + struct str * temp = str_new(); + write_margin(g); + restore_string(p, temp, savevar); + write_str(g, temp); + write_newline(g); + str_delete(temp); +} + +static void write_inc_cursor(struct generator * g, struct node * p) { + + 
write_margin(g); + write_string(g, p->mode == m_forward ? "self.cursor += 1" : "self.cursor -= 1"); + write_newline(g); +} + +static void wsetlab_begin(struct generator * g) { + + w(g, "~Mtry:~N~+"); +} + +static void wsetlab_end(struct generator * g, int n) { + g->I[0] = n; + w(g, "~-~Mexcept lab~I0: pass~N"); +} + +static void wgotol(struct generator * g, int n) { + g->I[0] = n; + w(g, "~Mraise lab~I0()~N"); +} + +static void write_failure(struct generator * g) { + + if (str_len(g->failure_str) != 0) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + switch (g->failure_label) + { + case x_return: + w(g, "~Mreturn False~N"); + g->unreachable = true; + break; + default: + g->I[0] = g->failure_label; + w(g, "~Mraise lab~I0()~N"); + } +} + +static void write_failure_if(struct generator * g, char * s, struct node * p) { + + writef(g, "~Mif ", p); + writef(g, s, p); + writef(g, ":", p); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + +/* if at limit fail */ +static void write_check_limit(struct generator * g, struct node * p) { + + if (p->mode == m_forward) { + write_failure_if(g, "self.cursor >= self.limit", p); + } else { + write_failure_if(g, "self.cursor <= self.limit_backward", p); + } +} + +/* Formatted write. 
*/ +static void writef(struct generator * g, const char * input, struct node * p) { + + int i = 0; + int l = strlen(input); + + while (i < l) { + int ch = input[i++]; + if (ch == '~') { + switch(input[i++]) { + default: write_char(g, input[i - 1]); continue; + case 'C': write_comment(g, p); continue; + case 'f': write_block_start(g); + write_failure(g); + g->unreachable = false; + write_block_end(g); + continue; + case 'M': write_margin(g); continue; + case 'N': write_newline(g); continue; + case '{': write_block_start(g); continue; + case '}': write_block_end(g); continue; + case 'S': write_string(g, g->S[input[i++] - '0']); continue; + case 'B': write_b(g, g->B[input[i++] - '0']); continue; + case 'I': write_int(g, g->I[input[i++] - '0']); continue; + case 'V': write_varref(g, g->V[input[i++] - '0']); continue; + case 'W': write_varname(g, g->V[input[i++] - '0']); continue; + case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; + case '+': g->margin++; continue; + case '-': g->margin--; continue; + case 'n': write_string(g, g->options->name); continue; + } + } else { + write_char(g, ch); + } + } +} + +static void w(struct generator * g, const char * s) { + writef(g, s, 0); +} + +static void generate_AE(struct generator * g, struct node * p) { + char * s; + switch (p->type) { + case c_name: + write_varref(g, p->name); break; + case c_number: + write_int(g, p->number); break; + case c_maxint: + write_string(g, "MAXINT"); break; + case c_minint: + write_string(g, "MININT"); break; + case c_neg: + write_string(g, "-"); generate_AE(g, p->right); break; + case c_multiply: + s = " * "; goto label0; + case c_plus: + s = " + "; goto label0; + case c_minus: + s = " - "; goto label0; + case c_divide: + s = " / "; + label0: + write_string(g, "("); generate_AE(g, p->left); + write_string(g, s); generate_AE(g, p->right); write_string(g, ")"); break; + case c_sizeof: + g->V[0] = p->name; + w(g, "(~V0.length)"); break; + case c_cursor: + w(g, "self.cursor"); 
break; + case c_limit: + w(g, p->mode == m_forward ? "self.limit" : "self.limit_backward"); break; + case c_size: + w(g, "(self.current.length)"); break; + } +} + +/* K_needed() tests to see if we really need to keep c. Not true when the + the command does not touch the cursor. self and repeat_score() could be + elaborated almost indefinitely. +*/ + +static int K_needed(struct generator * g, struct node * p) { + + while (p != 0) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: + case c_booltest: + case c_true: + case c_false: + case c_debug: + break; + + case c_call: + if (K_needed(g, p->name->definition)) return true; + break; + + case c_bra: + if (K_needed(g, p->left)) return true; + break; + + default: return true; + } + p = p->right; + } + return false; +} + +static int repeat_score(struct generator * g, struct node * p) { + + int score = 0; + while (p != 0) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: /* case c_not: must not be included here! 
*/ + case c_debug: + break; + + case c_call: + score += repeat_score(g, p->name->definition); + break; + + case c_bra: + score += repeat_score(g, p->left); + break; + + case c_name: + case c_literalstring: + case c_next: + case c_grouping: + case c_non: + case c_hop: + score = score + 1; + break; + + default: + score = 2; + break; + } + p = p->right; + } + return score; +} + +/* tests if an expression requires cursor reinstatement in a repeat */ + +static int repeat_restore(struct generator * g, struct node * p) { + + return repeat_score(g, p) >= 2; +} + +static void generate_bra(struct generator * g, struct node * p) { + + write_comment(g, p); + p = p->left; + while (p != 0) { + generate(g, p); + p = p->right; + } +} + +static void generate_and(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + + if (keep_c) write_savecursor(g, p, savevar); + + p = p->left; + while (p != 0) { + generate(g, p); + if (g->unreachable) break; + if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); + p = p->right; + } + str_delete(savevar); +} + +static void generate_or(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + int out_lab = new_label(g); + write_comment(g, p); + wsetlab_begin(g); + + if (keep_c) write_savecursor(g, p, savevar); + + p = p->left; + str_clear(g->failure_str); + + if (p == 0) { + /* p should never be 0 after an or: there should be at least two + * sub nodes. 
*/ + fprintf(stderr, "Error: \"or\" node without children nodes."); + exit (1); + } + while (p->right != 0) { + g->failure_label = new_label(g); + int label = g->failure_label; + wsetlab_begin(g); + generate(g, p); + if (!g->unreachable) wgotol(g, out_lab); + wsetlab_end(g, label); + g->unreachable = false; + if (keep_c) write_restorecursor(g, p, savevar); + p = p->right; + } + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + generate(g, p); + wsetlab_end(g, out_lab); + str_delete(savevar); +} + +static void generate_backwards(struct generator * g, struct node * p) { + + write_comment(g, p); + writef(g,"~Mself.limit_backward = self.cursor~N" + "~Mself.cursor = self.limit~N", p); + generate(g, p->left); + w(g, "~Mself.cursor = self.limit_backward~N"); +} + + +static void generate_not(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + write_comment(g, p); + if (keep_c) { + write_savecursor(g, p, savevar); + } + + g->failure_label = new_label(g); + int label = g->failure_label; + str_clear(g->failure_str); + + wsetlab_begin(g); + + generate(g, p->left); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + if (!g->unreachable) write_failure(g); + + wsetlab_end(g, label); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + str_delete(savevar); +} + + +static void generate_try(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + int label = g->failure_label; + + if (keep_c) restore_string(p, g->failure_str, savevar); + + wsetlab_begin(g); + generate(g, p->left); + wsetlab_end(g, label); + g->unreachable = false; + + str_delete(savevar); +} + +static void 
generate_set(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~Mself.~V0 = True~N", p); +} + +static void generate_unset(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~Mself.~V0 = False~N", p); +} + +static void generate_fail(struct generator * g, struct node * p) { + + write_comment(g, p); + generate(g, p->left); + if (!g->unreachable) write_failure(g); +} + +/* generate_test() also implements 'reverse' */ + +static void generate_test(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + + if (keep_c) { + write_savecursor(g, p, savevar); + } + + generate(g, p->left); + + if (!g->unreachable) { + if (keep_c) { + write_restorecursor(g, p, savevar); + } + } + str_delete(savevar); +} + +static void generate_do(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + write_comment(g, p); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + int label = g->failure_label; + str_clear(g->failure_str); + + wsetlab_begin(g); + generate(g, p->left); + wsetlab_end(g, label); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + str_delete(savevar); +} + +static void generate_GO(struct generator * g, struct node * p, int style) { + + int end_unreachable = false; + struct str * savevar = vars_newname(g); + int keep_c = style == 1 || repeat_restore(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + int golab = new_label(g); + g->I[0] = golab; + write_comment(g, p); + w(g, "~Mtry:~N~+" + "~Mwhile True:~N~+"); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + int label = g->failure_label; + wsetlab_begin(g); + generate(g, p->left); + + if (g->unreachable) { + /* Cannot break out of 
self loop: therefore the code after the + * end of the loop is unreachable.*/ + end_unreachable = true; + } else { + /* include for goto; omit for gopast */ + if (style == 1) write_restorecursor(g, p, savevar); + g->I[0] = golab; + w(g, "~Mraise lab~I0()~N"); + } + g->unreachable = false; + wsetlab_end(g, label); + if (keep_c) write_restorecursor(g, p, savevar); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + write_check_limit(g, p); + write_inc_cursor(g, p); + w(g, "~-~-"); + g->I[0] = golab; + w(g, "~Mexcept lab~I0: pass~N"); + str_delete(savevar); + g->unreachable = end_unreachable; +} + +static void generate_loop(struct generator * g, struct node * p) { + + struct str * loopvar = vars_newname(g); + write_comment(g, p); + g->B[0] = str_data(loopvar); + w(g, "~Mfor ~B0 in range ("); + generate_AE(g, p->AE); + g->B[0] = str_data(loopvar); + writef(g, ", 0, -1):~N", p); + writef(g, "~{", p); + + generate(g, p->left); + + w(g, "~}"); + str_delete(loopvar); + g->unreachable = false; +} + +static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) { + + struct str * savevar = vars_newname(g); + int keep_c = repeat_restore(g, p->left); + int rep_break_lab = new_label(g); + int rep_continue_lab = new_label(g); + write_comment(g, p); + writef(g, "~Mtry:~N~+" + "~Mwhile True:~N~+" + "~Mtry:~N~+", p); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + int label = g->failure_label; + str_clear(g->failure_str); + wsetlab_begin(g); + generate(g, p->left); + + if (!g->unreachable) { + if (loopvar != 0) { + g->B[0] = str_data(loopvar); + w(g, "~M~B0 -= 1~N"); + } + + g->I[0] = rep_continue_lab; + w(g, "~Mraise lab~I0()~N"); + } + + wsetlab_end(g, label); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + + g->I[0] = rep_continue_lab; + g->I[1] = rep_break_lab; + w(g, "~Mraise lab~I1()~N~}" + "~Mexcept lab~I0: pass~N" + "~}~}" + "~Mexcept lab~I1: 
pass~N"); + str_delete(savevar); +} + +static void generate_atleast(struct generator * g, struct node * p) { + + struct str * loopvar = vars_newname(g); + write_comment(g, p); + g->B[0] = str_data(loopvar); + w(g, "~M~B0 = "); + generate_AE(g, p->AE); + w(g, "~N"); + { + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + generate_repeat(g, p, loopvar); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + } + g->B[0] = str_data(loopvar); + write_failure_if(g, "~B0 > 0", p); + str_delete(loopvar); +} + +static void generate_setmark(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~Mself.~V0 = self.cursor~N", p); +} + +static void generate_tomark(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? ">" : "<"; + + w(g, "~Mif self.cursor ~S0 self."); generate_AE(g, p->AE); w(g, ":"); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; + w(g, "~Mself.cursor = self."); generate_AE(g, p->AE); writef(g, "~N", p); +} + +static void generate_atmark(struct generator * g, struct node * p) { + + write_comment(g, p); + w(g, "~Mif self.cursor != self."); generate_AE(g, p->AE); writef(g, ":", p); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + + +static void generate_hop(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "+" : "-"; + + w(g, "~Mc = self.cursor ~S0 "); + generate_AE(g, p->AE); + w(g, "~N"); + + g->S[0] = p->mode == m_forward ? 
"0" : "self.limit_backward"; + + write_failure_if(g, "~S0 > c or c > self.limit", p); + writef(g, "~Mself.cursor = c~N", p); +} + +static void generate_delete(struct generator * g, struct node * p) { + + write_comment(g, p); + writef(g, "~Mif not self.slice_del():~N" + "~+~Mreturn False~N~-" + "~N", p); +} + + +static void generate_next(struct generator * g, struct node * p) { + + write_comment(g, p); + write_check_limit(g, p); + write_inc_cursor(g, p); +} + +static void generate_tolimit(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; + writef(g, "~Mself.cursor = ~S0~N", p); +} + +static void generate_atlimit(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; + g->S[1] = p->mode == m_forward ? "<" : ">"; + write_failure_if(g, "self.cursor ~S1 ~S0", p); +} + +static void generate_leftslice(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "self.bra" : "self.ket"; + writef(g, "~M~S0 = self.cursor~N", p); +} + +static void generate_rightslice(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? 
"self.ket" : "self.bra"; + writef(g, "~M~S0 = self.cursor~N", p); +} + +static void generate_assignto(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~Mself.~V0 = self.assign_to(self.~V0)~N", p); +} + +static void generate_sliceto(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~Mself.~V0 = self.slice_to(self.~V0)~N" + "~Mif self.~V0 == '':~N" + "~+~Mreturn False~N~-" + , p); +} + +static void generate_address(struct generator * g, struct node * p) { + + symbol * b = p->literalstring; + if (b != 0) { + write_literal_string(g, b); + } else { + write_varref(g, p->name); + } +} + +static void generate_insert(struct generator * g, struct node * p, int style) { + + int keep_c = style == c_attach; + write_comment(g, p); + if (p->mode == m_backward) keep_c = !keep_c; + if (keep_c) w(g, "~Mc = self.cursor~N"); + writef(g, "~Mself.insert(self.cursor, self.cursor, ", p); + generate_address(g, p); + writef(g, ")~N", p); + if (keep_c) w(g, "~Mself.cursor = c~N"); +} + +static void generate_assignfrom(struct generator * g, struct node * p) { + + int keep_c = p->mode == m_forward; /* like 'attach' */ + + write_comment(g, p); + if (keep_c) writef(g, "~Mc = self.cursor~N", p); + if (p->mode == m_forward) { + writef(g, "~Mself.insert(self.cursor, self.limit, ", p); + } else { + writef(g, "~Mself.insert(self.limit_backward, self.cursor, ", p); + } + generate_address(g, p); + writef(g, ")~N", p); + if (keep_c) w(g, "~Mself.cursor = c~N"); +} + + +static void generate_slicefrom(struct generator * g, struct node * p) { + + write_comment(g, p); + w(g, "~Mif not self.slice_from("); + generate_address(g, p); + writef(g, "):~N" + "~+~Mreturn False~N~-", p); +} + +static void generate_setlimit(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + struct str * varname = vars_newname(g); + write_comment(g, p); + write_savecursor(g, p, savevar); + generate(g, 
p->left); + + if (!g->unreachable) { + g->B[0] = str_data(varname); + if (p->mode == m_forward) { + w(g, "~M~B0 = self.limit - self.cursor~N"); + w(g, "~Mself.limit = self.cursor~N"); + } else { + w(g, "~M~B0 = self.limit_backward~N"); + w(g, "~Mself.limit_backward = self.cursor~N"); + } + write_restorecursor(g, p, savevar); + + if (p->mode == m_forward) { + str_assign(g->failure_str, "self.limit += "); + str_append(g->failure_str, varname); + } else { + str_assign(g->failure_str, "self.limit_backward = "); + str_append(g->failure_str, varname); + } + generate(g, p->aux); + + if (!g->unreachable) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + } + str_delete(varname); + str_delete(savevar); +} + +/* dollar sets snowball up to operate on a string variable as if it were the + * current string */ +static void generate_dollar(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + write_comment(g, p); + g->V[0] = p->name; + + str_assign(g->failure_str, "self.copy_from(self, "); + str_append(g->failure_str, savevar); + str_append_string(g->failure_str, ")"); + g->B[0] = str_data(savevar); + writef(g, "~M~n ~B0 = self~N" + "~Mself.current = self.~V0.toString()~N" + "~Mself.cursor = 0~N" + "~Mself.limit = (self.current.length)~N", p); + generate(g, p->left); + if (!g->unreachable) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + str_delete(savevar); +} + +static void generate_integer_assign(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + if (p->AE->type == c_name) + { + w(g, "~Mself.~V0 ~S0 self."); generate_AE(g, p->AE); w(g, ";~N"); + } + else + { + w(g, "~Mself.~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); + } +} + +static void generate_integer_test(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + if (p->AE->type == c_name) + { + w(g, "~Mif not (self.~V0 ~S0 self."); generate_AE(g, p->AE); 
w(g, "):"); + } + else + { + w(g, "~Mif not self.~V0 ~S0 "); + generate_AE(g, p->AE); + w(g, ":"); + } + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + +static void generate_call(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + write_failure_if(g, "not self.~V0()", p); +} + +static void generate_grouping(struct generator * g, struct node * p, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "out" : "in"; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + if (q->no_gaps) + write_failure_if(g, "not self.~S1_range~S0(~I0, ~I1)", p); + else + write_failure_if(g, "not self.~S1_grouping~S0(~n.~V0, ~I0, ~I1)", p); +} + +static void generate_namedstring(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->V[0] = p->name; + write_failure_if(g, "not self.eq_v~S0(self.~V0)", p); +} + +static void generate_literalstring(struct generator * g, struct node * p) { + + symbol * b = p->literalstring; + write_comment(g, p); + g->S[0] = p->mode == m_forward ? 
"" : "_b"; + g->I[0] = SIZE(b); + g->L[0] = b; + write_failure_if(g, "not self.eq_s~S0(~I0, ~L0)", p); +} + +static void generate_define(struct generator * g, struct node * p) { + + struct name * q = p->name; + symbol stem[] = {'s', 't', 'e', 'm'}; + int find = 0; + int i = 0; + if (SIZE(q->b) == 4) + { + find = 1; + for (i = 0; i < 4; i++) + { + if (q->b[i] != stem[i]) + { + find = 0; + break; + } + } + } + struct str * saved_output = g->outbuf; + + g->V[0] = p->name; + if (find == 1) + { + w(g, "~N~Mdef _~V0(self):~+~N"); + } + else + { + w(g, "~N~Mdef ~V0(self):~+~N"); + } + g->outbuf = str_new(); + + g->next_label = 0; + g->var_number = 0; + + str_clear(g->failure_str); + g->failure_label = x_return; + g->unreachable = false; + generate(g, p->left); + if (!g->unreachable) w(g, "~Mreturn True~N"); + w(g, "~-"); + + str_append(saved_output, g->outbuf); + str_delete(g->outbuf); + g->outbuf = saved_output; +} + +static void generate_substring(struct generator * g, struct node * p) { + + struct among * x = p->among; + + write_comment(g, p); + + g->S[0] = p->mode == m_forward ? 
"" : "_b"; + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + if (x->command_count == 0 && x->starter == 0) { + write_failure_if(g, "self.find_among~S0(~n.a_~I0, ~I1) == 0", p); + } else { + writef(g, "~Mamong_var = self.find_among~S0(~n.a_~I0, ~I1)~N", p); + write_failure_if(g, "among_var == 0", p); + } +} + +static void generate_among(struct generator * g, struct node * p) { + + struct among * x = p->among; + int case_number = 1; + + if (x->substring == 0) generate_substring(g, p); + if (x->command_count == 0 && x->starter == 0) return; + + if (x->starter != 0) generate(g, x->starter); + + p = p->left; + if (p != 0 && p->type != c_literalstring) p = p->right; + w(g, "~Mif among_var == 0:~N~+"); + write_failure(g); + g->unreachable = false; + w(g, "~-"); + + while (p != 0) { + if (p->type == c_bra && p->left != 0) { + g->I[0] = case_number++; + w(g, "~Melif among_var == ~I0:~N~+"); + generate(g, p); + w(g, "~-"); + g->unreachable = false; + } + p = p->right; + } +} + +static void generate_booltest(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + write_failure_if(g, "not self.~V0", p); +} + +static void generate_false(struct generator * g, struct node * p) { + + write_comment(g, p); + write_failure(g); +} + +static void generate_debug(struct generator * g, struct node * p) { + + write_comment(g, p); + g->I[0] = g->debug_count++; + g->I[1] = p->line_number; + writef(g, "~Mself.debug(~I0, ~I1);~N", p); +} + +static void generate(struct generator * g, struct node * p) { + + int a0; + struct str * a1; + + if (g->unreachable) return; + + a0 = g->failure_label; + a1 = str_copy(g->failure_str); + + switch (p->type) + { + case c_define: generate_define(g, p); break; + case c_bra: generate_bra(g, p); break; + case c_and: generate_and(g, p); break; + case c_or: generate_or(g, p); break; + case c_backwards: generate_backwards(g, p); break; + case c_not: generate_not(g, p); break; + case c_set: generate_set(g, p); break; + 
case c_unset: generate_unset(g, p); break; + case c_try: generate_try(g, p); break; + case c_fail: generate_fail(g, p); break; + case c_reverse: + case c_test: generate_test(g, p); break; + case c_do: generate_do(g, p); break; + case c_goto: generate_GO(g, p, 1); break; + case c_gopast: generate_GO(g, p, 0); break; + case c_repeat: generate_repeat(g, p, 0); break; + case c_loop: generate_loop(g, p); break; + case c_atleast: generate_atleast(g, p); break; + case c_setmark: generate_setmark(g, p); break; + case c_tomark: generate_tomark(g, p); break; + case c_atmark: generate_atmark(g, p); break; + case c_hop: generate_hop(g, p); break; + case c_delete: generate_delete(g, p); break; + case c_next: generate_next(g, p); break; + case c_tolimit: generate_tolimit(g, p); break; + case c_atlimit: generate_atlimit(g, p); break; + case c_leftslice: generate_leftslice(g, p); break; + case c_rightslice: generate_rightslice(g, p); break; + case c_assignto: generate_assignto(g, p); break; + case c_sliceto: generate_sliceto(g, p); break; + case c_assign: generate_assignfrom(g, p); break; + case c_insert: + case c_attach: generate_insert(g, p, p->type); break; + case c_slicefrom: generate_slicefrom(g, p); break; + case c_setlimit: generate_setlimit(g, p); break; + case c_dollar: generate_dollar(g, p); break; + case c_mathassign: generate_integer_assign(g, p, "="); break; + case c_plusassign: generate_integer_assign(g, p, "+="); break; + case c_minusassign: generate_integer_assign(g, p, "-="); break; + case c_multiplyassign:generate_integer_assign(g, p, "*="); break; + case c_divideassign: generate_integer_assign(g, p, "/="); break; + case c_eq: generate_integer_test(g, p, "=="); break; + case c_ne: generate_integer_test(g, p, "!="); break; + case c_gr: generate_integer_test(g, p, ">"); break; + case c_ge: generate_integer_test(g, p, ">="); break; + case c_ls: generate_integer_test(g, p, "<"); break; + case c_le: generate_integer_test(g, p, "<="); break; + case c_call: 
generate_call(g, p); break; + case c_grouping: generate_grouping(g, p, false); break; + case c_non: generate_grouping(g, p, true); break; + case c_name: generate_namedstring(g, p); break; + case c_literalstring: generate_literalstring(g, p); break; + case c_among: generate_among(g, p); break; + case c_substring: generate_substring(g, p); break; + case c_booltest: generate_booltest(g, p); break; + case c_false: generate_false(g, p); break; + case c_true: break; + case c_debug: generate_debug(g, p); break; + default: fprintf(stderr, "%d encountered\n", p->type); + exit(1); + } + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; +} + +static void generate_start_comment(struct generator * g) { + + w(g, "# This file was generated automatically by the Snowball to Python compiler~N"); /* header comment of every generated module */ + w(g, "~N"); +} + +static void generate_class_begin(struct generator * g) { + + w(g, "from .basestemmer import "); + w(g, g->options->parent_class_name); + w(g, "~N" + "from .among import Among~N" + "~N" + "~N" + "class ~n("); + w(g, g->options->parent_class_name); + w(g, "):~N" + "~+~M'''~N" + "~MThis class was automatically generated by a Snowball to Python compiler~N" + "~MIt implements the stemming algorithm defined by a snowball script.~N" + "~M'''~N" + "~MserialVersionUID = 1~N" + "~N"); +} + +static void generate_equals(struct generator * g) { + + w(g, "~N" + "~Mdef equals(self, o):~N" + "~+~Mreturn isinstance(o, "); + w(g, g->options->name); + w(g, ")~N~-" + "~N" + "~Mdef hashCode(self):~N" + "~+~Mreturn hash(\"~n\")~N~-"); +} + +static void generate_among_table(struct generator * g, struct among * x) { + + struct amongvec * v = x->b; + + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + w(g, "~Ma_~I0 = [~N~+"); + { + int i; + for (i = 0; i < x->literalstring_count; i++) + { + g->I[0] = i; + g->I[1] = v->i; + g->I[2] = v->result; + g->L[0] = v->b; + g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; + + w(g, "~MAmong(~L0, ~I1, ~I2"); + if (v->function != 0) + { + w(g, ", \""); + write_varname(g, v->function); + w(g, "\""); + } + w(g, ")~S0~N"); + v++; + } + } + w(g, "~-~M]~N~N"); +} + +static void generate_amongs(struct generator * g) { + + struct among * x = g->analyser->amongs; + while (x != 0) { + generate_among_table(g, x); + x = x->next; + } +} + +static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } + +static int bit_is_set(symbol * b, int i) { return b[i/8] & 1 << i%8; } + +static void generate_grouping_table(struct generator * g, struct grouping * q) { + + int range = q->largest_ch - q->smallest_ch + 1; + int size = (range + 7)/ 8; /* assume 8 bits per symbol */ + symbol * b = q->b; + symbol * map = create_b(size); + int i; + for (i = 0; i < size; i++) map[i] = 0; + + /* Using unicode would require revision here */ + + for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); + + q->no_gaps = true; + for (i = 0; i < range; i++) unless (bit_is_set(map, i)) q->no_gaps = false; + + unless (q->no_gaps) { + g->V[0] = q->name; + + w(g, "~M~V0 = ["); + for (i = 0; i < size; i++) { + write_int(g, map[i]); + if (i < size - 1) w(g, ", "); + } + w(g, "]~N~N"); + } + lose_b(map); +} + +static void generate_groupings(struct generator * g) { + struct grouping * q = g->analyser->groupings; + until (q == 0) { + generate_grouping_table(g, q); + q = q->next; + } +} + +static void generate_members(struct generator * g) { + + struct name * q = g->analyser->names; + until (q == 0) { + g->V[0] = q; + switch (q->type) { + case t_string: + w(g, " ~W0 = \"\"~N"); + break; + case t_integer: + w(g, " ~W0 = 0~N"); + break; + case t_boolean: + w(g, " ~W0 = False~N"); + break; + } + q = q->next; + } + w(g, "~N"); +} + +static void generate_copyfrom(struct generator * g) { + + struct name * q; + w(g, "~Mdef copy_from(self, other):~+~N"); + for (q = g->analyser->names; q != 0; q = q->next) { + g->V[0] = q; + switch (q->type) { + case t_string: + case 
t_integer: + case t_boolean: + w(g, "~Mself.~W0 = other.~W0~N"); + break; + } + } + w(g, "~Msuper(~n, self).copy_from(other)~N"); /* explicit two-argument super() works on Python 2 and 3; a bare "super.copy_from" raises AttributeError */ + w(g, "~-~M~N"); +} + +static void generate_methods(struct generator * g) { + + struct node * p = g->analyser->program; + while (p != 0) { + generate(g, p); + g->unreachable = false; + p = p->right; + } +} + +static void generate_label_classes(struct generator * g) +{ + int i; + for (i = 0; i <= g->max_label; i++) + { + g->I[0] = i; + w(g, "class lab~I0(BaseException): pass~N"); + } +} + +extern void generate_program_python(struct generator * g) { + + g->outbuf = str_new(); + g->failure_str = str_new(); + + generate_start_comment(g); + generate_class_begin(g); + + generate_amongs(g); + generate_groupings(g); + + generate_members(g); + generate_copyfrom(g); + generate_methods(g); + generate_equals(g); + + generate_label_classes(g); + + output_str(g->options->output_python, g->outbuf); + str_delete(g->failure_str); + str_delete(g->outbuf); +} + +extern struct generator * create_generator_python(struct analyser * a, struct options * o) { + + NEW(generator, g); + g->analyser = a; + g->options = o; + g->margin = 0; + g->debug_count = 0; + g->unreachable = false; + g->max_label = 0; + return g; +} + +extern void close_generator_python(struct generator * g) { + + FREE(g); +} + diff --git a/compiler/header.h b/compiler/header.h index 9baf1d917..3ea771e65 100644 --- a/compiler/header.h +++ b/compiler/header.h @@ -259,11 +259,14 @@ struct generator { struct str * outbuf; /* temporary str to store output */ struct str * declarations; /* str storing variable declarations */ int next_label; +#ifndef DISABLE_PYTHON + int max_label; +#endif int margin; const char * failure_string; /* String to output in case of a failure. 
*/ -#ifndef DISABLE_JAVA - struct str * failure_str; /* This is used by the java generator instead of failure_string */ +#if !defined(DISABLE_JAVA) || !defined(DISABLE_PYTHON) + struct str * failure_str; /* Used by the Java and Python generators instead of failure_string, so it must exist when either one is enabled */ #endif int label_used; /* Keep track of whether the failure label is used. */ @@ -293,10 +296,13 @@ struct options { FILE * output_h; #ifndef DISABLE_JAVA FILE * output_java; +#endif +#ifndef DISABLE_PYTHON + FILE * output_python; #endif byte syntax_tree; byte widechars; - enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang; + enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_PYTHON } make_lang; char * externals_prefix; char * variables_prefix; char * runtime_path; @@ -322,3 +328,11 @@ extern void close_generator_java(struct generator * g); extern void generate_program_java(struct generator * g); #endif + +#ifndef DISABLE_PYTHON +/* Generator for Python code. */ +extern struct generator * create_generator_python(struct analyser * a, struct options * o); +extern void close_generator_python(struct generator * g); + +extern void generate_program_python(struct generator * g); +#endif diff --git a/doc/libstemmer_python_README b/doc/libstemmer_python_README new file mode 100644 index 000000000..6474e35c1 --- /dev/null +++ b/doc/libstemmer_python_README @@ -0,0 +1,93 @@ +Snowball stemming library collection for Python +=============================================== + +How to use library +------------------ + +The ``snowballstemmer`` module has two functions. + +The ``snowballstemmer.algorithms`` function returns a list of available algorithm names. + +The ``snowballstemmer.stemmer`` function accepts algorithm name and returns ``Stemmer`` objects. + +``Stemmer`` objects have ``Stemmer.stemWord(word)`` method and ``Stemmer.stemWords(word[])`` method. + +.. 
code-block:: python + + import snowballstemmer + + stemmer = snowballstemmer.stemmer('english') + print(stemmer.stemWords("We are the world".split())) + +``Stemmer`` objects have a ``Stemmer.maxCacheSize`` property that bounds the number of cached results. The default is ``10000``. + +Accelerates Stemming +-------------------- + +If **PyStemmer** is installed, ``snowballstemmer.stemmer`` returns ``PyStemmer``\ 's ``Stemmer`` objects. This ``Stemmer`` object has the same methods (``Stemmer.stemWord()``, ``Stemmer.stemWords()``). + +**PyStemmer** is a wrapper module for Snowball's ``libstemmer_c`` and it returns results that are 100% compatible with **snowballstemmer**. + +**PyStemmer** is faster because it uses a C extension module, while **snowballstemmer** is more widely usable because it is a pure Python module. + +* `PyStemmer <https://pypi.org/project/PyStemmer/>`_ + +Benchmark +~~~~~~~~~ + +Test Case: Snowball stemmer check data (16 algorithms, total 582560 words, cache hit 0%) +Computer: MacBook Pro 3rd Gen Corei7 2.3GHz + +* Python 2.7 + **snowballstemmer** : 2m 30s +* PyPy 1.9 + **snowballstemmer** : 45s +* Python 2.7 + **PyStemmer** : 5s + +This test case is much harder than usual use cases! + +The TestApp example +------------------- + +The ``testapp.py`` example program allows you to run any of the stemmers +on a sample vocabulary. + +Usage:: + + testapp.py <algorithm name> "sentences ... " + +.. code-block:: bash + + $ python testapp.py English "sentences... " + +License +------- + +It is a BSD licensed library. + +Copyright (c) 2013, Yoshiki Shibukawa +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + Neither the name of the nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 000000000..f5dafc46a --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,4 @@ +include *.rst +include setup.* +recursive-include src *.py +include MANIFEST.in diff --git a/python/create_init.py b/python/create_init.py new file mode 100644 index 000000000..c5546ce00 --- /dev/null +++ b/python/create_init.py @@ -0,0 +1,51 @@ +#! 
/usr/bin/env python + +import sys +import re +import os + +python_out_folder = sys.argv[1] + +filematch = re.compile(r"(\w+)_stemmer\.py$") + +imports = [] +languages = ['_languages = {'] + +for pyscript in os.listdir(python_out_folder): + match = filematch.match(pyscript) + if (match): + langname = match.group(1) + titlecase = langname.title() + languages.append(" '%(lang)s': %(title)sStemmer," % {'lang': langname, 'title': titlecase}) + imports.append('from .%(lang)s_stemmer import %(title)sStemmer' % {'lang': langname, 'title': titlecase}) +languages.append('}'); +src = '''__all__ = ('algorithms', 'stemmer') + +%(imports)s + +%(languages)s + +try: + import Stemmer + cext_available = True +except ImportError: + cext_available = False + +def algorithms(): + if cext_available: + return Stemmer.algorithms() + else: + return list(_languages.keys()) + +def stemmer(lang): + if cext_available: + return Stemmer.Stemmer(lang) + if lang.lower() in _languages: + return _languages[lang.lower()]() + else: + raise KeyError("Stemming algorithm '%%s' not found" %% lang) +''' % {'imports': '\n'.join(imports), 'languages': '\n'.join(languages)} + +out = open(os.path.join(python_out_folder, '__init__.py'), 'w') +out.write(src) +out.close() diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 000000000..6ca5e0861 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +from distutils.core import setup + +setup(name='snowballstemmer', + version='1.1.0', + description='This package provides 16 stemmer algorithms (15 + Porter English stemmer) generated from Snowball algorithms.', + long_description=''' +It includes following language algorithms: + +* Danish +* Dutch +* English (Standard, Porter) +* Finnish +* French +* German +* Hungarian +* Italian +* Norwegian +* Portuguese +* Romanian +* Russian +* Spanish +* Swedish +* Turkish + +This is a pure Python stemming library. If `PyStemmer <https://pypi.org/project/PyStemmer/>`_ is available, this module uses +it to accelerate. 
+''', + author='Yoshiki Shibukawa', + author_email='yoshiki at shibu.jp', + url='https://github.com/shibukawa/snowball_py', + keywords="stemmer", + license="BSD", + packages=['snowballstemmer'], + package_dir={"snowballstemmer": "src/snowballstemmer"}, + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python', + 'Natural Language :: Danish', + 'Natural Language :: Dutch', + 'Natural Language :: English', + 'Natural Language :: Finnish', + 'Natural Language :: French', + 'Natural Language :: German', + 'Natural Language :: Hungarian', + 'Natural Language :: Italian', + 'Natural Language :: Norwegian', + 'Natural Language :: Portuguese', + 'Natural Language :: Romanian', + 'Natural Language :: Russian', + 'Natural Language :: Spanish', + 'Natural Language :: Swedish', + 'Natural Language :: Turkish', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Topic :: Database', + 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic' + ] +) diff --git a/python/snowballstemmer/among.py b/python/snowballstemmer/among.py new file mode 100644 index 000000000..5a99ad2c4 --- /dev/null +++ b/python/snowballstemmer/among.py @@ -0,0 +1,15 @@ + +class Among(object): + def __init__(self, s, substring_i, result, method=None): + """ + @ivar s_size search string size + @ivar s search string + @ivar substring index to longest matching substring + @ivar result of the lookup + @ivar method method to use if substring matches + """ + self.s_size = len(s) + self.s = s + self.substring_i = substring_i + self.result = result + self.method = method diff --git a/python/snowballstemmer/basestemmer.py b/python/snowballstemmer/basestemmer.py new file mode 100644 index 000000000..cd12f12bc --- /dev/null +++ 
b/python/snowballstemmer/basestemmer.py @@ -0,0 +1,351 @@ +class BaseStemmer(object): + def __init__(self): + self.set_current("") + self.maxCacheSize = 10000 + self._cache = {} + self._counter = 0 + + def set_current(self, value): + ''' + Set the self.current string. + ''' + self.current = value + self.cursor = 0 + self.limit = len(self.current) + self.limit_backward = 0 + self.bra = self.cursor + self.ket = self.limit + + def get_current(self): + ''' + Get the self.current string. + ''' + return self.current + + def copy_from(self, other): + self.current = other.current + self.cursor = other.cursor + self.limit = other.limit + self.limit_backward = other.limit_backward + self.bra = other.bra + self.ket = other.ket + + def in_grouping(self, s, min, max): + if self.cursor >= self.limit: + return False + ch = ord(self.current[self.cursor]) + if ch > max or ch < min: + return False + ch -= min + if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: + return False + self.cursor += 1 + return True + + def in_grouping_b(self, s, min, max): + if self.cursor <= self.limit_backward: + return False + ch = ord(self.current[self.cursor - 1]) + if ch > max or ch < min: + return False + ch -= min + if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: + return False + self.cursor -= 1 + return True + + def out_grouping(self, s, min, max): + if self.cursor >= self.limit: + return False + ch = ord(self.current[self.cursor]) + if ch > max or ch < min: + self.cursor += 1 + return True + ch -= min + if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0: + self.cursor += 1 + return True + return False + + def out_grouping_b(self, s, min, max): + if self.cursor <= self.limit_backward: + return False + ch = ord(self.current[self.cursor - 1]) + if ch > max or ch < min: + self.cursor -= 1 + return True + ch -= min + if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0: + self.cursor -= 1 + return True + return False + + def in_range(self, min, max): + if self.cursor >= self.limit: + return False + ch = 
ord(self.current[self.cursor]) + if ch > max or ch < min: + return False + self.cursor += 1 + return True + + def in_range_b(self, min, max): + if self.cursor <= self.limit_backward: + return False + ch = ord(self.current[self.cursor - 1]) + if ch > max or ch < min: + return False + self.cursor -= 1 + return True + + def out_range(self, min, max): + if self.cursor >= self.limit: + return False + ch = ord(self.current[self.cursor]) + if not (ch > max or ch < min): + return False + self.cursor += 1 + return True + + def out_range_b(self, min, max): + if self.cursor <= self.limit_backward: + return False + ch = ord(self.current[self.cursor - 1]) + if not (ch > max or ch < min): + return False + self.cursor -= 1 + return True + + def eq_s(self, s_size, s): + if self.limit - self.cursor < s_size: + return False + if self.current[self.cursor:self.cursor + s_size] != s: + return False + self.cursor += s_size + return True + + def eq_s_b(self, s_size, s): + if self.cursor - self.limit_backward < s_size: + return False + if self.current[self.cursor - s_size:self.cursor] != s: + return False + self.cursor -= s_size + return True + + def eq_v(self, s): + return self.eq_s(len(s), s) + + def eq_v_b(self, s): + return self.eq_s_b(len(s), s) + + def find_among(self, v, v_size): + i = 0 + j = v_size + + c = self.cursor + l = self.limit + + common_i = 0 + common_j = 0 + + first_key_inspected = False + + while True: + k = i + ((j - i) >> 1) + diff = 0 + common = min(common_i, common_j) # smalle + w = v[k] + for i2 in range(common, w.s_size): + if c + common == l: + diff = -1 + break + diff = ord(self.current[c + common]) - ord(w.s[i2]) + if diff != 0: + break + common += 1 + if diff < 0: + j = k + common_j = common + else: + i = k + common_i = common + if j - i <= 1: + if i > 0: + break # v->s has been inspected + if j == i: + break # only one item in v + # - but now we need to go round once more to get + # v->s inspected. self looks messy, but is actually + # the optimal approach. 
+ if first_key_inspected: + break + first_key_inspected = True + while True: + w = v[i] + if common_i >= w.s_size: + self.cursor = c + w.s_size + if w.method is None: + return w.result + method = getattr(self, w.method) + res = method() + self.cursor = c + w.s_size + if res: + return w.result + i = w.substring_i + if i < 0: + return 0 + return -1 # not reachable + + def find_among_b(self, v, v_size): + ''' + find_among_b is for backwards processing. Same comments apply + ''' + i = 0 + j = v_size + + c = self.cursor + lb = self.limit_backward; + + common_i = 0 + common_j = 0 + + first_key_inspected = False + + while True: + k = i + ((j - i) >> 1) + diff = 0 + common = min(common_i, common_j) + w = v[k] + for i2 in range(w.s_size - 1 - common, -1, -1): + if c - common == lb: + diff = -1 + break + diff = ord(self.current[c - 1 - common]) - ord(w.s[i2]) + if diff != 0: + break + common += 1 + if diff < 0: + j = k + common_j = common + else: + i = k + common_i = common + if j - i <= 1: + if i > 0: + break + if j == i: + break + if first_key_inspected: + break + first_key_inspected = True + while True: + w = v[i] + if common_i >= w.s_size: + self.cursor = c - w.s_size + if w.method is None: + return w.result + method = getattr(self, w.method) + res = method() + self.cursor = c - w.s_size + if res: + return w.result + i = w.substring_i + if i < 0: + return 0 + return -1 # not reachable + + def replace_s(self, c_bra, c_ket, s): + ''' + to replace chars between c_bra and c_ket in self.current by the + chars in s. 
+ + @type c_bra int + @type c_ket int + @type s: string + ''' + adjustment = len(s) - (c_ket - c_bra) + self.current = self.current[0:c_bra] + s + self.current[c_ket:] + self.limit += adjustment + if self.cursor >= c_ket: + self.cursor += adjustment + elif self.cursor > c_bra: + self.cursor = c_bra + return adjustment + + def slice_check(self): + if self.bra < 0 or self.bra > self.ket or self.ket > self.limit or self.limit > len(self.current): + return False + return True + + def slice_from(self, s): + ''' + @type s string + ''' + result = False + if self.slice_check(): + self.replace_s(self.bra, self.ket, s) + result = True + return result + + def slice_del(self): + return self.slice_from("") + + def insert(self, c_bra, c_ket, s): + ''' + @type c_bra int + @type c_ket int + @type s: string + ''' + adjustment = self.replace_s(c_bra, c_ket, s) + if c_bra <= self.bra: + self.bra += adjustment + if c_bra <= self.ket: + self.ket += adjustment + + def slice_to(self, s): + ''' + Copy the slice into the supplied StringBuffer + + @type s: string + ''' + result = '' + if self.slice_check(): + result = self.current[self.bra:self.ket] + return result + + def assign_to(self, s): + ''' + @type s: string + ''' + return self.current[0:self.limit] + + def _stem_word(self, word): + cache = self._cache.get(word) + if cache is None: + self.set_current(word) + self._stem() + result = self.get_current() + self._cache[word] = [result, self._counter] + else: + cache[1] = self._counter + result = cache[0] + self._counter += 1 + return result + + def _clear_cache(self): + removecount = int(len(self._cache) - self.maxCacheSize * 8 / 10) + oldcaches = sorted(self._cache.items(), key=lambda cache: cache[1][1])[0:removecount] + for key, value in oldcaches: + del self._cache[key] + + def stemWord(self, word): + result = self._stem_word(word) + if len(self._cache) > self.maxCacheSize: + self._clear_cache() + return result + + def stemWords(self, words): + result = [self._stem_word(word) for word 
in words] + if len(self._cache) > self.maxCacheSize: + self._clear_cache() # the pruning helper is private; BaseStemmer defines no clear_cache() + return result diff --git a/python/stemwords.py b/python/stemwords.py new file mode 100644 index 000000000..5d1f47e3d --- /dev/null +++ b/python/stemwords.py @@ -0,0 +1,105 @@ +import sys +import re +import codecs +import snowballstemmer + +def usage(): + print('''usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h] + +The input file consists of a list of words to be stemmed, one per +line. Words should be in lower case, but (for English) A-Z letters +are mapped to their a-z equivalents anyway. If omitted, stdin is +used. + +If -c is given, the argument is the character encoding of the input +and output files. If it is omitted, the UTF-8 encoding is used. + +If -p is given the output file consists of each word of the input +file followed by \"->\" followed by its stemmed equivalent. +If -p2 is given the output file is a two column layout containing +the input words in the first column and the stemmed eqivalents in +the second column. + +Otherwise, the output file consists of the stemmed words, one per +line. 
+-h displays this help''' % sys.argv[0]) + +def main(): + argv = sys.argv[1:] + if len(argv) < 5: + usage() + else: + pretty = 0 + input = '' + output = '' + encoding = 'utf_8' + language = 'English' + show_help = False + while len(argv): + arg = argv[0] + argv = argv[1:] + if arg == '-h': + show_help = True + break + elif arg == "-p": + pretty = 1 + elif arg == "-p2": + pretty = 2 + elif arg == "-l": + if len(argv) == 0: + show_help = True + break + language = argv[0] + argv = argv[1:] + elif arg == "-i": + if len(argv) == 0: + show_help = True + break + input = argv[0] + argv = argv[1:] + elif arg == "-o": + if len(argv) == 0: + show_help = True + break + output = argv[0] + argv = argv[1:] + elif arg == "-c": + if len(argv) == 0: + show_help = True + break + encoding, argv = argv[0], argv[1:] # consume the value, as -l/-i/-o do, so it is not re-read as an option + if show_help or input == '' or output == '': + usage() + else: + stemming(language, input, output, encoding, pretty) + + +def stemming(lang, input, output, encoding, pretty): + result = [] + stemmer = snowballstemmer.stemmer(lang) + for original in codecs.open(input, "r", encoding).readlines(): + original = original.strip() + # Convert only ASCII-letters to lowercase, to match C behavior + original = ''.join((c.lower() if 'A' <= c <= 'Z' else c for c in original)) + stemmed = stemmer.stemWord(original) + if result: + result.append('\n') + if pretty == 0: + if stemmed != "": + result.append(stemmed) + elif pretty == 1: + result.append(original + " -> " + stemmed) # list.append takes exactly one argument + elif pretty == 2: + result.append(original) + if len(original) < 30: + result.append(" " * (30 - len(original))) + else: + result.append("\n") + result.append(" " * 30) + result.append(stemmed) + outfile = codecs.open(output, "w", encoding) + outfile.write(''.join(result) + '\n') + outfile.close() + +main() diff --git a/python/testapp.py b/python/testapp.py new file mode 100644 index 000000000..156613442 --- /dev/null +++ b/python/testapp.py @@ -0,0 +1,28 @@ +import sys +import re +import snowballstemmer + + +def usage(): + 
print("testapp.py \"sentence\"...") + +def main(): + argv = sys.argv + if len(argv) < 2: # argv[0] is the script name, so "no arguments" means a length of 1 + usage() + return + algorithm = 'english' + if len(argv) > 2: + algorithm = argv[1] + argv = argv[2:] + else: + argv = argv[1:] + stemmer = snowballstemmer.stemmer(algorithm) + splitter = re.compile(r"[\s\.-]") + for arg in argv: + for word in splitter.split(arg): + if word == '': + continue + original = word.lower() + print(original + " -> " + stemmer.stemWord(original)) +main()