Skip to content

Commit b6320d8

Browse files
committed
support selection of possible languages for lingua
1 parent d5f2118 commit b6320d8

File tree

4 files changed

+37
-10
lines changed

4 files changed

+37
-10
lines changed

docs/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- support for selection of possible languages for `lingua` language detection
13+
1014
## [3.2.0] - 2024-08-14
1115

1216
### Changed

docs/filters/script_and_language_identification_filters.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ Parameters:
3838
* `id_method`: language identification method (`langid`, `lingua`, `cld2`, `fasttext`; default `langid`)
3939
* `thresholds`: minimum identification confidence score for the segments (a single float or a list of floats per language)
4040
* `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`)
41-
* `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`)
41+
* `langid_languages`: limit detection to a list of ISO 639-1 codes for possible languages (valid only for the `langid` and `lingua` methods; default `null`)
4242
* `cld2_options`: a dictionary of options for the `cld2` method (valid only for the `cld2` method; default `null`)
4343
* `lingua_mode`: a string specifying whether to use lingua's `high` or `low` accuracy mode (valid only for the `lingua` method; `low` is used when unset)
4444

opusfilter/filters.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,9 @@ def __init__(self, languages=None, id_method='langid', thresholds=None,
305305
self.cld2_options = None
306306
self.fasttext_model = None
307307
self.lingua_detector = None
308+
if langid_languages and id_method not in {'langid', 'lingua'}:
309+
raise ConfigurationError(
310+
"langid_languages option is supported only by the langid and lingua methods")
308311
if id_method == 'fasttext':
309312
self.init_fastttext(fasttext_model_path)
310313
else:
@@ -313,17 +316,13 @@ def __init__(self, languages=None, id_method='langid', thresholds=None,
313316
"path to model was set")
314317
if id_method == 'langid':
315318
self.init_langid(langid_languages)
316-
else:
317-
if langid_languages:
318-
raise ConfigurationError(
319-
"langid_languages option is supported only by the method langid")
320319
if id_method == 'cld2':
321320
self.cld2_options = cld2_options if cld2_options else {}
322321
else:
323322
if cld2_options:
324323
raise ConfigurationError("cld2_options is supported only by the method cld2")
325324
if id_method == "lingua":
326-
self.init_lingua(lingua_mode if lingua_mode else 'low')
325+
self.init_lingua(lingua_mode if lingua_mode else 'low', langid_languages)
327326
else:
328327
if lingua_mode:
329328
raise ConfigurationError("lingua_mode is supported only by the method lingua")
@@ -351,11 +350,17 @@ def init_fastttext(self, fasttext_model_path):
351350
raise
352351
self.fasttext_model = fasttext.load_model(os.path.join(self.workdir, fasttext_model_path))
353352

354-
def init_lingua(self, lingua_mode):
353+
def init_lingua(self, lingua_mode, languages):
355354
"""Initialize lingua identifier"""
356-
from lingua import LanguageDetectorBuilder
357-
# TODO: support lingua_languages just like langid_languages
358-
from_languages = LanguageDetectorBuilder.from_all_languages()
355+
from lingua import LanguageDetectorBuilder, IsoCode639_1
356+
if languages:
357+
for code in languages:
358+
if not hasattr(IsoCode639_1, code.upper()):
359+
raise ConfigurationError(f"Language {code} not supported by lingua")
360+
from_languages = LanguageDetectorBuilder.from_iso_codes_639_1(
361+
*[getattr(IsoCode639_1, code.upper()) for code in languages])
362+
else:
363+
from_languages = LanguageDetectorBuilder.from_all_languages()
359364
if lingua_mode == "high":
360365
self.lingua_detector = from_languages.with_preloaded_language_models().build()
361366
elif lingua_mode == "low":

tests/test_lid.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,21 @@ def test_accept_high(self):
149149
pair_expecteds = [True, False]
150150
for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
151151
self.assertEqual(model.accept(pair_score), pair_expected)
152+
153+
def test_limited_languages(self):
154+
model = LanguageIDFilter(
155+
languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99],
156+
langid_languages=['en', 'fr', 'de', 'fi'])
157+
pair_scores = model.score(self.pairs_inputs)
158+
pair_expecteds = [True, False]
159+
for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
160+
self.assertEqual(model.accept(pair_score), pair_expected)
161+
162+
def test_badly_limited_languages(self):
163+
model = LanguageIDFilter(
164+
languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99],
165+
langid_languages=['en', 'de', 'fi'])
166+
pair_scores = model.score(self.pairs_inputs)
167+
pair_expecteds = [False, False]
168+
for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
169+
self.assertEqual(model.accept(pair_score), pair_expected)

0 commit comments

Comments
 (0)