Skip to content

Commit 6e38358

Browse files
Modify RULE loading/writing to use frontmatter
Rule classmethod from_files is renamed and modified to from_file and now takes only the rule_file as argument. Also modifies tests to use frontmatter. Signed-off-by: AyanSinhaMahapatra <[email protected]>
1 parent bd3545c commit 6e38358

File tree

4 files changed

+74
-95
lines changed

4 files changed

+74
-95
lines changed

src/licensedcode/models.py

+65-79
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import hashlib
1111
import io
1212
import os
13-
import re
1413
import sys
1514
import traceback
1615
from collections import Counter
@@ -34,14 +33,18 @@
3433
from licensedcode import MIN_MATCH_HIGH_LENGTH
3534
from licensedcode import MIN_MATCH_LENGTH
3635
from licensedcode import SMALL_RULE
36+
from licensedcode.frontmatter import SaneYAMLHandler
37+
from licensedcode.frontmatter import FrontmatterPost
38+
from licensedcode.frontmatter import dumps_frontmatter
39+
from licensedcode.frontmatter import load_frontmatter
40+
from licensedcode.frontmatter import get_rule_text
3741
from licensedcode.languages import LANG_INFO as known_languages
3842
from licensedcode.spans import Span
3943
from licensedcode.tokenize import index_tokenizer
4044
from licensedcode.tokenize import index_tokenizer_with_stopwords
4145
from licensedcode.tokenize import key_phrase_tokenizer
4246
from licensedcode.tokenize import KEY_PHRASE_OPEN
4347
from licensedcode.tokenize import KEY_PHRASE_CLOSE
44-
from licensedcode.tokenize import query_lines
4548

4649
"""
4750
Reference License and license Rule structures persisted as a combo of a YAML
@@ -912,40 +915,32 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
912915
space_problems = []
913916
model_errors = []
914917

915-
for data_file in resource_iter(location=rules_data_dir, with_dirs=False):
916-
if data_file.endswith('.yml'):
917-
base_name = file_base_name(data_file)
918+
for rule_file in resource_iter(location=rules_data_dir, with_dirs=False):
919+
if rule_file.endswith('.RULE'):
920+
base_name = file_base_name(rule_file)
918921

919922
if with_checks and ' ' in base_name:
920-
space_problems.append(data_file)
921-
922-
text_file = join(rules_data_dir, f'{base_name}.RULE')
923+
space_problems.append(rule_file)
923924

924925
try:
925-
yield Rule.from_files(data_file=data_file, text_file=text_file)
926+
yield Rule.from_file(rule_file=rule_file)
926927
except Exception as re:
927928
if with_checks:
928929
model_errors.append(str(re))
929930

930931
if with_checks:
931932
# accumulate sets to ensures we do not have illegal names or extra
932933
# orphaned files
933-
data_file_lower = data_file.lower()
934-
if data_file_lower in lower_case_files:
935-
case_problems.add(data_file_lower)
936-
else:
937-
lower_case_files.add(data_file_lower)
938-
939-
text_file_lower = text_file.lower()
940-
if text_file_lower in lower_case_files:
941-
case_problems.add(text_file_lower)
934+
rule_file_lower = rule_file.lower()
935+
if rule_file_lower in lower_case_files:
936+
case_problems.add(rule_file_lower)
942937
else:
943-
lower_case_files.add(text_file_lower)
938+
lower_case_files.add(rule_file_lower)
944939

945-
processed_files.update([data_file, text_file])
940+
processed_files.add(rule_file)
946941

947-
if with_checks and not data_file.endswith('~'):
948-
seen_files.add(data_file)
942+
if with_checks and not rule_file.endswith('~'):
943+
seen_files.add(rule_file)
949944

950945
if with_checks:
951946
unknown_files = seen_files - processed_files
@@ -955,29 +950,29 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
955950
if model_errors:
956951
errors = '\n'.join(model_errors)
957952
msg += (
958-
'\nInvalid rule YAML file in directory: '
959-
f'{rules_data_dir!r}\n{errors}'
953+
'\nInvalid rule file in directory: '
954+
f'{rules_data_dir!r}\n'
960955
)
961956

962957
if unknown_files:
963958
files = '\n'.join(sorted(f'file://{f}"' for f in unknown_files))
964959
msg += (
965960
'\nOrphaned files in rule directory: '
966-
f'{rules_data_dir!r}\n{files}'
961+
f'{rules_data_dir!r}\n'
967962
)
968963

969964
if case_problems:
970965
files = '\n'.join(sorted(f'"file://{f}"' for f in case_problems))
971966
msg += (
972967
'\nRule files with non-unique name in rule directory: '
973-
f'{rules_data_dir!r}\n{files}'
968+
f'{rules_data_dir!r}\n'
974969
)
975970

976971
if space_problems:
977972
files = '\n'.join(sorted(f'"file://{f}"' for f in space_problems))
978973
msg += (
979974
'\nRule filename cannot contain spaces: '
980-
f'{rules_data_dir!r}\n{files}'
975+
f'{rules_data_dir!r}\n'
981976
)
982977

983978
raise InvalidRule(msg)
@@ -1390,28 +1385,18 @@ class BasicRule:
13901385
'position is using the magic -1 key.')
13911386
)
13921387

1393-
def data_file(
1388+
def rule_file(
13941389
self,
13951390
rules_data_dir=rules_data_dir,
13961391
licenses_data_dir=licenses_data_dir,
13971392
):
1398-
data_file_base_name = file_base_name(self.identifier)
1399-
data_file_name = f'{data_file_base_name}.yml'
1393+
rule_file_base_name = file_base_name(self.identifier)
1394+
rule_file_name = f'{rule_file_base_name}.RULE'
14001395

14011396
if self.is_from_license:
1402-
return join(licenses_data_dir, data_file_name)
1403-
else:
1404-
return join(rules_data_dir, data_file_name)
1405-
1406-
def text_file(
1407-
self,
1408-
rules_data_dir=rules_data_dir,
1409-
licenses_data_dir=licenses_data_dir,
1410-
):
1411-
if self.is_from_license:
1412-
return join(licenses_data_dir, f'{self.identifier}')
1397+
return join(licenses_data_dir, rule_file_name)
14131398
else:
1414-
return join(rules_data_dir, f'{self.identifier}')
1399+
return join(rules_data_dir, rule_file_name)
14151400

14161401
def __attrs_post_init__(self, *args, **kwargs):
14171402
self.setup()
@@ -1431,13 +1416,13 @@ def setup(self):
14311416
trace = traceback.format_exc()
14321417
raise InvalidRule(
14331418
f'Unable to parse Rule license expression: {exp!r} '
1434-
f'for: file://{self.data_file}\n{trace}'
1419+
f'for: file://{self.identifier}\n{trace}'
14351420
) from e
14361421

14371422
if expression is None:
14381423
raise InvalidRule(
14391424
f'Invalid rule License expression parsed to empty: '
1440-
f'{self.license_expression!r} for: file://{self.data_file}'
1425+
f'{self.license_expression!r} for: file://{self.identifier}'
14411426
)
14421427

14431428
self.license_expression = expression.render()
@@ -1655,15 +1640,7 @@ def to_dict(self):
16551640
return data
16561641

16571642

1658-
def get_rule_text(location=None, text=None):
1659-
"""
1660-
Return the rule ``text`` prepared for indexing.
1661-
###############
1662-
# IMPORTANT: we use the same process as used to load query text for symmetry
1663-
###############
1664-
"""
1665-
numbered_lines = query_lines(location=location, query_string=text, plain_text=True)
1666-
return '\n'.join(l.strip() for _, l in numbered_lines)
1643+
16671644

16681645

16691646
def has_only_lower_license_keys(license_expression, licensing=Licensing()):
@@ -1711,13 +1688,13 @@ def __attrs_post_init__(self, *args, **kwargs):
17111688
self.setup()
17121689

17131690
@classmethod
1714-
def from_files(cls, data_file, text_file):
1691+
def from_file(cls, rule_file):
17151692
"""
17161693
Return a new Rule object loaded from a data file stored at
17171694
``data_file`` and a companion ``text_file``.
17181695
"""
17191696
rule = Rule()
1720-
rule.load_data(data_file=data_file, text_file=text_file)
1697+
rule.load_data(rule_file=rule_file)
17211698
return rule
17221699

17231700
@classmethod
@@ -1786,29 +1763,29 @@ def _from_expression(cls, license_expression=None, identifier=None, **kwargs):
17861763
rule.setup()
17871764
return rule
17881765

1789-
def load_data(self, data_file, text_file):
1766+
def load_data(self, rule_file):
17901767
"""
1791-
Load data from ``data_file`` and ``text_file``. Check presence of text
1792-
file to determine if this is a special synthetic rule.
1768+
Load data from ``rule_file`` which has both the text and the data (as YAML frontmatter).
1769+
Check presence of text file to determine if this is a special synthetic rule.
17931770
"""
17941771
if self.is_synthetic:
17951772
if not self.text:
17961773
raise InvalidRule(
17971774
f'Invalid synthetic rule without text: {self}: {self.text!r}')
17981775
return self
17991776

1800-
if not data_file or not text_file:
1777+
if not rule_file :
18011778
raise InvalidRule(
1802-
f'Cannot load rule without its corresponding text_file and data file: '
1803-
f'{self}: file://{data_file} file://{text_file}')
1779+
f'Cannot load rule without its corresponding rule_file: '
1780+
f'{self}: file://{rule_file}')
18041781

1805-
self.identifier = file_name(text_file)
1782+
self.identifier = file_name(rule_file)
18061783

18071784
try:
1808-
self.load(data_file=data_file, text_file=text_file)
1785+
self.load(rule_file=rule_file)
18091786
except Exception:
18101787
trace = traceback.format_exc()
1811-
raise InvalidRule(f'While loading: file://{data_file}\n{trace}')
1788+
raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')
18121789

18131790
return self
18141791

@@ -1895,10 +1872,12 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
18951872

18961873
def dump(self, rules_data_dir):
18971874
"""
1898-
Dump a representation of this rule as two files stored in
1899-
``rules_data_dir``:
1900-
- a .yml for the rule data in YAML (e.g., data_file)
1901-
- a .RULE: the rule text as a UTF-8 file (e.g., text_file)
1875+
Dump a representation of this rule as a .RULE file stored in
1876+
``rules_data_dir`` as a UTF-8 file having:
1877+
- the rule data as YAML frontmatter
1878+
- the rule text
1879+
and this is a `rule_file`.
1880+
19021881
Does nothing if this rule was created from a License (e.g.,
19031882
`is_from_license` is True)
19041883
"""
@@ -1911,28 +1890,35 @@ def write(location, byte_string):
19111890
with io.open(location, 'wb') as of:
19121891
of.write(byte_string)
19131892

1914-
data_file = self.data_file(rules_data_dir=rules_data_dir)
1915-
as_yaml = saneyaml.dump(self.to_dict(), indent=4, encoding='utf-8')
1916-
write(data_file, as_yaml)
1893+
rule_file = self.rule_file(rules_data_dir=rules_data_dir)
19171894

1918-
text_file = self.text_file(rules_data_dir=rules_data_dir)
1919-
write(text_file, self.text.encode('utf-8'))
1895+
metadata = self.to_dict()
1896+
content = self.text.encode('utf-8')
1897+
rule_post = FrontmatterPost(content=content, handler=SaneYAMLHandler(), **metadata)
1898+
output_string = dumps_frontmatter(post=rule_post)
19201899

1921-
def load(self, data_file, text_file, with_checks=True):
1900+
write(rule_file, output_string.encode('utf-8'))
1901+
1902+
def load(self, rule_file, with_checks=True):
19221903
"""
1923-
Load self from a .RULE YAML file stored in data_file and text_file.
1904+
Load self from a .RULE file with YAML frontmatter stored in ``rule_file``.
19241905
Unknown fields are ignored and not bound to the Rule object.
19251906
Optionally check for consistency if ``with_checks`` is True.
19261907
"""
19271908
try:
1928-
with io.open(data_file, encoding='utf-8') as f:
1929-
data = saneyaml.load(f.read(), allow_duplicate_keys=False)
1909+
post = load_frontmatter(rule_file)
1910+
data = post.metadata
1911+
if not post.content:
1912+
raise InvalidRule(
1913+
f'Cannot load rule with empty text: '
1914+
f'{self}: file://{rule_file}'
1915+
)
19301916

1931-
self.text = get_rule_text(location=text_file)
1917+
self.text = post.content
19321918

19331919
except Exception as e:
19341920
print('#############################')
1935-
print('INVALID LICENSE RULE FILE:', f'file://{data_file}', f'file://{text_file}')
1921+
print('INVALID LICENSE RULE FILE:', f'file://{rule_file}')
19361922
print('#############################')
19371923
print(e)
19381924
print('#############################')

tests/licensedcode/test_match_hash.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def test_match_hash_can_match_exactly(self):
2626
rule_dir = self.get_test_loc('hash/rules')
2727
rules = list(models.load_rules(rule_dir))
2828
idx = index.LicenseIndex(rules)
29-
query_doc = self.get_test_loc('hash/rules/lgpl-2.0-plus_23.RULE')
29+
query_doc = self.get_test_loc('hash/old_rules/lgpl-2.0-plus_23.RULE')
3030

3131
matches = idx.match(query_doc)
3232
assert len(matches) == 1

tests/licensedcode/test_models.py

+7-14
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,7 @@ def test_rule_from_license_have_text_file_and_data_file_are_computed_correctly(s
164164
lic = licenses['gpl-1.0']
165165
rule = models.build_rule_from_license(license_obj=lic)
166166

167-
assert rule.text_file(
168-
licenses_data_dir=licenses_data_dir,
169-
rules_data_dir=None,
170-
).startswith(licenses_data_dir)
171-
172-
assert rule.data_file(
167+
assert rule.rule_file(
173168
licenses_data_dir=licenses_data_dir,
174169
rules_data_dir=None,
175170
).startswith(licenses_data_dir)
@@ -510,20 +505,19 @@ def test_compute_relevance_is_using_rule_length(self):
510505
assert rule.relevance == 0
511506

512507
def test_rule_must_have_text(self):
513-
data_file = self.get_test_loc('models/rule_no_text/mit.yml')
508+
rule_file = self.get_test_loc('models/rule_no_text/mit.RULE')
514509
try:
515-
Rule.from_files(data_file=data_file, text_file=None)
510+
Rule.from_file(rule_file=rule_file)
516511
self.fail('Exception not raised.')
517512
except InvalidRule as e:
518-
assert str(e).startswith('Cannot load rule without its corresponding text_file and data file')
513+
assert 'Cannot load rule with empty text' in str(e)
519514

520515
def test_rule_cannot_contain_extra_unknown_attributes(self):
521-
data_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.yml')
522-
text_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.RULE')
516+
rule_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.RULE')
523517

524518
expected = 'data file has unknown attributes: license_expressionnotuce'
525519
try:
526-
Rule.from_files(data_file=data_file, text_file=text_file)
520+
Rule.from_file(rule_file=rule_file)
527521
self.fail('Exception not raised.')
528522
except Exception as e:
529523
assert expected in str(e)
@@ -579,8 +573,7 @@ def test_rule_text_file_and_data_file_are_computed_correctly(self):
579573
rule_dir = self.get_test_loc('models/data_text_files/rules')
580574
rules = list(models.load_rules(rule_dir))
581575
rule = rules[0]
582-
assert rule.text_file(rules_data_dir=rule_dir).startswith(rule_dir)
583-
assert rule.data_file(rules_data_dir=rule_dir).startswith(rule_dir)
576+
assert rule.rule_file(rules_data_dir=rule_dir).startswith(rule_dir)
584577

585578

586579
class TestGetKeyPhrases(TestCaseClass):

tests/licensedcode/test_query.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def test_query_and_index_tokens_are_identical_for_same_text(self):
283283
rule_dir = self.get_test_loc('query/rtos_exact/')
284284
from licensedcode.models import load_rules
285285
idx = index.LicenseIndex(load_rules(rule_dir))
286-
query_loc = self.get_test_loc('query/rtos_exact/gpl-2.0-freertos.RULE')
286+
query_loc = self.get_test_loc('query/old_rtos_exact/gpl-2.0-freertos.RULE')
287287

288288
index_text_tokens = [idx.tokens_by_tid[t] for t in idx.tids_by_rid[0]]
289289

0 commit comments

Comments
 (0)