10
10
import hashlib
11
11
import io
12
12
import os
13
- import re
14
13
import sys
15
14
import traceback
16
15
from collections import Counter
34
33
from licensedcode import MIN_MATCH_HIGH_LENGTH
35
34
from licensedcode import MIN_MATCH_LENGTH
36
35
from licensedcode import SMALL_RULE
36
+ from licensedcode .frontmatter import SaneYAMLHandler
37
+ from licensedcode .frontmatter import FrontmatterPost
38
+ from licensedcode .frontmatter import dumps_frontmatter
39
+ from licensedcode .frontmatter import load_frontmatter
40
+ from licensedcode .frontmatter import get_rule_text
37
41
from licensedcode .languages import LANG_INFO as known_languages
38
42
from licensedcode .spans import Span
39
43
from licensedcode .tokenize import index_tokenizer
40
44
from licensedcode .tokenize import index_tokenizer_with_stopwords
41
45
from licensedcode .tokenize import key_phrase_tokenizer
42
46
from licensedcode .tokenize import KEY_PHRASE_OPEN
43
47
from licensedcode .tokenize import KEY_PHRASE_CLOSE
44
- from licensedcode .tokenize import query_lines
45
48
46
49
"""
47
50
Reference License and license Rule structures persisted as a combo of a YAML
@@ -912,40 +915,32 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
912
915
space_problems = []
913
916
model_errors = []
914
917
915
- for data_file in resource_iter (location = rules_data_dir , with_dirs = False ):
916
- if data_file .endswith ('.yml ' ):
917
- base_name = file_base_name (data_file )
918
+ for rule_file in resource_iter (location = rules_data_dir , with_dirs = False ):
919
+ if rule_file .endswith ('.RULE ' ):
920
+ base_name = file_base_name (rule_file )
918
921
919
922
if with_checks and ' ' in base_name :
920
- space_problems .append (data_file )
921
-
922
- text_file = join (rules_data_dir , f'{ base_name } .RULE' )
923
+ space_problems .append (rule_file )
923
924
924
925
try :
925
- yield Rule .from_files ( data_file = data_file , text_file = text_file )
926
+ yield Rule .from_file ( rule_file = rule_file )
926
927
except Exception as re :
927
928
if with_checks :
928
929
model_errors .append (str (re ))
929
930
930
931
if with_checks :
931
932
# accumulate sets to ensure we do not have illegal names or extra
932
933
# orphaned files
933
- data_file_lower = data_file .lower ()
934
- if data_file_lower in lower_case_files :
935
- case_problems .add (data_file_lower )
936
- else :
937
- lower_case_files .add (data_file_lower )
938
-
939
- text_file_lower = text_file .lower ()
940
- if text_file_lower in lower_case_files :
941
- case_problems .add (text_file_lower )
934
+ rule_file_lower = rule_file .lower ()
935
+ if rule_file_lower in lower_case_files :
936
+ case_problems .add (rule_file_lower )
942
937
else :
943
- lower_case_files .add (text_file_lower )
938
+ lower_case_files .add (rule_file_lower )
944
939
945
- processed_files .update ([ data_file , text_file ] )
940
+ processed_files .add ( rule_file )
946
941
947
- if with_checks and not data_file .endswith ('~' ):
948
- seen_files .add (data_file )
942
+ if with_checks and not rule_file .endswith ('~' ):
943
+ seen_files .add (rule_file )
949
944
950
945
if with_checks :
951
946
unknown_files = seen_files - processed_files
@@ -955,29 +950,29 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
955
950
if model_errors :
956
951
errors = '\n ' .join (model_errors )
957
952
msg += (
958
- '\n Invalid rule YAML file in directory: '
959
- f'{ rules_data_dir !r} \n { errors } '
953
+ '\n Invalid rule file in directory: '
954
+ f'{ rules_data_dir !r} \n '
960
955
)
961
956
962
957
if unknown_files :
963
958
files = '\n ' .join (sorted (f'file://{ f } "' for f in unknown_files ))
964
959
msg += (
965
960
'\n Orphaned files in rule directory: '
966
- f'{ rules_data_dir !r} \n { files } '
961
+ f'{ rules_data_dir !r} \n '
967
962
)
968
963
969
964
if case_problems :
970
965
files = '\n ' .join (sorted (f'"file://{ f } "' for f in case_problems ))
971
966
msg += (
972
967
'\n Rule files with non-unique name in rule directory: '
973
- f'{ rules_data_dir !r} \n { files } '
968
+ f'{ rules_data_dir !r} \n '
974
969
)
975
970
976
971
if space_problems :
977
972
files = '\n ' .join (sorted (f'"file://{ f } "' for f in space_problems ))
978
973
msg += (
979
974
'\n Rule filename cannot contain spaces: '
980
- f'{ rules_data_dir !r} \n { files } '
975
+ f'{ rules_data_dir !r} \n '
981
976
)
982
977
983
978
raise InvalidRule (msg )
@@ -1390,28 +1385,18 @@ class BasicRule:
1390
1385
'position is using the magic -1 key.' )
1391
1386
)
1392
1387
1393
- def data_file (
1388
+ def rule_file (
1394
1389
self ,
1395
1390
rules_data_dir = rules_data_dir ,
1396
1391
licenses_data_dir = licenses_data_dir ,
1397
1392
):
1398
- data_file_base_name = file_base_name (self .identifier )
1399
- data_file_name = f'{ data_file_base_name } .yml '
1393
+ rule_file_base_name = file_base_name (self .identifier )
1394
+ rule_file_name = f'{ rule_file_base_name } .RULE '
1400
1395
1401
1396
if self .is_from_license :
1402
- return join (licenses_data_dir , data_file_name )
1403
- else :
1404
- return join (rules_data_dir , data_file_name )
1405
-
1406
- def text_file (
1407
- self ,
1408
- rules_data_dir = rules_data_dir ,
1409
- licenses_data_dir = licenses_data_dir ,
1410
- ):
1411
- if self .is_from_license :
1412
- return join (licenses_data_dir , f'{ self .identifier } ' )
1397
+ return join (licenses_data_dir , rule_file_name )
1413
1398
else :
1414
- return join (rules_data_dir , f' { self . identifier } ' )
1399
+ return join (rules_data_dir , rule_file_name )
1415
1400
1416
1401
def __attrs_post_init__ (self , * args , ** kwargs ):
1417
1402
self .setup ()
@@ -1431,13 +1416,13 @@ def setup(self):
1431
1416
trace = traceback .format_exc ()
1432
1417
raise InvalidRule (
1433
1418
f'Unable to parse Rule license expression: { exp !r} '
1434
- f'for: file://{ self .data_file } \n { trace } '
1419
+ f'for: file://{ self .identifier } \n { trace } '
1435
1420
) from e
1436
1421
1437
1422
if expression is None :
1438
1423
raise InvalidRule (
1439
1424
f'Invalid rule License expression parsed to empty: '
1440
- f'{ self .license_expression !r} for: file://{ self .data_file } '
1425
+ f'{ self .license_expression !r} for: file://{ self .identifier } '
1441
1426
)
1442
1427
1443
1428
self .license_expression = expression .render ()
@@ -1655,15 +1640,7 @@ def to_dict(self):
1655
1640
return data
1656
1641
1657
1642
1658
- def get_rule_text (location = None , text = None ):
1659
- """
1660
- Return the rule ``text`` prepared for indexing.
1661
- ###############
1662
- # IMPORTANT: we use the same process as used to load query text for symmetry
1663
- ###############
1664
- """
1665
- numbered_lines = query_lines (location = location , query_string = text , plain_text = True )
1666
- return '\n ' .join (l .strip () for _ , l in numbered_lines )
1643
+
1667
1644
1668
1645
1669
1646
def has_only_lower_license_keys (license_expression , licensing = Licensing ()):
@@ -1711,13 +1688,13 @@ def __attrs_post_init__(self, *args, **kwargs):
1711
1688
self .setup ()
1712
1689
1713
1690
@classmethod
1714
- def from_files (cls , data_file , text_file ):
1691
+ def from_file (cls , rule_file ):
1715
1692
"""
1716
1693
Return a new Rule object loaded from a data file stored at
1717
1694
``rule_file``, which holds both the rule data and the rule text.
1718
1695
"""
1719
1696
rule = Rule ()
1720
- rule .load_data (data_file = data_file , text_file = text_file )
1697
+ rule .load_data (rule_file = rule_file )
1721
1698
return rule
1722
1699
1723
1700
@classmethod
@@ -1786,29 +1763,29 @@ def _from_expression(cls, license_expression=None, identifier=None, **kwargs):
1786
1763
rule .setup ()
1787
1764
return rule
1788
1765
1789
- def load_data (self , data_file , text_file ):
1766
+ def load_data (self , rule_file ):
1790
1767
"""
1791
- Load data from ``data_file `` and ``text_file``. Check presence of text
1792
- file to determine if this is a special synthetic rule.
1768
+ Load data from ``rule_file `` which has both the text and the data (as YAML frontmatter).
1769
+ Check presence of text file to determine if this is a special synthetic rule.
1793
1770
"""
1794
1771
if self .is_synthetic :
1795
1772
if not self .text :
1796
1773
raise InvalidRule (
1797
1774
f'Invalid synthetic rule without text: { self } : { self .text !r} ' )
1798
1775
return self
1799
1776
1800
- if not data_file or not text_file :
1777
+ if not rule_file :
1801
1778
raise InvalidRule (
1802
- f'Cannot load rule without its corresponding text_file and data file : '
1803
- f'{ self } : file://{ data_file } file:// { text_file } ' )
1779
+ f'Cannot load rule without its corresponding rule_file : '
1780
+ f'{ self } : file://{ rule_file } ' )
1804
1781
1805
- self .identifier = file_name (text_file )
1782
+ self .identifier = file_name (rule_file )
1806
1783
1807
1784
try :
1808
- self .load (data_file = data_file , text_file = text_file )
1785
+ self .load (rule_file = rule_file )
1809
1786
except Exception :
1810
1787
trace = traceback .format_exc ()
1811
- raise InvalidRule (f'While loading: file://{ data_file } \n { trace } ' )
1788
+ raise InvalidRule (f'While loading: file://{ rule_file } \n { trace } ' )
1812
1789
1813
1790
return self
1814
1791
@@ -1895,10 +1872,12 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
1895
1872
1896
1873
def dump (self , rules_data_dir ):
1897
1874
"""
1898
- Dump a representation of this rule as two files stored in
1899
- ``rules_data_dir``:
1900
- - a .yml for the rule data in YAML (e.g., data_file)
1901
- - a .RULE: the rule text as a UTF-8 file (e.g., text_file)
1875
+ Dump a representation of this rule as a .RULE file stored in
1876
+ ``rules_data_dir`` as a UTF-8 file having:
1877
+ - the rule data as YAML frontmatter
1878
+ - the rule text
1879
+ and this is a `rule_file`.
1880
+
1902
1881
Does nothing if this rule was created from a License (e.g.,
1903
1882
`is_from_license` is True)
1904
1883
"""
@@ -1911,28 +1890,35 @@ def write(location, byte_string):
1911
1890
with io .open (location , 'wb' ) as of :
1912
1891
of .write (byte_string )
1913
1892
1914
- data_file = self .data_file (rules_data_dir = rules_data_dir )
1915
- as_yaml = saneyaml .dump (self .to_dict (), indent = 4 , encoding = 'utf-8' )
1916
- write (data_file , as_yaml )
1893
+ rule_file = self .rule_file (rules_data_dir = rules_data_dir )
1917
1894
1918
- text_file = self .text_file (rules_data_dir = rules_data_dir )
1919
- write (text_file , self .text .encode ('utf-8' ))
1895
+ metadata = self .to_dict ()
1896
+ content = self .text .encode ('utf-8' )
1897
+ rule_post = FrontmatterPost (content = content , handler = SaneYAMLHandler (), ** metadata )
1898
+ output_string = dumps_frontmatter (post = rule_post )
1920
1899
1921
- def load (self , data_file , text_file , with_checks = True ):
1900
+ write (rule_file , output_string .encode ('utf-8' ))
1901
+
1902
+ def load (self , rule_file , with_checks = True ):
1922
1903
"""
1923
- Load self from a .RULE YAML file stored in data_file and text_file.
1904
+ Load self from a .RULE file with YAML frontmatter stored in rule_file.
1924
1905
Unknown fields are ignored and not bound to the Rule object.
1925
1906
Optionally check for consistency if ``with_checks`` is True.
1926
1907
"""
1927
1908
try :
1928
- with io .open (data_file , encoding = 'utf-8' ) as f :
1929
- data = saneyaml .load (f .read (), allow_duplicate_keys = False )
1909
+ post = load_frontmatter (rule_file )
1910
+ data = post .metadata
1911
+ if not post .content :
1912
+ raise InvalidRule (
1913
+ f'Cannot load rule with empty text: '
1914
+ f'{ self } : file://{ rule_file } '
1915
+ )
1930
1916
1931
- self .text = get_rule_text ( location = text_file )
1917
+ self .text = post . content
1932
1918
1933
1919
except Exception as e :
1934
1920
print ('#############################' )
1935
- print ('INVALID LICENSE RULE FILE:' , f'file://{ data_file } ' , f'file:// { text_file } ' )
1921
+ print ('INVALID LICENSE RULE FILE:' , f'file://{ rule_file } ' )
1936
1922
print ('#############################' )
1937
1923
print (e )
1938
1924
print ('#############################' )
0 commit comments