1
1
"""XML Annotation processor."""
2
+
2
3
from collections import defaultdict
3
- from typing import Dict , Any , Tuple
4
+ from typing import Dict , Any
4
5
5
- from lxml import etree
6
6
from inscriptis .annotation .output import AnnotationProcessor
7
7
8
8
@@ -11,101 +11,20 @@ class XmlExtractor(AnnotationProcessor):
11
11
12
12
verbatim = True
13
13
14
- def traverse_element (self , root , text , start , end , annotations , idx ) -> int :
15
- while idx + 1 < len (annotations ):
16
- idx += 1
17
- next_start , next_end , label = annotations [idx ]["label" ]
18
- # recurse?
19
- if next_start < end :
20
- leaf = etree .Element (root , label )
21
- cascaded_end = self .traverse_element (leaf , text , next_start , next_end , idx )
22
- else :
23
- root .tail += text [start : cascaded_end ]
24
-
25
-
26
-
27
- def __call__ (self , annotated_text : Dict [str , Any ], root_element = 'r' ) -> str :
28
- text = annotated_text ["text" ]
29
- annotations = sorted (annotated_text ["label" ])
30
- root = etree .Element (root_element )
31
- current_annotation_idx = 0
32
- while current_annotation_idx < len (annotations ):
33
- current_annotation_idx = self .traverse_element (root , text , annotations , idx )
34
-
35
-
36
- for start , end , label in sorted (annotated_text ["label" ]):
37
- current_element = etree .SubElement (root , label )
38
- current_element .text = text [start :end ]
39
-
40
- return etree .tostring (root , pretty_print = True , xml_declaration = True , encoding = "UTF-8" )
41
-
42
- def call3 (self , annotated_text : Dict [str , Any ]) -> str :
43
- tag_indices = defaultdict (list )
44
-
45
- for start , end , label in sorted (annotated_text ["label" ]):
46
- length = end - start
47
- tag_indices [start ].append ((label , length ))
48
- tag_indices [end ].append (("/" + label , length ))
14
+ def __call__ (self , annotated_text : Dict [str , Any ], root_element = "content" ):
15
+ tag_dict = defaultdict (list )
16
+ for start , end , tag in reversed (annotated_text ["label" ]):
17
+ tag_dict [start ].append (f"<{ tag } >" )
18
+ tag_dict [end ].insert (0 , f"</{ tag } >" )
49
19
50
20
current_idx = 0
51
- tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n ' ]
52
21
text = annotated_text ["text" ]
53
- for index , tags in sorted (tag_indices .items ()):
22
+ tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n ' , "<content>\n " ]
23
+ for index , tags in sorted (tag_dict .items ()):
54
24
tagged_content .append (text [current_idx :index ])
55
-
56
- # Separate closing vs opening tags
57
- closing_tags = [t for t in tags if t [0 ].startswith ("/" )]
58
- opening_tags = [t for t in tags if not t [0 ].startswith ("/" )]
59
-
60
- # Sort closing tags by ascending length (so outer closes last)
61
- closing_tags .sort (key = lambda x : x [1 ])
62
- for tag , _ in closing_tags :
63
- tagged_content .append (f"<{ tag } >" )
64
-
65
- # Sort opening tags by descending length (so outer opens first)
66
- opening_tags .sort (key = lambda x : x [1 ], reverse = True )
67
- for tag , _ in opening_tags :
68
- tagged_content .append (f"<{ tag } >" )
69
-
70
25
current_idx = index
71
- tagged_content .append (text [current_idx :])
72
-
73
- return "" .join (tagged_content )
74
-
75
- def call2 (self , annotated_text : Dict [str , Any ]) -> str :
76
- """Provide an XML version of the given text and annotations.
77
-
78
- Args:
79
- annotated_text: a dictionary containing the plain text and the
80
- extracted annotations.
81
-
82
- Returns:
83
- A string with the XML-version of the content.
84
- """
85
- tag_indices = defaultdict (list )
26
+ tagged_content .extend (tags )
86
27
87
- for start , end , label in sorted (annotated_text ["label" ]):
88
- tag_indices [start ].append (label )
89
- tag_indices [end ].append ("/" + label )
90
-
91
- current_idx = 0
92
- tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n ' ]
93
- text = annotated_text ["text" ]
94
- for index , tags in sorted (tag_indices .items ()):
95
- tagged_content .append (text [current_idx :index ])
96
- # close tags
97
- tagged_content .extend (
98
- [
99
- "<" + tag + ">"
100
- for tag in sorted (tags , reverse = True )
101
- if tag .startswith ("/" )
102
- ]
103
- )
104
- # open tags
105
- tagged_content .extend (
106
- ["<" + tag + ">" for tag in sorted (tags ) if not tag .startswith ("/" )]
107
- )
108
- current_idx = index
109
28
tagged_content .append (text [current_idx :])
110
-
29
+ tagged_content . append ( " \n </content>" )
111
30
return "" .join (tagged_content )
0 commit comments