1
1
"""XML Annotation processor."""
2
2
from collections import defaultdict
3
- from typing import Dict , Any
3
+ from typing import Dict , Any , Tuple
4
4
5
+ from lxml import etree
5
6
from inscriptis .annotation .output import AnnotationProcessor
6
7
7
8
@@ -10,7 +11,68 @@ class XmlExtractor(AnnotationProcessor):
10
11
11
12
verbatim = True
12
13
13
- def __call__ (self , annotated_text : Dict [str , Any ]) -> str :
14
def traverse_element(self, root, text: str, start: int, end: int, annotations, idx: int) -> int:
    """Fill ``root`` with the content of ``text[start:end]``.

    Annotations following position ``idx`` that start before ``end`` become
    child elements of ``root`` (recursively); the text between them is stored
    in the usual ElementTree ``text``/``tail`` slots.

    Args:
        root: the element that represents ``text[start:end]``.
        text: the complete annotated text.
        start: index of the first character covered by ``root``.
        end: index one past the last character covered by ``root``.
        annotations: ``(start, end, label)`` triples ordered by start index,
            with enclosing annotations before the ones they contain.
        idx: index of the last annotation that has already been consumed
            (``-1`` if none has been consumed yet).

    Returns:
        The index of the last annotation consumed by ``root`` or one of its
        descendants; the caller continues with the following annotation.
    """
    cursor = start  # first character of text[start:end] not yet emitted
    last_child = None

    def add_text(chunk):
        # Text before the first child belongs to ``root.text``; everything
        # afterwards is the ``tail`` of the most recently added child.
        if not chunk:
            return
        if last_child is None:
            root.text = (root.text or "") + chunk
        else:
            last_child.tail = (last_child.tail or "") + chunk

    while idx + 1 < len(annotations):
        next_start, next_end, label = annotations[idx + 1]
        if next_start >= end:
            # The next annotation lies behind this element; leave it to an
            # ancestor.
            break
        idx += 1

        # Never re-emit text that has already been written (guards against
        # annotations that are not ordered outer-first).
        child_start = max(next_start, cursor)
        # A tree cannot represent partially overlapping spans, so a child
        # reaching beyond its parent is clamped to the parent's end.
        child_end = max(child_start, min(next_end, end))

        add_text(text[cursor:child_start])
        last_child = etree.SubElement(root, label)
        idx = self.traverse_element(
            last_child, text, child_start, child_end, annotations, idx
        )
        cursor = child_end

    add_text(text[cursor:end])
    return idx

def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
    """Provide an XML version of the given text and annotations.

    Args:
        annotated_text: a dictionary with the plain text under ``"text"`` and
            a list of ``(start, end, label)`` annotations under ``"label"``.
        root_element: tag name of the document's root element.

    Returns:
        The serialized XML document (including the XML declaration), with
        nested annotations represented as nested elements.
    """
    text = annotated_text["text"]
    # Enclosing annotations must come first: order by start index, then by
    # descending end index, then by label for a deterministic result.
    annotations = sorted(
        annotated_text["label"], key=lambda item: (item[0], -item[1], item[2])
    )
    root = etree.Element(root_element)
    # ``len(text) + 1`` lets zero-length annotations located at the very end
    # of the text be attached as well.
    self.traverse_element(root, text, 0, len(text) + 1, annotations, -1)

    # pretty_print is deliberately not used: re-indenting would add
    # whitespace that is not part of the original text.
    serialized = etree.tostring(root, xml_declaration=True, encoding="UTF-8")
    # tostring() yields bytes for a byte encoding; the declared return type
    # is ``str``.
    return serialized.decode("UTF-8")
41
+
42
def call3(self, annotated_text: Dict[str, Any]) -> str:
    """Insert XML tags for the annotations directly into the text.

    Args:
        annotated_text: a dictionary with the plain text under ``"text"`` and
            a list of ``(start, end, label)`` annotations under ``"label"``.

    Returns:
        The text, prefixed with an XML declaration, in which every annotated
        span is wrapped into ``<label>...</label>``.

    NOTE(review): the text is inserted as is (no escaping of ``&`` or ``<``)
    and no enclosing root element is added - confirm that callers expect
    this.
    """
    text = annotated_text["text"]
    # Outer annotations first: earlier start, then longer span, then label.
    ordered = sorted(
        annotated_text["label"], key=lambda item: (item[0], -item[1], item[2])
    )

    opening = defaultdict(list)
    closing = defaultdict(list)
    for start, end, label in ordered:
        if start == end:
            # An empty span must open and close in one go; otherwise its
            # closing tag would be written before its opening tag.
            opening[start].append(f"<{label}></{label}>")
        else:
            opening[start].append(f"<{label}>")
            closing[end].append(f"</{label}>")

    current_idx = 0
    tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
    for index in sorted(opening.keys() | closing.keys()):
        tagged_content.append(text[current_idx:index])
        # Close in reverse opening order so that the most recently opened
        # element is closed first, even for annotations with identical spans.
        tagged_content.extend(reversed(closing.get(index, ())))
        tagged_content.extend(opening.get(index, ()))
        current_idx = index
    tagged_content.append(text[current_idx:])

    return "".join(tagged_content)
74
+
75
+ def call2 (self , annotated_text : Dict [str , Any ]) -> str :
14
76
"""Provide an XML version of the given text and annotations.
15
77
16
78
Args:
0 commit comments