1
- import { DC , Fedora } from "./Fedora" ;
1
+ import { Fedora } from "./Fedora" ;
2
2
import Config from "../models/Config" ;
3
3
import FedoraData from "../models/FedoraData" ;
4
- import { DOMParser } from "@xmldom/xmldom" ;
5
- import xpath = require( "xpath" ) ;
4
+ import MetadataExtractor from "./MetadataExtractor" ;
6
5
import TikaExtractor from "./TikaExtractor" ;
7
6
8
7
class HierarchyCollector {
9
8
private static instance : HierarchyCollector ;
10
9
11
10
fedora : Fedora ;
12
- // PIDs that define the top of a hierarchy. Typically this
13
- // includes the overall top PID, plus the top public PID.
14
- hierarchyTops : Array < string > ;
11
+ extractor : MetadataExtractor ;
15
12
config : Config ;
16
13
17
- constructor ( fedora : Fedora , config : Config ) {
14
+ constructor ( fedora : Fedora , extractor : MetadataExtractor , config : Config ) {
18
15
this . fedora = fedora ;
16
+ this . extractor = extractor ;
19
17
this . config = config ;
20
18
}
21
19
22
20
public static getInstance ( ) : HierarchyCollector {
23
21
if ( ! HierarchyCollector . instance ) {
24
- HierarchyCollector . instance = new HierarchyCollector ( Fedora . getInstance ( ) , Config . getInstance ( ) ) ;
22
+ HierarchyCollector . instance = new HierarchyCollector (
23
+ Fedora . getInstance ( ) ,
24
+ MetadataExtractor . getInstance ( ) ,
25
+ Config . getInstance ( )
26
+ ) ;
25
27
}
26
28
return HierarchyCollector . instance ;
27
29
}
28
30
29
- protected extractMetadata ( dc : DC ) : Record < string , Array < string > > {
30
- if ( typeof dc . children === "undefined" ) {
31
- throw new Error ( "Unexpected failure: childless Dublin Core!" ) ;
32
- }
33
- const metadata : Record < string , Array < string > > = { } ;
34
- dc . children . forEach ( ( field ) => {
35
- if ( typeof metadata [ field . name ] === "undefined" ) {
36
- metadata [ field . name ] = [ ] ;
37
- }
38
- metadata [ field . name ] . push ( field . value ) ;
39
- } ) ;
40
- return metadata ;
41
- }
42
-
43
- protected extractRDFXML (
44
- xml : Document ,
45
- namespaces : Record < string , string > ,
46
- xpathQuery : string
47
- ) : Record < string , Array < string > > {
48
- const rdfXPath = xpath . useNamespaces ( namespaces ) ;
49
- const relations : Record < string , Array < string > > = { } ;
50
- rdfXPath ( xpathQuery , xml ) . forEach ( ( relation : Node ) => {
51
- let values = rdfXPath ( "text()" , relation ) as Array < Node > ;
52
- // If there's a namespace on the node name, strip it:
53
- const nodeName = relation . nodeName . split ( ":" ) . pop ( ) ;
54
- if ( values . length === 0 ) {
55
- values = rdfXPath ( "./@rdf:resource" , relation ) as Array < Node > ;
56
- }
57
- if ( values . length > 0 ) {
58
- if ( typeof relations [ nodeName ] === "undefined" ) {
59
- relations [ nodeName ] = [ ] ;
60
- }
61
- relations [ nodeName ] . push ( values [ 0 ] . nodeValue ) ;
62
- }
63
- } ) ;
64
- return relations ;
65
- }
66
-
67
- protected extractRelations ( RELS : string ) : Record < string , Array < string > > {
68
- const xmlParser = new DOMParser ( ) ;
69
- const RELS_XML = xmlParser . parseFromString ( RELS , "text/xml" ) ;
70
- return this . extractRDFXML (
71
- RELS_XML ,
72
- {
73
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
74
- } ,
75
- "//rdf:Description/*"
76
- ) ;
77
- }
78
-
79
- protected extractFedoraDetails ( RDF : string ) : Record < string , Array < string > > {
80
- const xmlParser = new DOMParser ( ) ;
81
- const RDF_XML = xmlParser . parseFromString ( RDF , "text/xml" ) ;
82
- const details = this . extractRDFXML (
83
- RDF_XML ,
84
- {
85
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
86
- fedora : "http://fedora.info/definitions/v4/repository#" ,
87
- "fedora3-model" : "info:fedora/fedora-system:def/model#" ,
88
- "fedora3-view" : "info:fedora/fedora-system:def/view#" ,
89
- } ,
90
- "//rdf:Description/fedora:*|//rdf:Description/fedora3-model:*|//rdf:Description/fedora3-view:*"
91
- ) ;
92
- // The new (F6) created and lastModified properties should take
93
- // precedence over the legacy (F3) createdDate and lastModifiedDate
94
- // properties when present.
95
- if ( typeof details . created !== "undefined" ) {
96
- details . createdDate = details . created ;
97
- delete details . created ;
98
- }
99
- if ( typeof details . lastModified !== "undefined" ) {
100
- details . lastModifiedDate = details . lastModified ;
101
- delete details . lastModified ;
102
- }
103
- return details ;
104
- }
105
-
106
- protected extractFedoraDatastreams ( RDF : string ) : Array < string > {
107
- const xmlParser = new DOMParser ( ) ;
108
- const RDF_XML = xmlParser . parseFromString ( RDF , "text/xml" ) ;
109
- const raw =
110
- this . extractRDFXML (
111
- RDF_XML ,
112
- {
113
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
114
- ldp : "http://www.w3.org/ns/ldp#" ,
115
- } ,
116
- "//ldp:contains"
117
- ) [ "contains" ] ?? [ ] ;
118
- return raw . map ( ( ds ) => {
119
- return ds . split ( "/" ) . pop ( ) ;
120
- } ) ;
121
- }
122
-
123
- protected extractLicense ( XML : string ) : string {
124
- const xmlParser = new DOMParser ( ) ;
125
- const parsedXml = xmlParser . parseFromString ( XML , "text/xml" ) ;
126
- const namespaces = {
127
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
128
- METS : "http://www.loc.gov/METS/" ,
129
- xlink : "http://www.w3.org/1999/xlink" ,
130
- } ;
131
- const rdfXPath = xpath . useNamespaces ( namespaces ) ;
132
- let license = null ;
133
- rdfXPath ( "//@xlink:href" , parsedXml ) . forEach ( ( relation : Node ) => {
134
- license = relation . nodeValue ;
135
- } ) ;
136
- return license ;
137
- }
138
-
139
- protected extractAgents ( xml : string ) : Record < string , Array < string > > {
140
- const xmlParser = new DOMParser ( ) ;
141
- const RDF_XML = xmlParser . parseFromString ( xml , "text/xml" ) ;
142
- return this . extractRDFXML (
143
- RDF_XML ,
144
- {
145
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
146
- METS : "http://www.loc.gov/METS/" ,
147
- } ,
148
- "//METS:agent/*"
149
- ) ;
150
- }
151
-
152
- protected extractFitsData ( xml : string ) : Record < string , Array < string > > {
153
- const xmlParser = new DOMParser ( ) ;
154
- const RDF_XML = xmlParser . parseFromString ( xml , "text/xml" ) ;
155
- const namespaces = {
156
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
157
- fits : "http://hul.harvard.edu/ois/xml/ns/fits/fits_output" ,
158
- } ;
159
- const details = this . extractRDFXML (
160
- RDF_XML ,
161
- namespaces ,
162
- "//fits:fileinfo/fits:size|//fits:imageWidth|//fits:imageHeight"
163
- ) ;
164
- details . mimetype = [ ] ;
165
- const fitsXPath = xpath . useNamespaces ( namespaces ) ;
166
- fitsXPath ( "//fits:identity/@mimetype" , RDF_XML ) . forEach ( ( relation : Node ) => {
167
- details . mimetype . push ( relation . nodeValue ) ;
168
- } ) ;
169
- return details ;
170
- }
171
-
172
- protected extractThumbnailDetails ( xml : string ) : Record < string , Array < string > > {
173
- const xmlParser = new DOMParser ( ) ;
174
- const RDF_XML = xmlParser . parseFromString ( xml , "text/xml" ) ;
175
- return this . extractRDFXML (
176
- RDF_XML ,
177
- {
178
- rdf : "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ,
179
- premis : "http://www.loc.gov/premis/rdf/v1#" ,
180
- } ,
181
- "//premis:*"
182
- ) ;
183
- }
184
-
185
31
async getFedoraData ( pid : string , fetchRdf = true ) : Promise < FedoraData > {
186
32
// Use Fedora to get data
187
33
const DCPromise = this . fedora . getDC ( pid ) ;
@@ -191,25 +37,25 @@ class HierarchyCollector {
191
37
// we can skip fetching more RDF in order to save some time!
192
38
const RDFPromise = fetchRdf ? this . fedora . getRdf ( pid ) : null ;
193
39
const [ DC , RELS , RDF ] = await Promise . all ( [ DCPromise , RELSPromise , RDFPromise ] ) ;
194
- const dataStreams = fetchRdf ? this . extractFedoraDatastreams ( RDF ) : [ ] ;
195
- const relations = this . extractRelations ( RELS ) ;
40
+ const dataStreams = fetchRdf ? this . extractor . extractFedoraDatastreams ( RDF ) : [ ] ;
41
+ const relations = this . extractor . extractRelations ( RELS ) ;
196
42
// Fetch license details if appropriate/available:
197
43
const extraDetails : Record < string , Record < string , Array < string > > > = { } ;
198
44
if ( dataStreams . includes ( "LICENSE" ) ) {
199
45
const licenseStream = await this . fedora . getDatastreamAsString ( pid , "LICENSE" ) ;
200
- extraDetails . license = { url : [ this . extractLicense ( licenseStream ) ] } ;
46
+ extraDetails . license = { url : [ this . extractor . extractLicense ( licenseStream ) ] } ;
201
47
}
202
48
if ( dataStreams . includes ( "AGENTS" ) ) {
203
49
const agentsStream = await this . fedora . getDatastreamAsString ( pid , "AGENTS" ) ;
204
- extraDetails . agents = this . extractAgents ( agentsStream ) ;
50
+ extraDetails . agents = this . extractor . extractAgents ( agentsStream ) ;
205
51
}
206
52
if ( dataStreams . includes ( "THUMBNAIL" ) ) {
207
53
const thumbRdf = await this . fedora . getRdf ( pid + "/THUMBNAIL/fcr:metadata" ) ;
208
- extraDetails . thumbnails = this . extractThumbnailDetails ( thumbRdf ) ;
54
+ extraDetails . thumbnails = this . extractor . extractThumbnailDetails ( thumbRdf ) ;
209
55
}
210
56
if ( dataStreams . includes ( "MASTER-MD" ) ) {
211
57
const fitsXml = await this . fedora . getDatastreamAsString ( pid , "MASTER-MD" ) ;
212
- extraDetails . fitsData = this . extractFitsData ( fitsXml ) ;
58
+ extraDetails . fitsData = this . extractor . extractFitsData ( fitsXml ) ;
213
59
}
214
60
extraDetails . fullText = { } ;
215
61
if ( dataStreams . includes ( "OCR-DIRTY" ) ) {
@@ -223,8 +69,8 @@ class HierarchyCollector {
223
69
return new FedoraData (
224
70
pid ,
225
71
relations ,
226
- this . extractMetadata ( DC ) ,
227
- fetchRdf ? this . extractFedoraDetails ( RDF ) : { } ,
72
+ this . extractor . extractMetadata ( DC ) ,
73
+ fetchRdf ? this . extractor . extractFedoraDetails ( RDF ) : { } ,
228
74
dataStreams ,
229
75
extraDetails
230
76
) ;
0 commit comments