Skip to content

Commit ac0b46d

Browse files
authored
Refactor metadata extraction to a separate class. (#83)
looks great thanks!
1 parent c4600e1 commit ac0b46d

File tree

2 files changed

+246
-172
lines changed

2 files changed

+246
-172
lines changed

api/src/services/HierarchyCollector.ts

Lines changed: 18 additions & 172 deletions
Original file line number | Diff line number | Diff line change
@@ -1,187 +1,33 @@
1-
import { DC, Fedora } from "./Fedora";
1+
import { Fedora } from "./Fedora";
22
import Config from "../models/Config";
33
import FedoraData from "../models/FedoraData";
4-
import { DOMParser } from "@xmldom/xmldom";
5-
import xpath = require("xpath");
4+
import MetadataExtractor from "./MetadataExtractor";
65
import TikaExtractor from "./TikaExtractor";
76

87
class HierarchyCollector {
98
private static instance: HierarchyCollector;
109

1110
fedora: Fedora;
12-
// PIDs that define the top of a hierarchy. Typically this
13-
// includes the overall top PID, plus the top public PID.
14-
hierarchyTops: Array<string>;
11+
extractor: MetadataExtractor;
1512
config: Config;
1613

17-
constructor(fedora: Fedora, config: Config) {
14+
constructor(fedora: Fedora, extractor: MetadataExtractor, config: Config) {
1815
this.fedora = fedora;
16+
this.extractor = extractor;
1917
this.config = config;
2018
}
2119

2220
public static getInstance(): HierarchyCollector {
2321
if (!HierarchyCollector.instance) {
24-
HierarchyCollector.instance = new HierarchyCollector(Fedora.getInstance(), Config.getInstance());
22+
HierarchyCollector.instance = new HierarchyCollector(
23+
Fedora.getInstance(),
24+
MetadataExtractor.getInstance(),
25+
Config.getInstance()
26+
);
2527
}
2628
return HierarchyCollector.instance;
2729
}
2830

29-
protected extractMetadata(dc: DC): Record<string, Array<string>> {
30-
if (typeof dc.children === "undefined") {
31-
throw new Error("Unexpected failure: childless Dublin Core!");
32-
}
33-
const metadata: Record<string, Array<string>> = {};
34-
dc.children.forEach((field) => {
35-
if (typeof metadata[field.name] === "undefined") {
36-
metadata[field.name] = [];
37-
}
38-
metadata[field.name].push(field.value);
39-
});
40-
return metadata;
41-
}
42-
43-
protected extractRDFXML(
44-
xml: Document,
45-
namespaces: Record<string, string>,
46-
xpathQuery: string
47-
): Record<string, Array<string>> {
48-
const rdfXPath = xpath.useNamespaces(namespaces);
49-
const relations: Record<string, Array<string>> = {};
50-
rdfXPath(xpathQuery, xml).forEach((relation: Node) => {
51-
let values = rdfXPath("text()", relation) as Array<Node>;
52-
// If there's a namespace on the node name, strip it:
53-
const nodeName = relation.nodeName.split(":").pop();
54-
if (values.length === 0) {
55-
values = rdfXPath("./@rdf:resource", relation) as Array<Node>;
56-
}
57-
if (values.length > 0) {
58-
if (typeof relations[nodeName] === "undefined") {
59-
relations[nodeName] = [];
60-
}
61-
relations[nodeName].push(values[0].nodeValue);
62-
}
63-
});
64-
return relations;
65-
}
66-
67-
protected extractRelations(RELS: string): Record<string, Array<string>> {
68-
const xmlParser = new DOMParser();
69-
const RELS_XML = xmlParser.parseFromString(RELS, "text/xml");
70-
return this.extractRDFXML(
71-
RELS_XML,
72-
{
73-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
74-
},
75-
"//rdf:Description/*"
76-
);
77-
}
78-
79-
protected extractFedoraDetails(RDF: string): Record<string, Array<string>> {
80-
const xmlParser = new DOMParser();
81-
const RDF_XML = xmlParser.parseFromString(RDF, "text/xml");
82-
const details = this.extractRDFXML(
83-
RDF_XML,
84-
{
85-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
86-
fedora: "http://fedora.info/definitions/v4/repository#",
87-
"fedora3-model": "info:fedora/fedora-system:def/model#",
88-
"fedora3-view": "info:fedora/fedora-system:def/view#",
89-
},
90-
"//rdf:Description/fedora:*|//rdf:Description/fedora3-model:*|//rdf:Description/fedora3-view:*"
91-
);
92-
// The new (F6) created and lastModified properties should take
93-
// precedence over the legacy (F3) createdDate and lastModifiedDate
94-
// properties when present.
95-
if (typeof details.created !== "undefined") {
96-
details.createdDate = details.created;
97-
delete details.created;
98-
}
99-
if (typeof details.lastModified !== "undefined") {
100-
details.lastModifiedDate = details.lastModified;
101-
delete details.lastModified;
102-
}
103-
return details;
104-
}
105-
106-
protected extractFedoraDatastreams(RDF: string): Array<string> {
107-
const xmlParser = new DOMParser();
108-
const RDF_XML = xmlParser.parseFromString(RDF, "text/xml");
109-
const raw =
110-
this.extractRDFXML(
111-
RDF_XML,
112-
{
113-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
114-
ldp: "http://www.w3.org/ns/ldp#",
115-
},
116-
"//ldp:contains"
117-
)["contains"] ?? [];
118-
return raw.map((ds) => {
119-
return ds.split("/").pop();
120-
});
121-
}
122-
123-
protected extractLicense(XML: string): string {
124-
const xmlParser = new DOMParser();
125-
const parsedXml = xmlParser.parseFromString(XML, "text/xml");
126-
const namespaces = {
127-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
128-
METS: "http://www.loc.gov/METS/",
129-
xlink: "http://www.w3.org/1999/xlink",
130-
};
131-
const rdfXPath = xpath.useNamespaces(namespaces);
132-
let license = null;
133-
rdfXPath("//@xlink:href", parsedXml).forEach((relation: Node) => {
134-
license = relation.nodeValue;
135-
});
136-
return license;
137-
}
138-
139-
protected extractAgents(xml: string): Record<string, Array<string>> {
140-
const xmlParser = new DOMParser();
141-
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
142-
return this.extractRDFXML(
143-
RDF_XML,
144-
{
145-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
146-
METS: "http://www.loc.gov/METS/",
147-
},
148-
"//METS:agent/*"
149-
);
150-
}
151-
152-
protected extractFitsData(xml: string): Record<string, Array<string>> {
153-
const xmlParser = new DOMParser();
154-
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
155-
const namespaces = {
156-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
157-
fits: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
158-
};
159-
const details = this.extractRDFXML(
160-
RDF_XML,
161-
namespaces,
162-
"//fits:fileinfo/fits:size|//fits:imageWidth|//fits:imageHeight"
163-
);
164-
details.mimetype = [];
165-
const fitsXPath = xpath.useNamespaces(namespaces);
166-
fitsXPath("//fits:identity/@mimetype", RDF_XML).forEach((relation: Node) => {
167-
details.mimetype.push(relation.nodeValue);
168-
});
169-
return details;
170-
}
171-
172-
protected extractThumbnailDetails(xml: string): Record<string, Array<string>> {
173-
const xmlParser = new DOMParser();
174-
const RDF_XML = xmlParser.parseFromString(xml, "text/xml");
175-
return this.extractRDFXML(
176-
RDF_XML,
177-
{
178-
rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
179-
premis: "http://www.loc.gov/premis/rdf/v1#",
180-
},
181-
"//premis:*"
182-
);
183-
}
184-
18531
async getFedoraData(pid: string, fetchRdf = true): Promise<FedoraData> {
18632
// Use Fedora to get data
18733
const DCPromise = this.fedora.getDC(pid);
@@ -191,25 +37,25 @@ class HierarchyCollector {
19137
// we can skip fetching more RDF in order to save some time!
19238
const RDFPromise = fetchRdf ? this.fedora.getRdf(pid) : null;
19339
const [DC, RELS, RDF] = await Promise.all([DCPromise, RELSPromise, RDFPromise]);
194-
const dataStreams = fetchRdf ? this.extractFedoraDatastreams(RDF) : [];
195-
const relations = this.extractRelations(RELS);
40+
const dataStreams = fetchRdf ? this.extractor.extractFedoraDatastreams(RDF) : [];
41+
const relations = this.extractor.extractRelations(RELS);
19642
// Fetch license details if appropriate/available:
19743
const extraDetails: Record<string, Record<string, Array<string>>> = {};
19844
if (dataStreams.includes("LICENSE")) {
19945
const licenseStream = await this.fedora.getDatastreamAsString(pid, "LICENSE");
200-
extraDetails.license = { url: [this.extractLicense(licenseStream)] };
46+
extraDetails.license = { url: [this.extractor.extractLicense(licenseStream)] };
20147
}
20248
if (dataStreams.includes("AGENTS")) {
20349
const agentsStream = await this.fedora.getDatastreamAsString(pid, "AGENTS");
204-
extraDetails.agents = this.extractAgents(agentsStream);
50+
extraDetails.agents = this.extractor.extractAgents(agentsStream);
20551
}
20652
if (dataStreams.includes("THUMBNAIL")) {
20753
const thumbRdf = await this.fedora.getRdf(pid + "/THUMBNAIL/fcr:metadata");
208-
extraDetails.thumbnails = this.extractThumbnailDetails(thumbRdf);
54+
extraDetails.thumbnails = this.extractor.extractThumbnailDetails(thumbRdf);
20955
}
21056
if (dataStreams.includes("MASTER-MD")) {
21157
const fitsXml = await this.fedora.getDatastreamAsString(pid, "MASTER-MD");
212-
extraDetails.fitsData = this.extractFitsData(fitsXml);
58+
extraDetails.fitsData = this.extractor.extractFitsData(fitsXml);
21359
}
21460
extraDetails.fullText = {};
21561
if (dataStreams.includes("OCR-DIRTY")) {
@@ -223,8 +69,8 @@ class HierarchyCollector {
22369
return new FedoraData(
22470
pid,
22571
relations,
226-
this.extractMetadata(DC),
227-
fetchRdf ? this.extractFedoraDetails(RDF) : {},
72+
this.extractor.extractMetadata(DC),
73+
fetchRdf ? this.extractor.extractFedoraDetails(RDF) : {},
22874
dataStreams,
22975
extraDetails
23076
);

0 commit comments

Comments
 (0)