Skip to content

Commit 9c50712

Browse files
authored
Merge pull request #453 from impresso/develop
Release v3.0.6
2 parents 02ee6fe + 79fbe96 commit 9c50712

File tree

2 files changed

+136
-29
lines changed

2 files changed

+136
-29
lines changed

src/schema/schemas/ImpressoNerEntity.json

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,30 @@
3535
"org.adm",
3636
"org.ent",
3737
"org.ent.pressagency",
38+
"org.ent.pressagency.AFP",
39+
"org.ent.pressagency.ANSA",
40+
"org.ent.pressagency.AP",
41+
"org.ent.pressagency.APA",
42+
"org.ent.pressagency.ATS-SDA",
43+
"org.ent.pressagency.Belga",
44+
"org.ent.pressagency.CTK",
45+
"org.ent.pressagency.DDP-DAPD",
46+
"org.ent.pressagency.DNB",
47+
"org.ent.pressagency.DPA",
48+
"org.ent.pressagency.Domei",
49+
"org.ent.pressagency.Europapress",
50+
"org.ent.pressagency.Extel",
51+
"org.ent.pressagency.Havas",
52+
"org.ent.pressagency.Kipa",
53+
"org.ent.pressagency.Reuters",
54+
"org.ent.pressagency.SPK-SMP",
55+
"org.ent.pressagency.Stefani",
56+
"org.ent.pressagency.TASS",
57+
"org.ent.pressagency.UP-UPI",
58+
"org.ent.pressagency.Wolff",
59+
"org.ent.pressagency.Xinhua",
60+
"org.ent.pressagency.ag",
61+
"org.ent.pressagency.unk",
3862
"pers",
3963
"pers.coll",
4064
"pers.ind",
@@ -44,7 +68,8 @@
4468
"prod.media",
4569
"time",
4670
"time.date.abs",
47-
"time.hour.abs"
71+
"time.hour.abs",
72+
"unk"
4873
]
4974
},
5075
"surfaceForm": {
@@ -82,8 +107,7 @@
82107
"type": "number",
83108
"description": "Confidence score for the named entity linking"
84109
}
85-
},
86-
"required": ["ner"]
110+
}
87111
},
88112
"wikidata": {
89113
"type": "object",
@@ -96,6 +120,10 @@
96120
"wikipediaPageName": {
97121
"type": "string",
98122
"description": "Wikipedia page name of the entity"
123+
},
124+
"wikipediaPageUrl": {
125+
"type": "string",
126+
"description": "Wikipedia page URL of the entity"
99127
}
100128
},
101129
"required": ["id"]
@@ -109,5 +137,5 @@
109137
"description": "Name of the entity"
110138
}
111139
},
112-
"required": ["id", "type", "surfaceForm", "offset", "confidence"]
140+
"required": ["id", "type", "confidence"]
113141
}

src/services/impresso-ner/impresso-ner.class.ts

Lines changed: 104 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { Params } from '@feathersjs/feathers'
2-
import axios, { AxiosResponse } from 'axios'
2+
import { Agent, request } from 'undici'
3+
import { logger } from '../../logger'
34

45
export interface RequestPayload {
56
text: string
@@ -13,6 +14,7 @@ interface DownstreamRequestBody {
1314
// See
1415
// https://github.com/impresso/impresso-annotation/blob/740a31e2c925e4a4d59be97710e390871754674d/frontend/impresso_annotation/templates/landing_page.html#L157
1516
// https://github.com/impresso/newsagency-classification/blob/7031c3992edf0d4354d9a29dea769fe7320f455f/lib/bert_classification/HIPE-scorer/tagset.txt
17+
// https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
1618
type NerType =
1719
| 'comp.demonym'
1820
| 'comp.function'
@@ -36,6 +38,30 @@ type NerType =
3638
| 'org.adm'
3739
| 'org.ent'
3840
| 'org.ent.pressagency'
41+
| 'org.ent.pressagency.AFP'
42+
| 'org.ent.pressagency.ANSA'
43+
| 'org.ent.pressagency.AP'
44+
| 'org.ent.pressagency.APA'
45+
| 'org.ent.pressagency.ATS-SDA'
46+
| 'org.ent.pressagency.Belga'
47+
| 'org.ent.pressagency.CTK'
48+
| 'org.ent.pressagency.DDP-DAPD'
49+
| 'org.ent.pressagency.DNB'
50+
| 'org.ent.pressagency.DPA'
51+
| 'org.ent.pressagency.Domei'
52+
| 'org.ent.pressagency.Europapress'
53+
| 'org.ent.pressagency.Extel'
54+
| 'org.ent.pressagency.Havas'
55+
| 'org.ent.pressagency.Kipa'
56+
| 'org.ent.pressagency.Reuters'
57+
| 'org.ent.pressagency.SPK-SMP'
58+
| 'org.ent.pressagency.Stefani'
59+
| 'org.ent.pressagency.TASS'
60+
| 'org.ent.pressagency.UP-UPI'
61+
| 'org.ent.pressagency.Wolff'
62+
| 'org.ent.pressagency.Xinhua'
63+
| 'org.ent.pressagency.ag'
64+
| 'org.ent.pressagency.unk'
3965
| 'pers'
4066
| 'pers.coll'
4167
| 'pers.ind'
@@ -46,41 +72,62 @@ type NerType =
4672
| 'time'
4773
| 'time.date.abs'
4874
| 'time.hour.abs'
75+
| 'unk'
76+
| 'UNK'
4977

/**
 * One named entity as returned by the downstream NER service
 * (an element of `DownstreamResponse.nes`).
 *
 * See https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
 */
interface DownstreamNes {
  // fields not in the schema
  index?: number // index
  id: string | string[] // a single id or a list of ids
  nested: boolean // is nested

  // fields from the schema

  // required (but nullable):
  lOffset: number | null // left offset
  rOffset: number | null // right offset
  surface: string | null // surface form (text)
  type: NerType

  // optional:
  confidence_nel?: number // named entity linking confidence score
  confidence_ner?: number // named entity recognition confidence score

  wkd_id?: string // Wikidata ID
  wkpedia_pagename?: string // Wikipedia page name
  wkpedia_url?: string // Wikipedia URL

  function?: string // function
  name?: string // entity name

  title?: string
}
66108

/**
 * Body returned by the downstream NER service.
 *
 * Loosely based on https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
 * Some extra fields come from https://github.com/impresso/impresso-annotation/blob/main/backend/model_handler.py
 */
interface DownstreamResponse {
  sys_id: string // model id
  text?: string // input text (optional: may be absent from the response)
  ts: string // ISO timestamp
  nes: DownstreamNes[] // recognised named entities
}
73119

74120
export interface ImpressoNerEntity {
75121
id: string
76122
type: NerType
77-
surfaceForm: string
78-
offset: { start: number; end: number }
123+
surfaceForm?: string
124+
offset?: { start: number; end: number }
79125
isTypeNested: boolean
80-
confidence: { ner: number; nel?: number }
126+
confidence: { ner?: number; nel?: number }
81127
wikidata?: {
82128
id: string
83129
wikipediaPageName?: string
130+
wikipediaPageUrl?: string
84131
}
85132
function?: string
86133
name?: string
@@ -111,37 +158,69 @@ export class ImpressoNerService {
111158

112159
const url = `${this.baseUrl}/${MethodToUrl[method]}/`
113160

114-
const response = await axios.post<DownstreamResponse, AxiosResponse<DownstreamResponse>, DownstreamRequestBody>(
115-
url,
116-
{ data: text }
117-
)
118-
if (response.status !== 200) {
119-
console.error(`Failed to fetch downstream data. Error (${response.status}): `, response.data)
161+
const response = await request(url, {
162+
method: 'POST',
163+
body: JSON.stringify({ data: text }),
164+
headers: { 'Content-Type': 'application/json' },
165+
dispatcher: new Agent({
166+
connectTimeout: 1 * 60 * 1000, // 1 minute
167+
headersTimeout: 5 * 60 * 1000, // 5 minutes
168+
bodyTimeout: 5 * 60 * 1000, // 5 minutes
169+
}),
170+
})
171+
172+
if (response.statusCode !== 200) {
173+
let bodyText = ''
174+
try {
175+
bodyText = await response.body.text()
176+
} catch {
177+
/* ignore */
178+
}
179+
180+
logger.error(`Failed to fetch downstream data. Error (${response.statusCode}): `, bodyText)
120181
throw new Error('Failed to fetch downstream data')
121182
}
122-
return convertDownstreamResponse(response.data)
183+
184+
try {
185+
const responseBody = await response.body.json()
186+
return convertDownstreamResponse(responseBody as DownstreamResponse, data)
187+
} catch (error) {
188+
logger.error('Failed to parse downstream response', error)
189+
throw new Error('Failed to parse downstream response')
190+
}
123191
}
124192
}
125193

/**
 * Maps the downstream service response onto the public `ImpressoNerResponse` shape.
 *
 * @param response - parsed body returned by the downstream service
 * @param request - the payload that was sent; its `text` is used when the
 *   downstream response does not carry the text itself
 * @returns the response with every entity converted via `convertDownstreamEntity`
 */
const convertDownstreamResponse = (response: DownstreamResponse, request: RequestPayload): ImpressoNerResponse => ({
  modelId: response.sys_id,
  // `??` only falls back on null/undefined, so an empty string from downstream is kept.
  text: response.text ?? request.text,
  timestamp: response.ts,
  // Wrap the callback so `map`'s extra (index, array) arguments are never forwarded.
  entities: response.nes.map(entity => convertDownstreamEntity(entity)),
})
132200

/**
 * Converts one downstream entity into the public `ImpressoNerEntity` shape.
 *
 * - `id`: a list of ids is flattened into a single comma-separated string.
 * - `type`: passed through `sanitizeType`.
 * - `surfaceForm`: included only when `surface` is not null/undefined.
 * - `offset`: included only when both `lOffset` and `rOffset` are present.
 * - `wikidata`: included only when `wkd_id` is present and is not the `'NIL'`
 *   placeholder; a `wkpedia_url` of `'N/A'` (or null/undefined) is reported as
 *   `undefined`.
 * - `function` / `name`: included only when present.
 *
 * @param entity - entity as returned by the downstream service
 * @returns the converted entity
 */
const convertDownstreamEntity = (entity: DownstreamNes): ImpressoNerEntity => ({
  id: typeof entity.id === 'string' ? entity.id : entity.id.join(','),
  type: sanitizeType(entity.type),
  ...(entity.surface != null ? { surfaceForm: entity.surface } : {}),
  // `!= null` (not truthiness) so that an offset of 0 is kept.
  ...(entity.lOffset != null && entity.rOffset != null
    ? { offset: { start: entity.lOffset, end: entity.rOffset } }
    : {}),
  isTypeNested: entity.nested,
  confidence: { ner: entity.confidence_ner, nel: entity.confidence_nel },
  ...(entity.wkd_id != null && entity.wkd_id !== 'NIL'
    ? {
        wikidata: {
          id: entity.wkd_id,
          wikipediaPageName: entity.wkpedia_pagename,
          wikipediaPageUrl:
            entity.wkpedia_url == null || entity.wkpedia_url === 'N/A' ? undefined : entity.wkpedia_url,
        },
      }
    : {}),
  ...(entity.function != null ? { function: entity.function } : {}),
  ...(entity.name != null ? { name: entity.name } : {}),
})
222+
/**
 * Normalises an entity type coming from the downstream service.
 *
 * The upper-case `'UNK'` tag is mapped to its lower-case form `'unk'`;
 * every other type is returned unchanged.
 *
 * @param type - entity type as received
 * @returns the normalised entity type
 */
const sanitizeType = (type: NerType): NerType => {
  if (type === 'UNK') return 'unk'
  return type
}

0 commit comments

Comments
 (0)