1
1
import type { Params } from '@feathersjs/feathers'
2
- import axios , { AxiosResponse } from 'axios'
2
+ import { Agent , request } from 'undici'
3
+ import { logger } from '../../logger'
3
4
4
5
export interface RequestPayload {
5
6
text : string
@@ -13,6 +14,7 @@ interface DownstreamRequestBody {
13
14
// See
14
15
// https://github.com/impresso/impresso-annotation/blob/740a31e2c925e4a4d59be97710e390871754674d/frontend/impresso_annotation/templates/landing_page.html#L157
15
16
// https://github.com/impresso/newsagency-classification/blob/7031c3992edf0d4354d9a29dea769fe7320f455f/lib/bert_classification/HIPE-scorer/tagset.txt
17
+ // https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
16
18
type NerType =
17
19
| 'comp.demonym'
18
20
| 'comp.function'
@@ -36,6 +38,30 @@ type NerType =
36
38
| 'org.adm'
37
39
| 'org.ent'
38
40
| 'org.ent.pressagency'
41
+ | 'org.ent.pressagency.AFP'
42
+ | 'org.ent.pressagency.ANSA'
43
+ | 'org.ent.pressagency.AP'
44
+ | 'org.ent.pressagency.APA'
45
+ | 'org.ent.pressagency.ATS-SDA'
46
+ | 'org.ent.pressagency.Belga'
47
+ | 'org.ent.pressagency.CTK'
48
+ | 'org.ent.pressagency.DDP-DAPD'
49
+ | 'org.ent.pressagency.DNB'
50
+ | 'org.ent.pressagency.DPA'
51
+ | 'org.ent.pressagency.Domei'
52
+ | 'org.ent.pressagency.Europapress'
53
+ | 'org.ent.pressagency.Extel'
54
+ | 'org.ent.pressagency.Havas'
55
+ | 'org.ent.pressagency.Kipa'
56
+ | 'org.ent.pressagency.Reuters'
57
+ | 'org.ent.pressagency.SPK-SMP'
58
+ | 'org.ent.pressagency.Stefani'
59
+ | 'org.ent.pressagency.TASS'
60
+ | 'org.ent.pressagency.UP-UPI'
61
+ | 'org.ent.pressagency.Wolff'
62
+ | 'org.ent.pressagency.Xinhua'
63
+ | 'org.ent.pressagency.ag'
64
+ | 'org.ent.pressagency.unk'
39
65
| 'pers'
40
66
| 'pers.coll'
41
67
| 'pers.ind'
@@ -46,41 +72,62 @@ type NerType =
46
72
| 'time'
47
73
| 'time.date.abs'
48
74
| 'time.hour.abs'
75
+ | 'unk'
76
+ | 'UNK'
49
77
78
/**
 * Shape of a single item of `DownstreamResponse.nes`.
 *
 * See https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
 */
interface DownstreamNes {
  // ---- fields not in the schema ----

  /** index */
  index?: number
  /** Either a single id or a list of ids. */
  id: string | string[]
  /** is nested */
  nested: boolean

  // ---- fields from the schema: required (values may still be null) ----

  /** left offset */
  lOffset: number | null
  /** right offset */
  rOffset: number | null
  /** surface form (text) */
  surface: string | null
  type: NerType

  // ---- fields from the schema: optional ----

  /** named entity linking confidence score */
  confidence_nel?: number
  /** named entity recognition confidence score */
  confidence_ner?: number

  /** Wikidata ID */
  wkd_id?: string
  /** Wikipedia page name */
  wkpedia_pagename?: string
  /** Wikipedia URL */
  wkpedia_url?: string

  /** function */
  function?: string
  /** entity name */
  name?: string

  title?: string
}
66
108
109
/**
 * Body returned by the downstream service.
 *
 * Loosely based on https://github.com/impresso/impresso-schemas/blob/31-revise-entity-json-schema/json/entities/entities.schema.json
 * Some extra fields come from https://github.com/impresso/impresso-annotation/blob/main/backend/model_handler.py
 */
interface DownstreamResponse {
  /** model id */
  sys_id: string
  /** input text (optional in the response) */
  text?: string
  /** ISO timestamp */
  ts: string
  /** entities found in the text */
  nes: DownstreamNes[]
}
73
119
74
120
export interface ImpressoNerEntity {
75
121
id : string
76
122
type : NerType
77
- surfaceForm : string
78
- offset : { start : number ; end : number }
123
+ surfaceForm ? : string
124
+ offset ? : { start : number ; end : number }
79
125
isTypeNested : boolean
80
- confidence : { ner : number ; nel ?: number }
126
+ confidence : { ner ? : number ; nel ?: number }
81
127
wikidata ?: {
82
128
id : string
83
129
wikipediaPageName ?: string
130
+ wikipediaPageUrl ?: string
84
131
}
85
132
function ?: string
86
133
name ?: string
@@ -111,37 +158,69 @@ export class ImpressoNerService {
111
158
112
159
const url = `${ this . baseUrl } /${ MethodToUrl [ method ] } /`
113
160
114
- const response = await axios . post < DownstreamResponse , AxiosResponse < DownstreamResponse > , DownstreamRequestBody > (
115
- url ,
116
- { data : text }
117
- )
118
- if ( response . status !== 200 ) {
119
- console . error ( `Failed to fetch downstream data. Error (${ response . status } ): ` , response . data )
161
+ const response = await request ( url , {
162
+ method : 'POST' ,
163
+ body : JSON . stringify ( { data : text } ) ,
164
+ headers : { 'Content-Type' : 'application/json' } ,
165
+ dispatcher : new Agent ( {
166
+ connectTimeout : 1 * 60 * 1000 , // 1 minute
167
+ headersTimeout : 5 * 60 * 1000 , // 5 minutes
168
+ bodyTimeout : 5 * 60 * 1000 , // 5 minutes
169
+ } ) ,
170
+ } )
171
+
172
+ if ( response . statusCode !== 200 ) {
173
+ let bodyText = ''
174
+ try {
175
+ bodyText = await response . body . text ( )
176
+ } catch {
177
+ /* ignore */
178
+ }
179
+
180
+ logger . error ( `Failed to fetch downstream data. Error (${ response . statusCode } ): ` , bodyText )
120
181
throw new Error ( 'Failed to fetch downstream data' )
121
182
}
122
- return convertDownstreamResponse ( response . data )
183
+
184
+ try {
185
+ const responseBody = await response . body . json ( )
186
+ return convertDownstreamResponse ( responseBody as DownstreamResponse , data )
187
+ } catch ( error ) {
188
+ logger . error ( 'Failed to parse downstream response' , error )
189
+ throw new Error ( 'Failed to parse downstream response' )
190
+ }
123
191
}
124
192
}
125
193
126
/**
 * Maps the raw downstream payload onto the public `ImpressoNerResponse` shape.
 *
 * @param response - body returned by the downstream service.
 * @param payload - the original request payload; its `text` is the fallback
 *   when the downstream response carries no `text`. Named `payload` rather
 *   than `request` so it does not shadow the `request` function imported
 *   from undici.
 * @returns the converted response; every item of `response.nes` is passed
 *   through `convertDownstreamEntity`.
 */
const convertDownstreamResponse = (response: DownstreamResponse, payload: RequestPayload): ImpressoNerResponse => ({
  modelId: response.sys_id,
  // `??` falls back only on null/undefined, so an empty string returned by
  // the downstream service is preserved as-is.
  text: response.text ?? payload.text,
  timestamp: response.ts,
  entities: response.nes.map(convertDownstreamEntity),
})
132
200
133
201
/**
 * Normalises an entity type coming from the downstream service: the
 * upper-case `'UNK'` spelling is folded into `'unk'`; any other value is
 * returned unchanged.
 */
const sanitizeType = (type: NerType): NerType => (type === 'UNK' ? 'unk' : type)

/**
 * Converts a single downstream entity into the public `ImpressoNerEntity` shape.
 *
 * - `id`: a list of ids is flattened into one comma-separated string.
 * - `type`: passed through `sanitizeType`.
 * - `surfaceForm`: omitted when `surface` is null/undefined.
 * - `offset`: emitted only when both `lOffset` and `rOffset` are present.
 * - `wikidata`: emitted only when `wkd_id` is present and is not the literal
 *   `'NIL'`; inside it, a missing or `'N/A'` `wkpedia_url` becomes `undefined`.
 * - `function` / `name`: copied only when non-null.
 *
 * @param entity - one item of the downstream `nes` array.
 * @returns the converted entity.
 */
const convertDownstreamEntity = (entity: DownstreamNes): ImpressoNerEntity => {
  const { id, surface, lOffset, rOffset, wkd_id: wikidataId, wkpedia_url: wikipediaUrl } = entity

  return {
    id: typeof id === 'string' ? id : id.join(','),
    type: sanitizeType(entity.type),
    ...(surface != null ? { surfaceForm: surface } : {}),
    ...(lOffset != null && rOffset != null ? { offset: { start: lOffset, end: rOffset } } : {}),
    isTypeNested: entity.nested,
    confidence: { ner: entity.confidence_ner, nel: entity.confidence_nel },
    // `!= null` deliberately matches both null and undefined; the comparison
    // against the 'NIL' literal is strict.
    ...(wikidataId != null && wikidataId !== 'NIL'
      ? {
          wikidata: {
            id: wikidataId,
            wikipediaPageName: entity.wkpedia_pagename,
            wikipediaPageUrl: wikipediaUrl == null || wikipediaUrl === 'N/A' ? undefined : wikipediaUrl,
          },
        }
      : {}),
    ...(entity.function != null ? { function: entity.function } : {}),
    ...(entity.name != null ? { name: entity.name } : {}),
  }
}
0 commit comments