Skip to content

Commit f72dff6

Browse files
telpirionAce Nassri
authored and
Ace Nassri
committed
feat: adds beta samples
* feat: adds remaining samples (parse form synchronous, parse table synchronous, parse form with NL model, set endpoint) * fix: adds AutoML NL model to tests * fix: removes forEach() constructions
1 parent 2c0f5a3 commit f72dff6

12 files changed

+819
-7
lines changed

document-ai/batch_parse_form.js

+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/**
2+
* Copyright 2020 Google LLC
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
'use strict';
17+
18+
const uuid = require('uuid');
19+
20+
async function main(
21+
projectId = 'YOUR_PROJECT_ID',
22+
location = 'YOUR_PROJECT_LOCATION',
23+
gcsOutputUri = 'output-bucket',
24+
gcsOutputUriPrefix = uuid.v4(),
25+
gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
26+
) {
27+
// [START document_parse_form]
28+
/**
29+
* TODO(developer): Uncomment these variables before running the sample.
30+
*/
31+
// const projectId = 'YOUR_PROJECT_ID';
32+
// const location = 'YOUR_PROJECT_LOCATION',
33+
// const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
34+
// const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
35+
// const gcsInputUri = 'GCS URI of the PDF to process';
36+
37+
// Imports the Google Cloud client library
38+
const {
39+
DocumentUnderstandingServiceClient,
40+
} = require('@google-cloud/documentai');
41+
const {Storage} = require('@google-cloud/storage');
42+
43+
const client = new DocumentUnderstandingServiceClient();
44+
const storage = new Storage();
45+
46+
async function parseFormGCS(inputUri, outputUri, outputUriPrefix) {
47+
const parent = `projects/${projectId}/locations/${location}`;
48+
49+
// Configure the batch process request.
50+
const request = {
51+
inputConfig: {
52+
gcsSource: {
53+
uri: inputUri,
54+
},
55+
mimeType: 'application/pdf',
56+
},
57+
outputConfig: {
58+
gcsDestination: {
59+
uri: `${outputUri}/${outputUriPrefix}/`,
60+
},
61+
pagesPerShard: 1,
62+
},
63+
formExtractionParams: {
64+
enabled: true,
65+
keyValuePairHints: [
66+
{
67+
key: 'Phone',
68+
valueTypes: ['PHONE_NUMBER'],
69+
},
70+
{
71+
key: 'Contact',
72+
valueTypes: ['EMAIL', 'NAME'],
73+
},
74+
],
75+
},
76+
};
77+
78+
// Configure the request for batch process
79+
const requests = {
80+
parent,
81+
requests: [request],
82+
};
83+
84+
// Batch process document using a long-running operation.
85+
// You can wait for now, or get results later.
86+
const [operation] = await client.batchProcessDocuments(requests);
87+
88+
// Wait for operation to complete.
89+
await operation.promise();
90+
91+
console.log('Document processing complete.');
92+
93+
// Query Storage bucket for the results file(s).
94+
const query = {
95+
prefix: outputUriPrefix,
96+
};
97+
98+
console.log('Fetching results ...');
99+
100+
// List all of the files in the Storage bucket
101+
const [files] = await storage.bucket(gcsOutputUri).getFiles(query);
102+
103+
files.forEach(async (fileInfo, index) => {
104+
// Get the file as a buffer
105+
const [file] = await fileInfo.download();
106+
107+
console.log(`Fetched file #${index + 1}:`);
108+
109+
// Read the results
110+
const results = JSON.parse(file.toString());
111+
112+
// Get all of the document text as one big string.
113+
const {text} = results;
114+
115+
// Utility to extract text anchors from text field.
116+
const getText = textAnchor => {
117+
const startIndex = textAnchor.textSegments[0].startIndex || 0;
118+
const endIndex = textAnchor.textSegments[0].endIndex;
119+
120+
return `\t${text.substring(startIndex, endIndex)}`;
121+
};
122+
123+
// Process the output
124+
const [page1] = results.pages;
125+
const formFields = page1.formFields;
126+
127+
for (const field of formFields) {
128+
const fieldName = getText(field.fieldName.textAnchor);
129+
const fieldValue = getText(field.fieldValue.textAnchor);
130+
131+
console.log('Extracted key value pair:');
132+
console.log(`\t(${fieldName}, ${fieldValue})`);
133+
}
134+
});
135+
}
136+
// [END document_parse_form]
137+
138+
parseFormGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
139+
}
140+
// Run the sample with the command-line arguments. A rejection is reported and
// turned into a non-zero exit code instead of an unhandled promise rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});

document-ai/batch_parse_table.js

+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/**
2+
* Copyright 2020 Google LLC
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
'use strict';
17+
18+
const uuid = require('uuid');
19+
20+
async function main(
21+
projectId = 'YOUR_PROJECT_ID',
22+
location = 'YOUR_PROJECT_LOCATION',
23+
gcsOutputUri = 'output-bucket',
24+
gcsOutputUriPrefix = uuid.v4(),
25+
gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
26+
) {
27+
// [START document_parse_table]
28+
/**
29+
* TODO(developer): Uncomment these variables before running the sample.
30+
*/
31+
// const projectId = 'YOUR_PROJECT_ID';
32+
// const location = 'YOUR_PROJECT_LOCATION';
33+
// const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
34+
// const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
35+
// const gcsInputUri = 'YOUR_SOURCE_PDF';
36+
37+
// Imports the Google Cloud client library
38+
const {
39+
DocumentUnderstandingServiceClient,
40+
} = require('@google-cloud/documentai');
41+
const {Storage} = require('@google-cloud/storage');
42+
43+
const client = new DocumentUnderstandingServiceClient();
44+
const storage = new Storage();
45+
46+
async function parseTableGCS(inputUri, outputUri, outputUriPrefix) {
47+
const parent = `projects/${projectId}/locations/${location}`;
48+
49+
// Configure the batch process request.
50+
const request = {
51+
//parent,
52+
inputConfig: {
53+
gcsSource: {
54+
uri: inputUri,
55+
},
56+
mimeType: 'application/pdf',
57+
},
58+
outputConfig: {
59+
gcsDestination: {
60+
uri: `${outputUri}/${outputUriPrefix}/`,
61+
},
62+
pagesPerShard: 1,
63+
},
64+
tableExtractionParams: {
65+
enabled: true,
66+
tableBoundHints: [
67+
{
68+
boundingBox: {
69+
normalizedVertices: [
70+
{x: 0, y: 0},
71+
{x: 1, y: 0},
72+
{x: 1, y: 1},
73+
{x: 0, y: 1},
74+
],
75+
},
76+
},
77+
],
78+
},
79+
};
80+
81+
// Configure the request for batch process
82+
const requests = {
83+
parent,
84+
requests: [request],
85+
};
86+
87+
// Batch process document using a long-running operation.
88+
// You can wait for now, or get results later.
89+
// Note: first request to the service takes longer than subsequent
90+
// requests.
91+
const [operation] = await client.batchProcessDocuments(requests);
92+
93+
// Wait for operation to complete.
94+
await operation.promise();
95+
96+
console.log('Document processing complete.');
97+
98+
// Query Storage bucket for the results file(s).
99+
const query = {
100+
prefix: outputUriPrefix,
101+
};
102+
103+
console.log('Fetching results ...');
104+
105+
// List all of the files in the Storage bucket
106+
const [files] = await storage.bucket(gcsOutputUri).getFiles(query);
107+
108+
files.forEach(async (fileInfo, index) => {
109+
// Get the file as a buffer
110+
const [file] = await fileInfo.download();
111+
112+
console.log(`Fetched file #${index + 1}:`);
113+
114+
// Read the results
115+
const results = JSON.parse(file.toString());
116+
117+
// Get all of the document text as one big string
118+
const text = results.text;
119+
120+
// Get the first table in the document
121+
const [page1] = results.pages;
122+
const [table] = page1.tables;
123+
const [headerRow] = table.headerRows;
124+
125+
console.log('Results from first table processed:');
126+
console.log(
127+
`First detected language: ${page1.detectedLanguages[0].languageCode}`
128+
);
129+
130+
console.log('Header row:');
131+
for (const tableCell of headerRow.cells) {
132+
if (tableCell.layout.textAnchor.textSegments) {
133+
// Extract shards from the text field
134+
// First shard in document doesn't have startIndex property
135+
const startIndex =
136+
tableCell.layout.textAnchor.textSegments[0].startIndex || 0;
137+
const endIndex = tableCell.layout.textAnchor.textSegments[0].endIndex;
138+
139+
console.log(`\t${text.substring(startIndex, endIndex)}`);
140+
}
141+
}
142+
});
143+
}
144+
// [END document_parse_table]
145+
146+
parseTableGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
147+
}
148+
// Run the sample with the command-line arguments. A rejection is reported and
// turned into a non-zero exit code instead of an unhandled promise rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});

document-ai/parse_form.js

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/**
2+
* Copyright 2020, Google, Inc.
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
'use strict';
17+
18+
/**
19+
* Process a single PDF as a form.
20+
* @param {string} projectId your Google Cloud project ID
21+
* @param {string} location region to use for this operation
22+
* @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
23+
*/
24+
async function main(
25+
projectId,
26+
location,
27+
gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
28+
) {
29+
// [START document_quickstart]
30+
/**
31+
* TODO(developer): Uncomment these variables before running the sample.
32+
*/
33+
// const projectId = 'YOUR_PROJECT_ID';
34+
// const location = 'YOUR_PROJECT_LOCATION';
35+
// const gcsInputUri = 'YOUR_SOURCE_PDF';
36+
37+
const {
38+
DocumentUnderstandingServiceClient,
39+
} = require('@google-cloud/documentai');
40+
const client = new DocumentUnderstandingServiceClient();
41+
42+
async function parseForm() {
43+
// Configure the request for processing the PDF
44+
const parent = `projects/${projectId}/locations/${location}`;
45+
const request = {
46+
parent,
47+
inputConfig: {
48+
gcsSource: {
49+
uri: gcsInputUri,
50+
},
51+
mimeType: 'application/pdf',
52+
},
53+
formExtractionParams: {
54+
enabled: true,
55+
keyValuePairHints: [
56+
{
57+
key: 'Phone',
58+
valueTypes: ['PHONE_NUMBER'],
59+
},
60+
{
61+
key: 'Contact',
62+
valueTypes: ['EMAIL', 'NAME'],
63+
},
64+
],
65+
},
66+
};
67+
68+
// Recognizes text entities in the PDF document
69+
const [result] = await client.processDocument(request);
70+
71+
// Get all of the document text as one big string
72+
const {text} = result;
73+
74+
// Extract shards from the text field
75+
const getText = textAnchor => {
76+
// First shard in document doesn't have startIndex property
77+
const startIndex = textAnchor.textSegments[0].startIndex || 0;
78+
const endIndex = textAnchor.textSegments[0].endIndex;
79+
80+
return text.substring(startIndex, endIndex);
81+
};
82+
83+
// Process the output
84+
const [page1] = result.pages;
85+
const {formFields} = page1;
86+
87+
for (const field of formFields) {
88+
const fieldName = getText(field.fieldName.textAnchor);
89+
const fieldValue = getText(field.fieldValue.textAnchor);
90+
91+
console.log('Extracted key value pair:');
92+
console.log(`\t(${fieldName}, ${fieldValue})`);
93+
}
94+
}
95+
// [END document_quickstart]
96+
await parseForm();
97+
}
98+
99+
// Entry point: forward the command-line arguments to the sample. If the
// returned promise rejects, print the error and mark the process as failed.
const cliArgs = process.argv.slice(2);

main(...cliArgs).catch(error => {
  console.error(error);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)