Skip to content

Commit f99dde5

Browse files
munkhuushmgl, nnegrey, chingor13
authored
samples: add v1beta2 samples (#42)
* init commit & missing tests * updated samples with new default api, added dependencies to other pom files * init commit & missing tests * updated samples with new default api, added dependencies to other pom files * added other tests * finished formatting and tests * init commit & missing tests * updated samples with new default api, added dependencies to other pom files * init commit & missing tests * added other tests * finished formatting and tests * init commit & missing tests * updated samples with new default api, added dependencies to other pom files * removed TODOs * corrected region tags, made requested changes * fixed bom issues * removed unused imports & fixed pom * applied latest changes from API * added correct location format * changed env var * removed prefix bucket prefix * Update samples/snippets/pom.xml Co-Authored-By: Noah Negrey <[email protected]> * Update samples/snippets/src/main/java/com/examples/documentai/ParseWithModel.java Co-Authored-By: Noah Negrey <[email protected]> * Update samples/snippets/pom.xml Co-Authored-By: Noah Negrey <[email protected]> * Update samples/snippets/pom.xml Co-Authored-By: Noah Negrey <[email protected]> * Update samples/snippets/src/main/java/com/examples/documentai/BatchParseForm.java Co-Authored-By: Noah Negrey <[email protected]> * Update samples/snippets/src/main/java/com/examples/documentai/ParseWithModel.java Co-Authored-By: Noah Negrey <[email protected]> * update autoML id * updated custom model * reformatted automl param * added env vars * refactored tests and updated samples to latest changes * removed AutoML project test for now * formatted code * testing project to have correct id * changed project ID for testing * added missing cfg * added another env var * added some comments and fixed region tags * renamed package path, add _beta tags, added some comment on autoMl model * removed version of bom pom.xml * changed env var names * corrected project id in kokoro config * hard-coded java-docs to 
test * added . * correct autoML test to two project ID * trigger kokoro test * last kokoro test trigger * moved storage dependency and fixed nit Co-authored-by: Noah Negrey <[email protected]> Co-authored-by: Jeff Ching <[email protected]>
1 parent d01ae96 commit f99dde5

15 files changed

+1442
-1
lines changed

document-ai/snippets/pom.xml

+4-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@
4343
<artifactId>google-cloud-document-ai</artifactId>
4444
</dependency>
4545
<!-- [END documentai_install_with_bom] -->
46-
46+
<dependency>
47+
<groupId>com.google.cloud</groupId>
48+
<artifactId>google-cloud-storage</artifactId>
49+
</dependency>
4750
<dependency>
4851
<groupId>junit</groupId>
4952
<artifactId>junit</artifactId>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/*
2+
* Copyright 2020 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package documentai.v1beta2;
18+
19+
// [START documentai_batch_parse_form_beta]
20+
21+
import com.google.api.gax.longrunning.OperationFuture;
22+
import com.google.api.gax.paging.Page;
23+
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsRequest;
24+
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsResponse;
25+
import com.google.cloud.documentai.v1beta2.Document;
26+
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceClient;
27+
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceSettings;
28+
import com.google.cloud.documentai.v1beta2.FormExtractionParams;
29+
import com.google.cloud.documentai.v1beta2.GcsDestination;
30+
import com.google.cloud.documentai.v1beta2.GcsSource;
31+
import com.google.cloud.documentai.v1beta2.InputConfig;
32+
import com.google.cloud.documentai.v1beta2.KeyValuePairHint;
33+
import com.google.cloud.documentai.v1beta2.OperationMetadata;
34+
import com.google.cloud.documentai.v1beta2.OutputConfig;
35+
import com.google.cloud.documentai.v1beta2.ProcessDocumentRequest;
36+
import com.google.cloud.storage.Blob;
37+
import com.google.cloud.storage.BlobId;
38+
import com.google.cloud.storage.Bucket;
39+
import com.google.cloud.storage.Storage;
40+
import com.google.cloud.storage.StorageOptions;
41+
import com.google.protobuf.util.JsonFormat;
42+
import java.io.File;
43+
import java.io.FileReader;
44+
import java.io.IOException;
45+
import java.util.concurrent.ExecutionException;
46+
import java.util.concurrent.TimeUnit;
47+
import java.util.concurrent.TimeoutException;
48+
49+
public class BatchParseFormBeta {
50+
51+
public static void batchParseFormGcs()
52+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
53+
// TODO(developer): Replace these variables before running the sample.
54+
String projectId = "your-project-id";
55+
String location = "your-project-location"; // Format is "us" or "eu".
56+
String outputGcsBucketName = "your-gcs-bucket-name";
57+
String outputGcsPrefix = "PREFIX";
58+
String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.json";
59+
batchParseFormGcs(projectId, location, outputGcsBucketName, outputGcsPrefix, inputGcsUri);
60+
}
61+
62+
public static void batchParseFormGcs(
63+
String projectId,
64+
String location,
65+
String outputGcsBucketName,
66+
String outputGcsPrefix,
67+
String inputGcsUri)
68+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
69+
// Initialize client that will be used to send requests. This client only needs to be created
70+
// once, and can be reused for multiple requests. After completing all of your requests, call
71+
// the "close" method on the client to safely clean up any remaining background resources.
72+
try (DocumentUnderstandingServiceClient client =
73+
DocumentUnderstandingServiceClient.create()) {
74+
75+
// Configure the request for processing the PDF
76+
String parent = String.format("projects/%s/locations/%s", projectId, location);
77+
78+
// Improve form parsing results by providing key-value pair hints.
79+
// For each key hint, key is text that is likely to appear in the
80+
// document as a form field name (i.e. "DOB").
81+
// Value types are optional, but can be one or more of:
82+
// ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
83+
// NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
84+
KeyValuePairHint keyValuePairHint =
85+
KeyValuePairHint.newBuilder().setKey("Phone").addValueTypes("PHONE_NUMBER").build();
86+
87+
KeyValuePairHint keyValuePairHint2 =
88+
KeyValuePairHint.newBuilder()
89+
.setKey("Contact")
90+
.addValueTypes("EMAIL")
91+
.addValueTypes("NAME")
92+
.build();
93+
94+
// Setting enabled=True enables form extraction
95+
FormExtractionParams params =
96+
FormExtractionParams.newBuilder()
97+
.setEnabled(true)
98+
.addKeyValuePairHints(keyValuePairHint)
99+
.addKeyValuePairHints(keyValuePairHint2)
100+
.build();
101+
102+
GcsSource inputUri = GcsSource.newBuilder().setUri(inputGcsUri).build();
103+
104+
// mime_type can be application/pdf, image/tiff,
105+
// and image/gif, or application/json
106+
InputConfig config =
107+
InputConfig.newBuilder().setGcsSource(inputUri)
108+
.setMimeType("application/pdf").build();
109+
110+
GcsDestination gcsDestination = GcsDestination.newBuilder()
111+
.setUri(String.format("gs://%s/%s", outputGcsBucketName, outputGcsPrefix)).build();
112+
113+
OutputConfig outputConfig = OutputConfig.newBuilder()
114+
.setGcsDestination(gcsDestination)
115+
.setPagesPerShard(1)
116+
.build();
117+
118+
ProcessDocumentRequest request =
119+
ProcessDocumentRequest.newBuilder()
120+
.setFormExtractionParams(params)
121+
.setInputConfig(config)
122+
.setOutputConfig(outputConfig)
123+
.build();
124+
125+
BatchProcessDocumentsRequest requests =
126+
BatchProcessDocumentsRequest.newBuilder().addRequests(request).setParent(parent).build();
127+
128+
// Batch process document using a long-running operation.
129+
OperationFuture<BatchProcessDocumentsResponse, OperationMetadata> future =
130+
client.batchProcessDocumentsAsync(requests);
131+
132+
// Wait for operation to complete.
133+
System.out.println("Waiting for operation to complete...");
134+
future.get(300, TimeUnit.SECONDS);
135+
136+
System.out.println("Document processing complete.");
137+
138+
Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
139+
Bucket bucket = storage.get(outputGcsBucketName);
140+
141+
// List all of the files in the Storage bucket.
142+
Page<Blob> blobs =
143+
bucket.list(
144+
Storage.BlobListOption.currentDirectory(),
145+
Storage.BlobListOption.prefix(outputGcsPrefix));
146+
147+
int idx = 0;
148+
for (Blob blob : blobs.iterateAll()) {
149+
if (!blob.isDirectory()) {
150+
System.out.printf("Fetched file #%d\n", ++idx);
151+
// Read the results
152+
153+
// Download and store json data in a temp file.
154+
File tempFile = File.createTempFile("file", ".json");
155+
Blob fileInfo = storage.get(BlobId.of(outputGcsBucketName, blob.getName()));
156+
fileInfo.downloadTo(tempFile.toPath());
157+
158+
// Parse json file into Document.
159+
FileReader reader = new FileReader(tempFile);
160+
Document.Builder builder = Document.newBuilder();
161+
JsonFormat.parser().merge(reader, builder);
162+
163+
Document document = builder.build();
164+
165+
// Get all of the document text as one big string.
166+
String text = document.getText();
167+
168+
// Process the output.
169+
Document.Page page1 = document.getPages(0);
170+
for (Document.Page.FormField field : page1.getFormFieldsList()) {
171+
String fieldName = getText(field.getFieldName(), text);
172+
String fieldValue = getText(field.getFieldValue(), text);
173+
174+
System.out.println("Extracted form fields pair:");
175+
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
176+
}
177+
178+
// Clean up temp file.
179+
tempFile.deleteOnExit();
180+
}
181+
}
182+
}
183+
}
184+
185+
private static String getText(Document.Page.Layout layout, String text) {
186+
Document.TextAnchor textAnchor = layout.getTextAnchor();
187+
if (textAnchor.getTextSegmentsList().size() > 0) {
188+
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
189+
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
190+
return text.substring(startIdx, endIdx);
191+
}
192+
return "[NO TEXT]";
193+
}
194+
}
195+
// [END documentai_batch_parse_form_beta]

0 commit comments

Comments
 (0)