Skip to content

Commit 3674a8d

Browse files
munkhuushmglShabirmean
authored and committed
samples: new Doc AI samples for v1beta3 (#206)
* samples: new Doc AI samples for v1beta3 * disabled batch parsetable test * updated pomxs
1 parent a9d7913 commit 3674a8d

13 files changed

+655
-4
lines changed

document-ai/snippets/pom.xml

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
<dependency>
4242
<groupId>com.google.cloud</groupId>
4343
<artifactId>google-cloud-document-ai</artifactId>
44+
<version>0.3.0</version>
4445
</dependency>
4546
<!-- [END documentai_install_with_bom] -->
4647
<dependency>
57.6 KB
Binary file not shown.

document-ai/snippets/src/main/java/documentai/v1beta2/BatchParseFormBeta.java

-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsResponse;
2525
import com.google.cloud.documentai.v1beta2.Document;
2626
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceClient;
27-
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceSettings;
2827
import com.google.cloud.documentai.v1beta2.FormExtractionParams;
2928
import com.google.cloud.documentai.v1beta2.GcsDestination;
3029
import com.google.cloud.documentai.v1beta2.GcsSource;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/*
2+
* Copyright 2020 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package documentai.v1beta3;
18+
19+
// [START documentai_batch_process_document]
20+
21+
import com.google.api.gax.longrunning.OperationFuture;
22+
import com.google.api.gax.paging.Page;
23+
import com.google.api.gax.rpc.UnknownException;
24+
import com.google.cloud.documentai.v1beta3.BatchProcessMetadata;
25+
import com.google.cloud.documentai.v1beta3.BatchProcessRequest;
26+
import com.google.cloud.documentai.v1beta3.BatchProcessResponse;
27+
import com.google.cloud.documentai.v1beta3.Document;
28+
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
29+
import com.google.cloud.storage.Blob;
30+
import com.google.cloud.storage.BlobId;
31+
import com.google.cloud.storage.Bucket;
32+
import com.google.cloud.storage.Storage;
33+
import com.google.cloud.storage.StorageOptions;
34+
import com.google.protobuf.util.JsonFormat;
35+
import java.io.File;
36+
import java.io.FileReader;
37+
import java.io.IOException;
38+
import java.util.List;
39+
import java.util.concurrent.ExecutionException;
40+
import java.util.concurrent.TimeUnit;
41+
import java.util.concurrent.TimeoutException;
42+
43+
public class BatchProcessDocumentBeta {
44+
public static void batchProcessDocument()
45+
throws IOException, InterruptedException, TimeoutException, ExecutionException {
46+
// TODO(developer): Replace these variables before running the sample.
47+
String projectId = "your-project-id";
48+
String location = "your-project-location"; // Format is "us" or "eu".
49+
String processerId = "your-processor-id";
50+
String outputGcsBucketName = "your-gcs-bucket-name";
51+
String outputGcsPrefix = "PREFIX";
52+
String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.pdf";
53+
batchProcessDocument(
54+
projectId, location, processerId, inputGcsUri, outputGcsBucketName, outputGcsPrefix);
55+
}
56+
57+
public static void batchProcessDocument(
58+
String projectId,
59+
String location,
60+
String processorId,
61+
String gcsInputUri,
62+
String gcsOutputBucketName,
63+
String gcsOutputUriPrefix)
64+
throws IOException, InterruptedException, TimeoutException, ExecutionException {
65+
// Initialize client that will be used to send requests. This client only needs to be created
66+
// once, and can be reused for multiple requests. After completing all of your requests, call
67+
// the "close" method on the client to safely clean up any remaining background resources.
68+
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
69+
// The full resource name of the processor, e.g.:
70+
// projects/project-id/locations/location/processor/processor-id
71+
// You must create new processors in the Cloud Console first
72+
String name =
73+
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
74+
75+
BatchProcessRequest.BatchInputConfig batchInputConfig =
76+
BatchProcessRequest.BatchInputConfig.newBuilder()
77+
.setGcsSource(gcsInputUri)
78+
.setMimeType("application/pdf")
79+
.build();
80+
81+
String fullGcsPath = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix);
82+
BatchProcessRequest.BatchOutputConfig outputConfig =
83+
BatchProcessRequest.BatchOutputConfig.newBuilder().setGcsDestination(fullGcsPath).build();
84+
85+
// Configure the batch process request.
86+
BatchProcessRequest request =
87+
BatchProcessRequest.newBuilder()
88+
.setName(name)
89+
.addInputConfigs(batchInputConfig)
90+
.setOutputConfig(outputConfig)
91+
.build();
92+
93+
OperationFuture<BatchProcessResponse, BatchProcessMetadata> future =
94+
client.batchProcessDocumentsAsync(request);
95+
96+
// Batch process document using a long-running operation.
97+
// You can wait for now, or get results later.
98+
// Note: first request to the service takes longer than subsequent
99+
// requests.
100+
System.out.println("Waiting for operation to complete...");
101+
future.get(120, TimeUnit.SECONDS);
102+
103+
System.out.println("Document processing complete.");
104+
105+
Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
106+
Bucket bucket = storage.get(gcsOutputBucketName);
107+
108+
// List all of the files in the Storage bucket.
109+
Page<Blob> blobs = bucket.list(Storage.BlobListOption.prefix(gcsOutputUriPrefix + "/"));
110+
int idx = 0;
111+
for (Blob blob : blobs.iterateAll()) {
112+
if (!blob.isDirectory()) {
113+
System.out.printf("Fetched file #%d\n", ++idx);
114+
// Read the results
115+
116+
// Download and store json data in a temp file.
117+
File tempFile = File.createTempFile("file", ".json");
118+
Blob fileInfo = storage.get(BlobId.of(gcsOutputBucketName, blob.getName()));
119+
fileInfo.downloadTo(tempFile.toPath());
120+
121+
// Parse json file into Document.
122+
FileReader reader = new FileReader(tempFile);
123+
Document.Builder builder = Document.newBuilder();
124+
JsonFormat.parser().merge(reader, builder);
125+
126+
Document document = builder.build();
127+
128+
// Get all of the document text as one big string.
129+
String text = document.getText();
130+
131+
// Read the text recognition output from the processor
132+
System.out.println("The document contains the following paragraphs:");
133+
Document.Page page1 = document.getPages(0);
134+
List<Document.Page.Paragraph> paragraphList = page1.getParagraphsList();
135+
for (Document.Page.Paragraph paragraph : paragraphList) {
136+
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
137+
System.out.printf("Paragraph text:%s\n", paragraphText);
138+
}
139+
140+
// Form parsing provides additional output about
141+
// form-formatted PDFs. You must create a form
142+
// processor in the Cloud Console to see full field details.
143+
System.out.println("The following form key/value pairs were detected:");
144+
145+
for (Document.Page.FormField field : page1.getFormFieldsList()) {
146+
String fieldName = getText(field.getFieldName().getTextAnchor(), text);
147+
String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
148+
149+
System.out.println("Extracted form fields pair:");
150+
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
151+
}
152+
153+
// Clean up temp file.
154+
tempFile.deleteOnExit();
155+
}
156+
}
157+
}
158+
}
159+
160+
// Extract shards from the text field
161+
private static String getText(Document.TextAnchor textAnchor, String text) {
162+
if (textAnchor.getTextSegmentsList().size() > 0) {
163+
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
164+
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
165+
return text.substring(startIdx, endIdx);
166+
}
167+
return "[NO TEXT]";
168+
}
169+
}
170+
// [END documentai_batch_process_document]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* Copyright 2020 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package documentai.v1beta3;
18+
19+
// [START documentai_process_document]
20+
21+
import com.google.cloud.documentai.v1beta3.Document;
22+
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
23+
import com.google.cloud.documentai.v1beta3.ProcessRequest;
24+
import com.google.cloud.documentai.v1beta3.ProcessResponse;
25+
import com.google.protobuf.ByteString;
26+
import java.io.IOException;
27+
import java.nio.file.Files;
28+
import java.nio.file.Paths;
29+
import java.util.List;
30+
import java.util.concurrent.ExecutionException;
31+
import java.util.concurrent.TimeoutException;
32+
33+
public class ProcessDocumentBeta {
34+
public static void processDocument()
35+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
36+
// TODO(developer): Replace these variables before running the sample.
37+
String projectId = "your-project-id";
38+
String location = "your-project-location"; // Format is "us" or "eu".
39+
String processerId = "your-processor-id";
40+
String filePath = "path/to/input/file.pdf";
41+
processDocument(projectId, location, processerId, filePath);
42+
}
43+
44+
public static void processDocument(
45+
String projectId, String location, String processorId, String filePath)
46+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
47+
// Initialize client that will be used to send requests. This client only needs to be created
48+
// once, and can be reused for multiple requests. After completing all of your requests, call
49+
// the "close" method on the client to safely clean up any remaining background resources.
50+
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
51+
// The full resource name of the processor, e.g.:
52+
// projects/project-id/locations/location/processor/processor-id
53+
// You must create new processors in the Cloud Console first
54+
String name =
55+
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
56+
57+
// Read the file.
58+
byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
59+
60+
// Convert the image data to a Buffer and base64 encode it.
61+
ByteString content = ByteString.copyFrom(imageFileData);
62+
63+
Document document =
64+
Document.newBuilder().setContent(content).setMimeType("application/pdf").build();
65+
66+
// Configure the process request.
67+
ProcessRequest request =
68+
ProcessRequest.newBuilder().setName(name).setDocument(document).build();
69+
70+
// Recognizes text entities in the PDF document
71+
ProcessResponse result = client.processDocument(request);
72+
Document documentResponse = result.getDocument();
73+
74+
// Get all of the document text as one big string
75+
String text = documentResponse.getText();
76+
77+
// Read the text recognition output from the processor
78+
System.out.println("The document contains the following paragraphs:");
79+
Document.Page firstPage = documentResponse.getPages(0);
80+
List<Document.Page.Paragraph> paragraphs = firstPage.getParagraphsList();
81+
82+
for (Document.Page.Paragraph paragraph : paragraphs) {
83+
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
84+
System.out.printf("Paragraph text:\n%s\n", paragraphText);
85+
}
86+
87+
// Form parsing provides additional output about
88+
// form-formatted PDFs. You must create a form
89+
// processor in the Cloud Console to see full field details.
90+
System.out.println("The following form key/value pairs were detected:");
91+
92+
for (Document.Page.FormField field : firstPage.getFormFieldsList()) {
93+
String fieldName = getText(field.getFieldName().getTextAnchor(), text);
94+
String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
95+
96+
System.out.println("Extracted form fields pair:");
97+
System.out.printf("\t(%s, %s))\n", fieldName, fieldValue);
98+
}
99+
}
100+
}
101+
102+
// Extract shards from the text field
103+
private static String getText(Document.TextAnchor textAnchor, String text) {
104+
if (textAnchor.getTextSegmentsList().size() > 0) {
105+
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
106+
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
107+
return text.substring(startIdx, endIdx);
108+
}
109+
return "[NO TEXT]";
110+
}
111+
}
112+
// [END documentai_process_document]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright 2020 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package documentai.v1beta3;
18+
19+
// [START documentai_quickstart]
20+
21+
import com.google.cloud.documentai.v1beta3.Document;
22+
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
23+
import com.google.cloud.documentai.v1beta3.ProcessRequest;
24+
import com.google.cloud.documentai.v1beta3.ProcessResponse;
25+
import com.google.protobuf.ByteString;
26+
import java.io.IOException;
27+
import java.nio.file.Files;
28+
import java.nio.file.Paths;
29+
import java.util.List;
30+
import java.util.concurrent.ExecutionException;
31+
import java.util.concurrent.TimeoutException;
32+
33+
public class QuickStart {
34+
public static void quickStart()
35+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
36+
// TODO(developer): Replace these variables before running the sample.
37+
String projectId = "your-project-id";
38+
String location = "your-project-location"; // Format is "us" or "eu".
39+
String processerId = "your-processor-id";
40+
String filePath = "path/to/input/file.pdf";
41+
quickStart(projectId, location, processerId, filePath);
42+
}
43+
44+
public static void quickStart(
45+
String projectId, String location, String processorId, String filePath)
46+
throws IOException, InterruptedException, ExecutionException, TimeoutException {
47+
// Initialize client that will be used to send requests. This client only needs to be created
48+
// once, and can be reused for multiple requests. After completing all of your requests, call
49+
// the "close" method on the client to safely clean up any remaining background resources.
50+
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
51+
// The full resource name of the processor, e.g.:
52+
// projects/project-id/locations/location/processor/processor-id
53+
// You must create new processors in the Cloud Console first
54+
String name =
55+
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
56+
57+
// Read the file.
58+
byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
59+
60+
// Convert the image data to a Buffer and base64 encode it.
61+
ByteString content = ByteString.copyFrom(imageFileData);
62+
63+
Document document =
64+
Document.newBuilder().setContent(content).setMimeType("application/pdf").build();
65+
66+
// Configure the process request.
67+
ProcessRequest request =
68+
ProcessRequest.newBuilder().setName(name).setDocument(document).build();
69+
70+
// Recognizes text entities in the PDF document
71+
ProcessResponse result = client.processDocument(request);
72+
Document documentResponse = result.getDocument();
73+
74+
// Get all of the document text as one big string
75+
String text = documentResponse.getText();
76+
77+
// Read the text recognition output from the processor
78+
System.out.println("The document contains the following paragraphs:");
79+
Document.Page firstPage = documentResponse.getPages(0);
80+
List<Document.Page.Paragraph> paragraphs = firstPage.getParagraphsList();
81+
82+
for (Document.Page.Paragraph paragraph : paragraphs) {
83+
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
84+
System.out.printf("Paragraph text:\n%s\n", paragraphText);
85+
}
86+
}
87+
}
88+
89+
// Extract shards from the text field
90+
private static String getText(Document.TextAnchor textAnchor, String text) {
91+
if (textAnchor.getTextSegmentsList().size() > 0) {
92+
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
93+
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
94+
return text.substring(startIdx, endIdx);
95+
}
96+
return "[NO TEXT]";
97+
}
98+
}
99+
// [END documentai_quickstart]

0 commit comments

Comments
 (0)