Lokasi ngalangkungan proxy:   [ UP ]  
[Ngawartoskeun bug]   [Panyetelan cookie]                
Skip to content

Commit 39d7fdb

Browse files
authored
fixit: Changes to DocAI Samples to simplify maintenance (#9743)
- Consolidated process/batch process samples with processor versions
- Update Quickstart to include Processor Creation
- Consolidate Handle Response Samples into a single file to reduce code duplication
- ~Not sure if I did the region tags completely correct, is there a way to test?~
- Update: Tested using Devsite staging and it looks correct
- Moved Operations Tests into a single file and removed exception handling in sample
- Add type updates from fixit #9991
1 parent 96e1c52 commit 39d7fdb

33 files changed

Lines changed: 847 additions & 1400 deletions

documentai/snippets/batch_process_documents_processor_version_sample.py

Lines changed: 0 additions & 163 deletions
This file was deleted.

documentai/snippets/batch_process_documents_processor_version_sample_test.py

Lines changed: 0 additions & 50 deletions
This file was deleted.

documentai/snippets/batch_process_documents_sample.py

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515

1616
# [START documentai_batch_process_document]
17+
# [START documentai_batch_process_documents_processor_version]
1718
import re
1819
from typing import Optional
1920

@@ -24,61 +25,67 @@
2425
from google.cloud import storage
2526

2627
# TODO(developer): Uncomment these variables before running the sample.
27-
# project_id = 'YOUR_PROJECT_ID'
28-
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
29-
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
28+
# project_id = "YOUR_PROJECT_ID"
29+
# location = "YOUR_PROCESSOR_LOCATION" # Format is "us" or "eu"
30+
# processor_id = "YOUR_PROCESSOR_ID" # Create processor before running sample
31+
# gcs_output_uri = "YOUR_OUTPUT_URI" # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
32+
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Optional. Example: pretrained-ocr-v1.0-2020-09-23
33+
34+
# TODO(developer): You must specify either `gcs_input_uri` and `input_mime_type` or `gcs_input_prefix`
3035
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
3136
# input_mime_type = "application/pdf"
32-
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
33-
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
37+
# gcs_input_prefix = "YOUR_INPUT_URI_PREFIX" # Format: gs://bucket/directory/
3438
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.
3539

3640

3741
def batch_process_documents(
3842
project_id: str,
3943
location: str,
4044
processor_id: str,
41-
gcs_input_uri: str,
42-
input_mime_type: str,
43-
gcs_output_bucket: str,
44-
gcs_output_uri_prefix: str,
45+
gcs_output_uri: str,
46+
processor_version_id: Optional[str] = None,
47+
gcs_input_uri: Optional[str] = None,
48+
input_mime_type: Optional[str] = None,
49+
gcs_input_prefix: Optional[str] = None,
4550
field_mask: Optional[str] = None,
4651
timeout: int = 400,
4752
) -> None:
48-
# You must set the api_endpoint if you use a location other than 'us'.
53+
# You must set the `api_endpoint` if you use a location other than "us".
4954
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
5055

5156
client = documentai.DocumentProcessorServiceClient(client_options=opts)
5257

53-
gcs_document = documentai.GcsDocument(
54-
gcs_uri=gcs_input_uri, mime_type=input_mime_type
55-
)
56-
57-
# Load GCS Input URI into a List of document files
58-
gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
59-
input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
60-
61-
# NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
62-
#
63-
# gcs_input_uri = "gs://bucket/directory/"
64-
# gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
65-
# input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
66-
#
58+
if gcs_input_uri:
59+
# Specify a GCS URI to process an individual document
60+
gcs_document = documentai.GcsDocument(
61+
gcs_uri=gcs_input_uri, mime_type=input_mime_type
62+
)
63+
# Load GCS Input URI into a List of document files
64+
gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
65+
input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
66+
else:
67+
# Specify a GCS URI Prefix to process an entire directory
68+
gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
69+
input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
6770

6871
# Cloud Storage URI for the Output Directory
69-
# This must end with a trailing forward slash `/`
70-
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}"
71-
7272
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
73-
gcs_uri=destination_uri, field_mask=field_mask
73+
gcs_uri=gcs_output_uri, field_mask=field_mask
7474
)
7575

7676
# Where to write results
7777
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
7878

79-
# The full resource name of the processor, e.g.:
80-
# projects/project_id/locations/location/processor/processor_id
81-
name = client.processor_path(project_id, location, processor_id)
79+
if processor_version_id:
80+
# The full resource name of the processor version, e.g.:
81+
# projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
82+
name = client.processor_version_path(
83+
project_id, location, processor_id, processor_version_id
84+
)
85+
else:
86+
# The full resource name of the processor, e.g.:
87+
# projects/{project_id}/locations/{location}/processors/{processor_id}
88+
name = client.processor_path(project_id, location, processor_id)
8289

8390
request = documentai.BatchProcessRequest(
8491
name=name,
@@ -91,7 +98,7 @@ def batch_process_documents(
9198

9299
# Continually polls the operation until it is complete.
93100
# This could take some time for larger files
94-
# Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
101+
# Format: projects/{project_id}/locations/{location}/operations/{operation_id}
95102
try:
96103
print(f"Waiting for operation {operation.operation.name} to complete...")
97104
operation.result(timeout=timeout)
@@ -117,7 +124,7 @@ def batch_process_documents(
117124

118125
print("Output files:")
119126
# One process per Input Document
120-
for process in metadata.individual_process_statuses:
127+
for process in list(metadata.individual_process_statuses):
121128
# output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
122129
# The Cloud Storage API requires the bucket name and URI prefix separately
123130
matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
@@ -136,7 +143,7 @@ def batch_process_documents(
136143
# Document AI may output multiple JSON files per source file
137144
for blob in output_blobs:
138145
# Document AI should only output JSON files to GCS
139-
if ".json" not in blob.name:
146+
if blob.content_type != "application/json":
140147
print(
141148
f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
142149
)
@@ -156,4 +163,5 @@ def batch_process_documents(
156163
print(document.text)
157164

158165

166+
# [END documentai_batch_process_documents_processor_version]
159167
# [END documentai_batch_process_document]

0 commit comments

Comments
 (0)