1414
1515
1616# [START documentai_batch_process_document]
17+ # [START documentai_batch_process_documents_processor_version]
1718import re
1819from typing import Optional
1920
2425from google .cloud import storage
2526
2627# TODO(developer): Uncomment these variables before running the sample.
27- # project_id = 'YOUR_PROJECT_ID'
28- # location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
29- # processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
28+ # project_id = "YOUR_PROJECT_ID"
29+ # location = "YOUR_PROCESSOR_LOCATION" # Format is "us" or "eu"
30+ # processor_id = "YOUR_PROCESSOR_ID" # Create processor before running sample
31+ # gcs_output_uri = "YOUR_OUTPUT_URI" # Must end with a trailing slash `/`. Format: gs://bucket/directory/subdirectory/
32+ # processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Optional. Example: pretrained-ocr-v1.0-2020-09-23
33+
34+ # TODO(developer): You must specify either `gcs_input_uri` and `input_mime_type`, or `gcs_input_prefix`
3035# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
3136# input_mime_type = "application/pdf"
32- # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
33- # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
37+ # gcs_input_prefix = "YOUR_INPUT_URI_PREFIX" # Format: gs://bucket/directory/
3438# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.
3539
3640
3741def batch_process_documents (
3842 project_id : str ,
3943 location : str ,
4044 processor_id : str ,
41- gcs_input_uri : str ,
42- input_mime_type : str ,
43- gcs_output_bucket : str ,
44- gcs_output_uri_prefix : str ,
45+ gcs_output_uri : str ,
46+ processor_version_id : Optional [str ] = None ,
47+ gcs_input_uri : Optional [str ] = None ,
48+ input_mime_type : Optional [str ] = None ,
49+ gcs_input_prefix : Optional [str ] = None ,
4550 field_mask : Optional [str ] = None ,
4651 timeout : int = 400 ,
4752) -> None :
48- # You must set the api_endpoint if you use a location other than 'us' .
53+ # You must set the `api_endpoint` if you use a location other than "us".
4954 opts = ClientOptions (api_endpoint = f"{ location } -documentai.googleapis.com" )
5055
5156 client = documentai .DocumentProcessorServiceClient (client_options = opts )
5257
53- gcs_document = documentai .GcsDocument (
54- gcs_uri = gcs_input_uri , mime_type = input_mime_type
55- )
56-
57- # Load GCS Input URI into a List of document files
58- gcs_documents = documentai .GcsDocuments (documents = [gcs_document ])
59- input_config = documentai .BatchDocumentsInputConfig (gcs_documents = gcs_documents )
60-
61- # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
62- #
63- # gcs_input_uri = "gs://bucket/directory/"
64- # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
65- # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
66- #
58+ if gcs_input_uri :
59+ # Specify the GCS URI of an individual document to process
60+ gcs_document = documentai .GcsDocument (
61+ gcs_uri = gcs_input_uri , mime_type = input_mime_type
62+ )
63+ # Load GCS Input URI into a List of document files
64+ gcs_documents = documentai .GcsDocuments (documents = [gcs_document ])
65+ input_config = documentai .BatchDocumentsInputConfig (gcs_documents = gcs_documents )
66+ else :
67+ # Specify a GCS URI Prefix to process an entire directory
68+ gcs_prefix = documentai .GcsPrefix (gcs_uri_prefix = gcs_input_prefix )
69+ input_config = documentai .BatchDocumentsInputConfig (gcs_prefix = gcs_prefix )
6770
6871 # Cloud Storage URI for the Output Directory
69- # This must end with a trailing forward slash `/`
70- destination_uri = f"{ gcs_output_bucket } /{ gcs_output_uri_prefix } "
71-
7272 gcs_output_config = documentai .DocumentOutputConfig .GcsOutputConfig (
73- gcs_uri = destination_uri , field_mask = field_mask
73+ gcs_uri = gcs_output_uri , field_mask = field_mask
7474 )
7575
7676 # Where to write results
7777 output_config = documentai .DocumentOutputConfig (gcs_output_config = gcs_output_config )
7878
79- # The full resource name of the processor, e.g.:
80- # projects/project_id/locations/location/processor/processor_id
81- name = client .processor_path (project_id , location , processor_id )
79+ if processor_version_id :
80+ # The full resource name of the processor version, e.g.:
81+ # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
82+ name = client .processor_version_path (
83+ project_id , location , processor_id , processor_version_id
84+ )
85+ else :
86+ # The full resource name of the processor, e.g.:
87+ # projects/{project_id}/locations/{location}/processors/{processor_id}
88+ name = client .processor_path (project_id , location , processor_id )
8289
8390 request = documentai .BatchProcessRequest (
8491 name = name ,
@@ -91,7 +98,7 @@ def batch_process_documents(
9198
9299 # Continually polls the operation until it is complete.
93100 # This could take some time for larger files
94- # Format: projects/PROJECT_NUMBER /locations/LOCATION /operations/OPERATION_ID
101+ # Format: projects/{project_id}/locations/{location}/operations/{operation_id}
95102 try :
96103 print (f"Waiting for operation { operation .operation .name } to complete..." )
97104 operation .result (timeout = timeout )
@@ -117,7 +124,7 @@ def batch_process_documents(
117124
118125 print ("Output files:" )
119126 # One process per Input Document
120- for process in metadata .individual_process_statuses :
127+ for process in list ( metadata .individual_process_statuses ) :
121128 # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
122129 # The Cloud Storage API requires the bucket name and URI prefix separately
123130 matches = re .match (r"gs://(.*?)/(.*)" , process .output_gcs_destination )
@@ -136,7 +143,7 @@ def batch_process_documents(
136143 # Document AI may output multiple JSON files per source file
137144 for blob in output_blobs :
138145 # Document AI should only output JSON files to GCS
139- if ".json" not in blob . name :
146+ if blob . content_type != "application/json" :
140147 print (
141148 f"Skipping non-supported file: { blob .name } - Mimetype: { blob .content_type } "
142149 )
@@ -156,4 +163,5 @@ def batch_process_documents(
156163 print (document .text )
157164
158165
166+ # [END documentai_batch_process_documents_processor_version]
159167# [END documentai_batch_process_document]
0 commit comments