77import io
88import os
99import json
10+ import base64
11+
12+ # Libraries used in the future Document Processing client code - Migrate to utils
13+ from azure .identity import DefaultAzureCredential
14+ from azure .ai .documentintelligence import DocumentIntelligenceClient
15+ from azure .ai .documentintelligence .models import AnalyzeResult , AnalyzeDocumentRequest
16+
17+ # Variables used in the future Document Processing client code - Migrate to utils
18+ endpoint = "" # Add the endpoint URL for the Document Processing service
19+
def extract_text_from_blob(blob_name):
    """Extract text from a blob in the "bronze" container using Document Intelligence.

    The blob content is fetched with ``get_blob_content``, base64-encoded and
    submitted to the "prebuilt-read" model. The contents of all paragraphs in
    the analysis result are joined into a single string.

    Args:
        blob_name: Name of the blob inside the "bronze" container.

    Returns:
        The joined paragraph text, or ``None`` when the analysis result has no
        paragraphs or when any step fails (the failure is logged, not raised).
    """
    try:
        credential = DefaultAzureCredential()

        # `endpoint` is the module-level service URL configured above.
        client = DocumentIntelligenceClient(
            endpoint=endpoint, credential=credential
        )

        content = get_blob_content("bronze", blob_name)

        # NOTE(review): the document is passed as an already base64-encoded
        # string; confirm the installed SDK version does not encode it again.
        base64_content = base64.b64encode(content).decode('utf-8')

        poller = client.begin_analyze_document(
            # AnalyzeDocumentRequest Class: https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.models.analyzedocumentrequest?view=azure-python
            "prebuilt-read",
            AnalyzeDocumentRequest(bytes_source=base64_content),
        )
        result: AnalyzeResult = poller.result()

        # Guard explicitly: without it the text variable would be unbound when
        # the service returns no paragraphs, and the resulting
        # UnboundLocalError would be reported as a processing error.
        if not result.paragraphs:
            logging.warning(f"No paragraphs found in {blob_name}")
            return None

        return "\n ".join([paragraph.content for paragraph in result.paragraphs])

    except Exception as e:
        # Best-effort boundary: callers treat None as "extraction failed".
        logging.error(f"Error processing {blob_name} : {e} ")
        return None
1046
1147def extract_text_from_docx (blob_name ):
1248 try :
@@ -48,6 +84,9 @@ def main(req: func.HttpRequest) -> func.HttpResponse:
4884
4985 processed_files = []
5086 errors = []
87+
88+ # Document Intelligence supported suffixes
89+ suffixes = (".jpg" , ".jpeg" , ".png" , ".tiff" , ".docx" , ".xlsx" , ".pptx" , ".pdf" )
5190
5291 # Lists blobs in the 'bronze' container
5392 if selected_blobs :
@@ -58,6 +97,17 @@ def main(req: func.HttpRequest) -> func.HttpResponse:
5897 try :
5998 blob_name = blob .get ("name" )
6099
100+ # Extract text from supported file types using Document Intelligence
101+ # if blob_name.endswith(suffixes):
102+ # logging.info(f"Processing: {blob_name}")
103+ # text = extract_text_from_blob(blob_name)
104+ # if text:
105+ # sourcefile = os.path.splitext(os.path.basename(blob_name))[0]
106+ # write_to_blob(f"silver", f"{sourcefile}.txt", text)
107+ # processed_files.append(blob_name)
108+ # else:
109+ # errors.append(f"Failed to extract text from: {blob_name}")
110+
61111 if blob_name .endswith (".docx" ):
62112 logging .info (f"Processing DOCX: { blob_name } " )
63113 text = extract_text_from_docx (blob_name )
0 commit comments