Skip to content

Commit c87a27c

Browse files
Merge pull request Azure#1 from pradorodriguez/branch-mp-dev2
Adding document intelligence code
2 parents 51328dd + ce18abf commit c87a27c

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

processUploads/__init__.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,42 @@
77
import io
88
import os
99
import json
10+
import base64
11+
12+
# Libraries for the Document Intelligence client code below (TODO: migrate this code to utils)
13+
from azure.identity import DefaultAzureCredential
14+
from azure.ai.documentintelligence import DocumentIntelligenceClient
15+
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest
16+
17+
# Variables for the Document Intelligence client code below (TODO: migrate this code to utils)
18+
# Endpoint URL handed to DocumentIntelligenceClient in extract_text_from_blob.
# NOTE(review): left empty in source; presumably must be set (or loaded from
# configuration) before the client can reach the service — confirm.
endpoint = "" # Add the endpoint URL for the Document Intelligence service
19+
20+
def extract_text_from_blob(blob_name: str):
    """Extract the text of a 'bronze' blob using Document Intelligence.

    The blob is fetched with ``get_blob_content``, analyzed with the
    ``prebuilt-read`` model, and the content of every returned paragraph is
    joined with newlines.

    Args:
        blob_name: Name of the blob inside the 'bronze' container.

    Returns:
        The newline-joined paragraph text, or ``None`` when the analysis
        returns no paragraphs or any step fails. Failures are logged rather
        than raised so the caller can record the blob as an error and move on.
    """
    try:
        # Close the credential and the client once the analysis is done
        # instead of leaving a fresh pair open on every call.
        with DefaultAzureCredential() as credential, DocumentIntelligenceClient(
            endpoint=endpoint, credential=credential
        ) as client:
            content = get_blob_content("bronze", blob_name)

            # NOTE(review): the payload is base64-encoded here before being
            # handed to the SDK; the AnalyzeDocumentRequest reference types
            # bytes_source as bytes — confirm whether raw bytes should be
            # passed instead.
            base64_content = base64.b64encode(content).decode("utf-8")

            # AnalyzeDocumentRequest Class: https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.models.analyzedocumentrequest?view=azure-python
            poller = client.begin_analyze_document(
                "prebuilt-read",
                AnalyzeDocumentRequest(bytes_source=base64_content),
            )
            result: AnalyzeResult = poller.result()

        # Handle the "nothing recognized" case explicitly rather than falling
        # through to an unbound local and a misleading error log.
        if not result.paragraphs:
            logging.warning("No paragraphs found in %s", blob_name)
            return None

        return "\n".join(paragraph.content for paragraph in result.paragraphs)

    except Exception as e:
        # Top-level boundary for this helper: log with traceback, signal
        # failure to the caller with None.
        logging.exception("Error processing %s: %s", blob_name, e)
        return None
1046

1147
def extract_text_from_docx(blob_name):
1248
try:
@@ -48,6 +84,9 @@ def main(req: func.HttpRequest) -> func.HttpResponse:
4884

4985
processed_files = []
5086
errors = []
87+
88+
# Document Intelligence supported suffixes
89+
suffixes = (".jpg", ".jpeg", ".png", ".tiff", ".docx", ".xlsx", ".pptx", ".pdf")
5190

5291
# Lists blobs in the 'bronze' container
5392
if selected_blobs:
@@ -58,6 +97,17 @@ def main(req: func.HttpRequest) -> func.HttpResponse:
5897
try:
5998
blob_name = blob.get("name")
6099

100+
# Extract text from supported file types using Document Intelligence
101+
# if blob_name.endswith(suffixes):
102+
# logging.info(f"Processing: {blob_name}")
103+
# text = extract_text_from_blob(blob_name)
104+
# if text:
105+
# sourcefile = os.path.splitext(os.path.basename(blob_name))[0]
106+
# write_to_blob(f"silver", f"{sourcefile}.txt", text)
107+
# processed_files.append(blob_name)
108+
# else:
109+
# errors.append(f"Failed to extract text from: {blob_name}")
110+
61111
if blob_name.endswith(".docx"):
62112
logging.info(f"Processing DOCX: {blob_name}")
63113
text = extract_text_from_docx(blob_name)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
annotated-types==0.7.0
22
anyio==4.8.0
3+
azure-ai-documentintelligence==1.0.0
34
azure-ai-textanalytics==5.2.0
45
azure-cognitiveservices-speech==1.41.1
56
azure-common==1.1.28

0 commit comments

Comments
 (0)