6
6
python manage.py demo_ocr --clear # Clear previous demo data first
7
7
"""
8
8
9
- import os
10
9
from pathlib import Path
11
10
11
+ from django .conf import settings
12
12
from django .core .files import File
13
13
from django .core .management .base import BaseCommand
14
- from django .conf import settings
15
14
16
15
from genealogy .models import Document , DocumentPage
17
16
from genealogy .tasks import process_page_ocr
@@ -28,7 +27,7 @@ def add_arguments(self, parser):
28
27
)
29
28
parser .add_argument (
30
29
"--sync" ,
31
- action = "store_true" ,
30
+ action = "store_true" ,
32
31
help = "Run OCR synchronously instead of using Celery tasks" ,
33
32
)
34
33
@@ -95,18 +94,17 @@ def _clear_demo_data(self):
95
94
def _create_demo_document(self, file_path: Path, description: str) -> Document:
    """Create the demo ``Document`` record for one input file.

    The title combines the file's stem with ``description``; the language
    is fixed to English. Returns whatever ``Document.objects.create``
    returns.
    """
    demo_fields = {
        "title": f"Demo: {file_path.stem} - {description} ",
        "languages": "eng",  # Default to English for demo
    }
    return Document.objects.create(**demo_fields)
104
102
105
103
def _create_pages_for_document (self , document : Document , file_path : Path ) -> int :
106
104
"""Create document pages from the PDF file"""
107
105
with open (file_path , "rb" ) as f :
108
106
django_file = File (f , name = file_path .name )
109
-
107
+
110
108
# For demo, treat each PDF as a single page
111
109
# In reality, the admin interface would handle multi-page PDFs
112
110
page = DocumentPage .objects .create (
@@ -115,65 +113,71 @@ def _create_pages_for_document(self, document: Document, file_path: Path) -> int
115
113
image_file = django_file ,
116
114
original_filename = file_path .name ,
117
115
)
118
-
116
+
119
117
return 1
120
118
121
119
def _process_ocr_sync(self, document: Document):
    """Run OCR inline for every page of ``document`` and report the outcome.

    Each page is validated, passed through ``OCRProcessor.process_file``,
    updated with the recognised text, confidence and rotation, and saved.
    A failure on one page is written to stdout in the error style and the
    loop moves on to the next page.
    """
    self.stdout.write(" 🔄 Processing OCR (synchronous)...")

    for page in document.pages.all():
        try:
            page.validate_for_ocr()

            # Imported lazily, inside the try, so an import problem is
            # reported per page like any other OCR failure.
            from genealogy.ocr_processor import OCRProcessor

            engine = OCRProcessor()
            source_path = page.image_file.path

            extracted, score, angle = engine.process_file(source_path)

            page.ocr_text = extracted
            page.ocr_confidence = score
            page.rotation_applied = angle
            page.ocr_completed = True
            page.save()

            summary = (
                f" ✅ OCR complete - {score:.1f} % confidence, "
                f"{len(extracted)} characters extracted"
            )
            self.stdout.write(summary)

            # Show at most the first 100 characters, with an ellipsis
            # when the extracted text is longer than that.
            snippet = extracted[:100].replace("\n ", " ").strip()
            ellipsis = "..." if len(extracted) > 100 else ""
            self.stdout.write(f" 📝 Preview: {snippet + ellipsis} ")

        except Exception as exc:
            failure = f" ❌ OCR failed for page {page.page_number}: {exc} "
            self.stdout.write(self.style.ERROR(failure))
158
158
159
159
def _process_ocr_async(self, document: Document):
    """Queue one ``process_page_ocr`` task per page of ``document``.

    Each page is validated and then handed to ``process_page_ocr.delay``
    with its id as a string. A failure on one page is written to stdout in
    the error style and does not stop the remaining pages. A summary line
    is printed when at least one task was counted as queued.
    """
    self.stdout.write(" 🔄 Queuing OCR tasks (asynchronous)...")

    queued = 0
    for page in document.pages.all():
        try:
            page.validate_for_ocr()
            job = process_page_ocr.delay(str(page.id))
            queued += 1
            notice = f" 📋 Queued OCR task {job.id} for page {page.page_number} "
            self.stdout.write(notice)
        except Exception as exc:
            failure = (
                f" ❌ Failed to queue OCR for page {page.page_number}: {exc} "
            )
            self.stdout.write(self.style.ERROR(failure))

    if queued:
        self.stdout.write(
            f" ⏱️ {queued} OCR task(s) queued. "
            "Check the admin interface to see results as they complete."
        )
0 commit comments