evz
diff --git a/‎.bandit‎
Lines changed: 2 additions & 0 deletions b/‎.bandit‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.env.example‎
Lines changed: 6 additions & 0 deletions b/‎.env.example‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 6 additions & 0 deletions b/‎Dockerfile‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 64 additions & 4 deletions b/‎Makefile‎
Lines changed: 64 additions & 4 deletions
diff --git a/‎README.md‎
Lines changed: 46 additions & 7 deletions b/‎README.md‎
Lines changed: 46 additions & 7 deletions
@@ -7,5 +7,7 @@ exclude_dirs:
   - ".git"
 
 # Allow hardcoded SECRET_KEY in development settings
+# Allow HuggingFace downloads without revision pinning for development
 skips:
   - "B105"
+  - "B615"
@@ -23,3 +23,9 @@ DJANGO_SUPERUSER_EMAIL=admin@localhost
 
 # Media Storage
 MEDIA_ROOT=/app/media
+
+# Ollama Configuration
+OLLAMA_HOST=the-area.local
+OLLAMA_PORT=11434
+OLLAMA_LLM_MODEL=aya:35b-23
+OLLAMA_EMBEDDING_MODEL=zylonai/multilingual-e5-large:latest
@@ -138,6 +138,8 @@ Thumbs.db
 media/
 staticfiles/
 static_root/
+models/
+training_*/
 
 # Docker
 .dockerignore
 
@@ -11,6 +11,12 @@ RUN apt-get update && apt-get install -y \
     tesseract-ocr-nld \
     libtesseract-dev \
     poppler-utils \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements and install Python dependencies
 
@@ -144,17 +144,77 @@ test-tasks: ensure-containers ## Run only task tests in Docker
 
 
 ##@ Docker Commands
+# Resolve mDNS hostname for Ollama host
+resolve-ollama-host:
+	@echo "Resolving Ollama host..."
+	@if [ -f .env ]; then \
+		OLLAMA_HOST=$$(grep "^OLLAMA_HOST=" .env 2>/dev/null | cut -d'=' -f2 | tr -d ' "'\'''); \
+		if [ -n "$$OLLAMA_HOST" ] && echo "$$OLLAMA_HOST" | grep -q "\.local$$"; then \
+			echo "Found mDNS hostname in .env: $$OLLAMA_HOST"; \
+			if command -v avahi-resolve >/dev/null 2>&1; then \
+				echo "Attempting to resolve $$OLLAMA_HOST using mDNS..."; \
+				RESOLVED_IP=$$(avahi-resolve -4 -n "$$OLLAMA_HOST" 2>/dev/null | awk '{print $$2}' | head -1); \
+				if [ -n "$$RESOLVED_IP" ] && [ "$$RESOLVED_IP" != "$$OLLAMA_HOST" ]; then \
+					echo "✅ Resolved $$OLLAMA_HOST to $$RESOLVED_IP"; \
+					echo "OLLAMA_HOST=$$RESOLVED_IP" > .env.ollama; \
+					OLLAMA_PORT=$$(grep "^OLLAMA_PORT=" .env 2>/dev/null | cut -d'=' -f2 | tr -d ' "'\'''); \
+					OLLAMA_EMBEDDING_MODEL=$$(grep "^OLLAMA_EMBEDDING_MODEL=" .env 2>/dev/null | cut -d'=' -f2 | tr -d ' "'\'''); \
+					OLLAMA_LLM_MODEL=$$(grep "^OLLAMA_LLM_MODEL=" .env 2>/dev/null | cut -d'=' -f2 | tr -d ' "'\'''); \
+					[ -n "$$OLLAMA_PORT" ] && echo "OLLAMA_PORT=$$OLLAMA_PORT" >> .env.ollama; \
+					[ -n "$$OLLAMA_EMBEDDING_MODEL" ] && echo "OLLAMA_EMBEDDING_MODEL=$$OLLAMA_EMBEDDING_MODEL" >> .env.ollama; \
+					[ -n "$$OLLAMA_LLM_MODEL" ] && echo "OLLAMA_LLM_MODEL=$$OLLAMA_LLM_MODEL" >> .env.ollama; \
+				else \
+					echo "⚠️  Could not resolve $$OLLAMA_HOST, using original configuration"; \
+					rm -f .env.ollama; \
+				fi; \
+			else \
+				echo "⚠️  avahi-resolve not available, using original configuration"; \
+				rm -f .env.ollama; \
+			fi; \
+		else \
+			echo "OLLAMA_HOST is not an mDNS hostname (.local), no resolution needed"; \
+			rm -f .env.ollama; \
+		fi; \
+	else \
+		echo "No .env file found, skipping mDNS resolution"; \
+		rm -f .env.ollama; \
+	fi
+
 build: ## Build Docker containers
 	@echo "$(YELLOW)🐳 Building Docker containers...$(NC)"
 	docker compose build
 
-up: ## Start all Docker services
+up: resolve-ollama-host ## Start all Docker services
 	@echo "$(YELLOW)🚀 Starting Docker services...$(NC)"
-	docker compose up -d
+	@ENV_FILES=""; \
+	if [ -f .env ]; then \
+		ENV_FILES="--env-file .env"; \
+	fi; \
+	if [ -f .env.ollama ]; then \
+		echo "Using dynamically resolved Ollama configuration"; \
+		ENV_FILES="$$ENV_FILES --env-file .env.ollama"; \
+	fi; \
+	if [ -n "$$ENV_FILES" ]; then \
+		docker compose $$ENV_FILES up -d; \
+	else \
+		docker compose up -d; \
+	fi
 
-up-build: ## Build and start all Docker services
+up-build: resolve-ollama-host ## Build and start all Docker services
 	@echo "$(YELLOW)🚀 Building and starting Docker services...$(NC)"
-	docker compose up --build -d
+	@ENV_FILES=""; \
+	if [ -f .env ]; then \
+		ENV_FILES="--env-file .env"; \
+	fi; \
+	if [ -f .env.ollama ]; then \
+		echo "Using dynamically resolved Ollama configuration"; \
+		ENV_FILES="$$ENV_FILES --env-file .env.ollama"; \
+	fi; \
+	if [ -n "$$ENV_FILES" ]; then \
+		docker compose $$ENV_FILES up --build -d; \
+	else \
+		docker compose up --build -d; \
+	fi
 
 down: ## Stop all Docker services
 	@echo "$(YELLOW)⏹️ Stopping Docker services...$(NC)"
 
@@ -31,11 +31,30 @@ The demo processes a couple sample pages from a book about my family and extract
 
 ## Current Status
 
-OCR processing pipeline implemented: multi-format documents (PDF, JPG, PNG, TIFF), multi-language OCR (English/Dutch), batch upload, background processing with Celery.
-
-Uses Django admin interface to prototype and test business logic before building custom UI. This approach allows rapid iteration on data models and processing workflows.
-
-**Next**: AI-powered extraction to structured genealogy data.
+### OCR Processing Pipeline - Testing & Refinement
+- Multi-format document processing (PDF, JPG, PNG, TIFF) with Tesseract
+- Multi-language support (English/Dutch) for genealogical texts
+- Advanced rotation detection using computer vision techniques (Hough line detection, projection profiles)
+- Two-stage rotation correction: major angles (0°/90°/180°/270°) + fine-angle adjustments (±10°)
+- Batch upload functionality and background processing with Celery/Redis
+
+### AI-Powered Entity Extraction - Implemented
+- **Neural Network NER (Named Entity Recognition)**: Custom BERT-based model fine-tuned for genealogical entities
+- **Performance**: 96.84% F1 score (harmonic mean of precision and recall) across PERSON_NAME, DATE, PLACE, GENEALOGY_ID, FAMILY_GROUP entities
+- **Dual Extraction Pipeline**: Hybrid approach combining traditional regex patterns with neural network predictions
+- **Training Data Curation**: Django admin interface for manual refinement of genealogical anchor extractions
+
+### Text Processing & Data Standardization
+- **Generation-Aware Chunking**: Intelligent segmentation preserving genealogical document structure
+- **Date Standardization**: Multi-format Dutch/English date parsing ("15 maart 1654" → "1654-03-15")
+- **Genealogical ID Correction**: Systematic fixes for OCR errors in Roman numerals (IL→II, XIL→XII)
+- **Family Context Tracking**: Infers individual IDs from family group headers ("a. John" → "X.9.a")
+
+### Development Approach
+Uses Django admin interface to prototype and test business logic before building custom UI. This approach enables rapid iteration on data models and processing workflows while maintaining data quality through manual review capabilities.
+
+**Current Focus**: Optimizing OCR quality and refining neural network training data
+**Next Phase**: LLM integration for natural language queries and relationship inference
 
 ## Sample Data
 
@@ -67,15 +86,35 @@ python manage.py runserver
 
 ## Usage
 
-Upload documents via Django admin → automatic OCR processing → review extracted text and confidence scores.
+**Document Processing Workflow:**
+1. Upload documents via Django admin interface
+2. Automatic OCR processing with rotation detection and correction
+3. Intelligent text chunking with genealogical structure preservation
+4. Dual entity extraction (regex + neural network NER)
+5. Review extracted text, confidence scores, and genealogical anchors
+6. Manual curation of training data for neural network refinement
+
+**Current Capabilities:**
+- Multi-page document OCR with confidence scoring
+- Genealogical entity recognition and extraction
+- Date standardization and genealogical ID correction
+- Visual comparison of extraction methods (regex vs. neural network)
+- Manual anchor curation for gold standard training data
 
 ## Development
 
 **Quality checks:** `make quality-gate` (linting, formatting, type checking, security, tests)
 
 **Tests:** `make test`
 
-**Architecture:** Django + PostgreSQL + Celery + Redis + Tesseract OCR
+**Architecture:** Django + PostgreSQL + Celery + Redis + Tesseract OCR + OpenCV + PyTorch
+
+**Key Technologies:**
+- **Computer Vision**: OpenCV for advanced rotation detection and correction
+- **Machine Learning**: PyTorch + Transformers (BERT) for genealogical Named Entity Recognition
+- **Data Storage**: PostgreSQL with custom ArrayField handling for genealogical anchors
+- **Background Processing**: Celery with Redis for scalable document processing
+- **OCR**: Tesseract with multi-language support and confidence scoring
 
 Run `make help` to see all available development commands.