evz
diff --git a/‎.bandit‎
Lines changed: 3 additions & 0 deletions b/‎.bandit‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.env.example‎
Lines changed: 3 additions & 0 deletions b/‎.env.example‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 18 additions & 1 deletion b/‎Dockerfile‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎datasets/ner.tgz‎
2.62 MB b/‎datasets/ner.tgz‎
2.62 MB
diff --git a/‎datasets/ner/bin/baseline.txt‎
Lines changed: 179 additions & 0 deletions b/‎datasets/ner/bin/baseline.txt‎
Lines changed: 179 additions & 0 deletions
@@ -5,9 +5,12 @@ exclude_dirs:
   - "test"
   - ".venv"
   - ".git"
+  - "genealogy/training_tools"
 
 # Allow hardcoded SECRET_KEY in development settings
 # Allow HuggingFace downloads without revision pinning for development
+# Allow random usage in data generation scripts
 skips:
   - "B105"
   - "B615"
+  - "B311"
@@ -29,3 +29,6 @@ OLLAMA_HOST=the-area.local
 OLLAMA_PORT=11434
 OLLAMA_LLM_MODEL=aya:35b-23
 OLLAMA_EMBEDDING_MODEL=zylonai/multilingual-e5-large:latest
+
+# Document Layout Detection
+DOCLAYOUT_MODEL_PATH=/app/models/doclayout_yolo_docstructbench_imgsz1280_2501.pt
@@ -140,6 +140,7 @@ staticfiles/
 static_root/
 models/
 training_*/
+datasets/
 
 # Docker
 .dockerignore
 
@@ -17,11 +17,28 @@ RUN apt-get update && apt-get install -y \
     libxext6 \
     libxrender-dev \
     libgomp1 \
+    libopencv-dev \
+    python3-opencv \
+    libblas-dev \
+    liblapack-dev \
+    libatlas-base-dev \
+    wget \
     && rm -rf /var/lib/apt/lists/*
 
+# Install Tesseract combined models with both legacy and LSTM components for OEM 2 support
+RUN wget -O /usr/share/tesseract-ocr/5/tessdata/eng.traineddata \
+    https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata \
+    && wget -O /usr/share/tesseract-ocr/5/tessdata/nld.traineddata \
+    https://github.com/tesseract-ocr/tessdata/raw/main/nld.traineddata
+
 # Copy requirements and install Python dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python -m spacy download nl_core_news_sm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python -m spacy download en_core_web_sm
 
 # Copy project files
 COPY . .
 
@@ -0,0 +1,179 @@
+#!/usr/bin/perl -w
+# baseline: compute a baseline classification for named entities
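+# usage:    baseline [-d] [-u] [nbr] train test
+# notes:    option -d: print a debug line for every candidate phrase
+#           option -u: only classify entities with unique class in train
+#           nbr: minimum number of training occurrences of the best
+#                class that a phrase needs in order to be tagged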
+#           method used: only tag phrases present in training data
+#                        greedy search: tag longest possible phrases
+#                        train and test are supposed to be in
+#                        CoNLL-2002 format
+# url:      http://lcg-www.uia.ac.be/conll2002/ner/
+# 20020524 [email protected]
+
+use strict;
+
+# loop counters
+my ($i,$j,$k);
+# scalars shared by the training, reading and tagging phases
+my ($ambiguous,$bestCat,$bestCatNbr,$buffer,$bufferType,$debug,
+    $key,$line,$onlyUniq,$tag,$test,$train,$type,$uniqNbr,$word);
+my (@classes,@test,@words);
+my %hash;     # hash of hashes for categories of word sequences
+my %outWords; # hash of words that appear outside of entities
+
+# command line: optional -d, then optional -u, then optional number,
+# followed by exactly two file names (train and test)
+($debug,$onlyUniq,$uniqNbr) = (0,0,0);
+if (@ARGV and $ARGV[0] eq "-d") { shift(@ARGV); $debug = 1; }
+if (@ARGV and $ARGV[0] eq "-u") { shift(@ARGV); $onlyUniq = 1; }
+if (@ARGV and $ARGV[0] =~ /^[0-9]+$/) { $uniqNbr = shift(@ARGV); }
+die "usage: baseline [-u] [nbr] train test\n" unless @ARGV == 2;
+($train,$test) = splice(@ARGV,0,2);
+
+# read train file
+# Fills %hash with one counter per (phrase, entity type) pair found in
+# the training data, plus a "PREFIX" counter for every proper prefix of
+# each phrase, so that the tagger can grow candidate phrases word by word.
+
+# count one occurrence of a completed phrase and of all its prefixes
+my $storePhrase = sub {
+   my ($phrase,$phraseType) = @_;
+   my @prefix = split(/\s+/,$phrase);
+   $hash{$phrase}{$phraseType}++;
+   pop(@prefix);
+   # store all prefixes of entity in hash with tag PREFIX
+   while (@prefix) {
+      $hash{join(" ",@prefix)}{"PREFIX"}++;
+      pop(@prefix);
+   }
+};
+
+$buffer = "";
+$bufferType = "";
+%hash = ();
+# stop here when the training file is unreadable: continuing would
+# silently build an empty model and tag every test word as O
+open(INFILE,$train) or die "cannot open $train\n";
+while (<INFILE>) {
+   $line = $_;
+   chomp($line);
+   # an empty line ends a sentence: treat it as a dummy token outside
+   # of any entity so that a pending phrase gets flushed
+   $line = "-X- O" if ($line =~ /^\s*$/);
+   @words = split(/\s+/,$line);
+   $word = shift(@words); # word is first item on line
+   $tag = pop(@words);    # tag is last item on line
+   if ($tag eq "O") { $outWords{$word} = 1; }
+   # entity type is the tag without its B-/I- prefix
+   $type = $tag;
+   $type =~ s/^.*-//;
+   # if previous tagged phrase is complete
+   if ($buffer and
+       ($type eq "O" or $type ne $bufferType or $tag =~ /^B/)) {
+      $storePhrase->($buffer,$bufferType);
+      $buffer = "";
+      $bufferType = "";
+   }
+   # append current word to buffer if we are processing a tagged phrase
+   if ($tag ne "O") {
+      $buffer = $buffer ? "$buffer $word" : $word;
+      $bufferType = $bufferType ? $bufferType : $type;
+   }
+}
+# flush a phrase that runs up to the last line of the file
+if ($buffer) { $storePhrase->($buffer,$bufferType); }
+close(INFILE);
+
+# read test file
+# load every line of the test file into @test, line endings removed
+open(INFILE,$test) or die "cannot open $test\n";
+@test = <INFILE>;
+chomp(@test);
+close(INFILE);
+
+# assign entity tags to test file
+# $i indexes the line being tagged; $j counts the extra lines that belong
+# to the candidate phrase starting at line $i. Every input line is echoed
+# to standard output with an O, B-<type> or I-<type> tag appended.
+$i = 0;
+LOOP: while ($i<=$#test) {
+   # empty line: copy the sentence boundary to the output
+   if (not $test[$i]) { print "\n"; $i++; next LOOP; }
+   @words = split(/\s+/,$test[$i]);
+   # no training phrase or prefix equals the first word: tag it O
+   # (exists replaces "defined %{...}", which perl 5.22 and later
+   # reject at compile time)
+   if (not exists $hash{$words[0]}) {
+      print "$test[$i] O\n";
+      $i++;
+   } else {
+      $j = 0;
+      $buffer = "$words[0]";
+      # add words to phrase while we are in a phrase prefix and
+      # the next word exists and is not a line break
+      while (defined $hash{$buffer}{"PREFIX"} and
+             $i+$j < $#test and $test[$i+$j+1]) {
+         $j++;
+         @words = split(/\s+/,$test[$i+$j]);
+         $buffer .= " $words[0]";
+      }
+      # remove words from entity
+      @classes = defined $hash{$buffer} ? %{$hash{$buffer}}: ();
+      # note: classes always contains pairs tag/amount
+      # remove words from phrase while current phrase is nonempty and
+      # does not contain a phrase or is only a prefix; with option -u
+      # also while the phrase has more than one entity class
+      while ($buffer and
+             ($#classes < 0 or
+              ($#classes == 1 and defined $hash{$buffer}{"PREFIX"})) or
+              ($onlyUniq and
+               ($#classes > 3 or
+                ($#classes > 1 and not defined $hash{$buffer}{"PREFIX"})))) {
+         $j--;
+         @words =  split(/\s+/,$buffer);
+         pop(@words);
+         $buffer = join(" ",@words);
+         @classes = defined $hash{$buffer} ? %{$hash{$buffer}}: ();
+      }
+      if ($debug) {
+         # show phrase with possible classification and nbr of examples
+         print ">>> $#classes $buffer ";
+         # private loop variable: do not reuse the line index $i here
+         foreach my $item (@classes) { print "# $item "; }
+         print "\n";
+      }
+      # if no complete entity was found
+      if (not $buffer) {
+         print "$test[$i] O\n";
+         $i++;
+         next LOOP;
+      }
+      # get category: the class seen most often for this phrase; sorted
+      # keys make the choice between equally frequent classes stable
+      $bestCat = "UNDEF";
+      $bestCatNbr = 0;
+      foreach $key (sort keys %{$hash{$buffer}}) {
+         if ($key ne "PREFIX" and $hash{$buffer}{$key} > $bestCatNbr) {
+            $bestCatNbr = $hash{$buffer}{$key};
+            $bestCat = $key;
+         }
+      }
+      # does the phrase occur frequently enough in the training data?
+      if ($bestCatNbr < $uniqNbr) {
+         print "$test[$i] O\n";
+         $i++;
+         next LOOP;
+      }
+      # first line of the phrase gets B-, the following lines get I-
+      for ($k=$i;$k<=$i+$j;$k++) {
+         if ($k == $i) { print "$test[$k] B-$bestCat\n"; }
+         else { print "$test[$k] I-$bestCat\n"; }
+      }
+      $i += $j+1;
+   }
+}
+
+exit(0);