Skip to content

Commit c9c2b18

Browse files
committed
OCR working good. NER model training ready
1 parent 0c4e425 commit c9c2b18

File tree

82 files changed

+342679
-632
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+342679
-632
lines changed

.bandit

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ exclude_dirs:
55
- "test"
66
- ".venv"
77
- ".git"
8+
- "genealogy/training_tools"
89

910
# Allow hardcoded SECRET_KEY in development settings
1011
# Allow HuggingFace downloads without revision pinning for development
12+
# Allow random usage in data generation scripts
1113
skips:
1214
- "B105"
1315
- "B615"
16+
- "B311"

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,6 @@ OLLAMA_HOST=the-area.local
2929
OLLAMA_PORT=11434
3030
OLLAMA_LLM_MODEL=aya:35b-23
3131
OLLAMA_EMBEDDING_MODEL=zylonai/multilingual-e5-large:latest
32+
33+
# Document Layout Detection
34+
DOCLAYOUT_MODEL_PATH=/app/models/doclayout_yolo_docstructbench_imgsz1280_2501.pt

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ staticfiles/
140140
static_root/
141141
models/
142142
training_*/
143+
datasets/
143144

144145
# Docker
145146
.dockerignore

Dockerfile

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,28 @@ RUN apt-get update && apt-get install -y \
1717
libxext6 \
1818
libxrender-dev \
1919
libgomp1 \
20+
libopencv-dev \
21+
python3-opencv \
22+
libblas-dev \
23+
liblapack-dev \
24+
libatlas-base-dev \
25+
wget \
2026
&& rm -rf /var/lib/apt/lists/*
2127

28+
# Install Tesseract combined models with both legacy and LSTM components for OEM 2 support
29+
RUN wget -O /usr/share/tesseract-ocr/5/tessdata/eng.traineddata \
30+
https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata \
31+
&& wget -O /usr/share/tesseract-ocr/5/tessdata/nld.traineddata \
32+
https://github.com/tesseract-ocr/tessdata/raw/main/nld.traineddata
33+
2234
# Copy requirements and install Python dependencies
2335
COPY requirements.txt .
24-
RUN pip install --no-cache-dir -r requirements.txt
36+
RUN --mount=type=cache,target=/root/.cache/pip \
37+
pip install -r requirements.txt
38+
RUN --mount=type=cache,target=/root/.cache/pip \
39+
python -m spacy download nl_core_news_sm
40+
RUN --mount=type=cache,target=/root/.cache/pip \
41+
python -m spacy download en_core_web_sm
2542

2643
# Copy project files
2744
COPY . .

datasets/ner.tgz

2.62 MB
Binary file not shown.

datasets/ner/bin/baseline.txt

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/perl -w
2+
# baseline: compute a baseline classification for named entities
3+
# usage: baseline [-u] [nbr] train test
4+
# notes: option -u: only classify entities with unique class in train
5+
# method used: only tag phrases present in training data
6+
# greedy search: tag longest possible phrases
7+
# train and test are supposed to be in
8+
# CoNLL-2002 format
9+
# url: http://lcg-www.uia.ac.be/conll2002/ner/
10+
11+
12+
use strict;
13+
14+
my (
15+
$i,$j,$k,
16+
$ambiguous,$bestCat,$bestCatNbr,$buffer,$bufferType,$debug,
17+
$key,$line,$onlyUniq,$tag,$test,$train,$type,$uniqNbr,$word,
18+
@classes,@test,@words,
19+
%hash, # hash of hashes for categories of word sequences
20+
%outWords # hash of words that appear outside of entities
21+
);
22+
23+
$onlyUniq = 0;
24+
$uniqNbr = 0;
25+
$debug = 0;
26+
if (defined $ARGV[0] and $ARGV[0] eq "-d") {
27+
$debug = 1;
28+
shift(@ARGV);
29+
}
30+
if (defined $ARGV[0] and $ARGV[0] eq "-u") {
31+
$onlyUniq = 1;
32+
shift(@ARGV);
33+
}
34+
if (defined $ARGV[0] and $ARGV[0] =~ /^[0-9]+$/) {
35+
$uniqNbr = shift(@ARGV);
36+
}
37+
if ($#ARGV != 1) { die "usage: baseline [-u] [nbr] train test\n"; }
38+
$train = shift(@ARGV);
39+
$test = shift(@ARGV);
40+
41+
# read train file
42+
$buffer = "";
43+
$bufferType = "";
44+
%hash = ();
45+
open(INFILE,$train);
46+
while (<INFILE>) {
47+
$line = $_;
48+
chomp($line);
49+
$line = "-X- O" if ($line =~ /^\s*$/);
50+
@words = split(/\s+/,$line);
51+
$word = shift(@words); # word is first item on line
52+
$tag = pop(@words); # tag is last item on line
53+
if ($tag eq "O") { $outWords{$word} = 1; }
54+
$type = $tag;
55+
$type =~ s/^.*-//;
56+
# if previous tagged phrase is complete
57+
if ($buffer and
58+
($type eq "O" or $type ne $bufferType or $tag =~ /^B/)) {
59+
if (not defined $hash{$buffer}{$bufferType}) {
60+
$hash{$buffer}{$bufferType} = 1;
61+
} else { $hash{$buffer}{$bufferType}++; }
62+
@words = split(/\s+/,$buffer);
63+
pop(@words);
64+
# store all prefixes of entity in hash with tag PREFIX
65+
while (@words) {
66+
$line = join(" ",@words);
67+
if (not defined $hash{$line}{"PREFIX"}) {
68+
$hash{$line}{"PREFIX"} = 1;
69+
} else { $hash{$line}{"PREFIX"}++; }
70+
pop(@words);
71+
}
72+
$buffer = "";
73+
$bufferType = "";
74+
}
75+
# append current word to buffer if we are processing a tagged phrase
76+
if ($tag ne "O") {
77+
$buffer = $buffer ? "$buffer $word" : $word;
78+
$bufferType = $bufferType ? $bufferType : $type;
79+
}
80+
}
81+
if ($buffer) {
82+
if (not defined $hash{$buffer}{$bufferType}) {
83+
$hash{$buffer}{$bufferType} = 1;
84+
} else { $hash{$buffer}{$bufferType}++; }
85+
@words = split(/\s+/,$buffer);
86+
pop(@words);
87+
# store all prefixes of entity in hash with tag PREFIX
88+
while (@words) {
89+
$line = join(" ",@words);
90+
if (not defined $hash{$line}{"PREFIX"}) {
91+
$hash{$line}{"PREFIX"} = 1;
92+
} else { $hash{$line}{"PREFIX"}++; }
93+
pop(@words);
94+
}
95+
}
96+
close(INFILE);
97+
98+
# read test file
99+
@test = ();
100+
open(INFILE,$test) or die "cannot open $test\n";
101+
while (<INFILE>) {
102+
$line = $_;
103+
chomp($line);
104+
push(@test,$line);
105+
}
106+
close(INFILE);
107+
108+
# assign entity tags to test file
109+
$i = 0;
110+
LOOP: while ($i<=$#test) {
111+
if (not $test[$i]) { print "\n"; $i++; next LOOP; }
112+
@words = split(/\s+/,$test[$i]);
113+
if (not defined %{$hash{$words[0]}}) {
114+
print "$test[$i] O\n";
115+
$i++;
116+
} else {
117+
$j = 0;
118+
$buffer = "$words[0]";
119+
# add words to phrase while we are in a phrase prefix and
120+
# the next word exists and is not a line break
121+
while (defined $hash{$buffer}{"PREFIX"} and
122+
$i+$j < $#test and $test[$i+$j+1]) {
123+
$j++;
124+
@words = split(/\s+/,$test[$i+$j]);
125+
$buffer .= " $words[0]";
126+
}
127+
# remove words from entity
128+
@classes = defined $hash{$buffer} ? %{$hash{$buffer}}: ();
129+
# note: classes always contains pairs tag/amount
130+
# remove words from phrase while current phrase is nonempty and
131+
# does not contain a phrase or is only a prefix
132+
while ($buffer and
133+
($#classes < 0 or
134+
($#classes == 1 and defined $hash{$buffer}{"PREFIX"})) or
135+
($onlyUniq and
136+
($#classes > 3 or
137+
($#classes > 1 and not defined $hash{$buffer}{"PREFIX"})))) {
138+
$j--;
139+
@words = split(/\s+/,$buffer);
140+
pop(@words);
141+
$buffer = join(" ",@words);
142+
@classes = defined $hash{$buffer} ? %{$hash{$buffer}}: ();
143+
}
144+
if ($debug) {
145+
# show phrase with possible classification and nbr of examples
146+
print ">>> $#classes $buffer ";
147+
foreach $i (@classes) { print "# $i "; }
148+
print "\n";
149+
}
150+
# if no complete entity was found
151+
if (not $buffer) {
152+
print "$test[$i] O\n";
153+
$i++;
154+
next LOOP;
155+
}
156+
# get category
157+
$bestCat = "UNDEF";
158+
$bestCatNbr = 0;
159+
foreach $key (sort keys %{$hash{$buffer}}) {
160+
if ($key ne "PREFIX" and $hash{$buffer}{$key} > $bestCatNbr) {
161+
$bestCatNbr = $hash{$buffer}{$key};
162+
$bestCat = $key;
163+
}
164+
}
165+
# does the phrase occur frequently enough in the training data?
166+
if ($bestCatNbr < $uniqNbr) {
167+
print "$test[$i] O\n";
168+
$i++;
169+
next LOOP;
170+
}
171+
for ($k=$i;$k<=$i+$j;$k++) {
172+
if ($k == $i) { print "$test[$k] B-$bestCat\n"; }
173+
else { print "$test[$k] I-$bestCat\n"; }
174+
}
175+
$i += $j+1;
176+
}
177+
}
178+
179+
exit(0);

0 commit comments

Comments
 (0)