snowballstem · saeiddrv · Feb 21, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 27, 2024
diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
@@ -0,0 +1,117 @@
+/*
+ * Persian Stemming Algorithm
+ * Author: https://saeiddrv.com
+*/
+// Letter names used to build the rule strings; each expands to one code point.
+stringdef alef    '{U+0627}'  // ALEF
+stringdef be      '{U+0628}'  // BEH
+stringdef pe      '{U+067E}'  // PEH
+stringdef te      '{U+062A}'  // TEH
+stringdef se      '{U+062B}'  // THEH (distinct from 'sin', which is U+0633)
+stringdef jim     '{U+062C}'  // JEEM
+stringdef che     '{U+0686}'  // TCHEH
+stringdef he      '{U+0647}'  // HEH, same as 'heh'; NOTE(review): HAH (U+062D) has no name here
+stringdef khe     '{U+062E}'  // KHAH
+stringdef dal     '{U+062F}'  // DAL
+stringdef zal     '{U+0630}'  // THAL
+stringdef re      '{U+0631}'  // REH
+stringdef ze      '{U+0632}'  // ZAIN
+stringdef zhe     '{U+0698}'  // JEH
+stringdef sin     '{U+0633}'  // SEEN
+stringdef shin    '{U+0634}'  // SHEEN
+stringdef sad     '{U+0635}'  // SAD
+stringdef zad     '{U+0636}'  // DAD
+stringdef ta      '{U+0637}'  // TAH
+stringdef za      '{U+0638}'  // ZAH
+stringdef ain     '{U+0639}'  // AIN
+stringdef ghain   '{U+063A}'  // GHAIN
+stringdef fe      '{U+0641}'  // FEH
+stringdef ghaf    '{U+0642}'  // QAF
+stringdef kaf     '{U+06A9}'  // KEHEH (Persian form of kaf)
+stringdef gaf     '{U+06AF}'  // GAF
+stringdef lam     '{U+0644}'  // LAM
+stringdef mim     '{U+0645}'  // MEEM
+stringdef nun     '{U+0646}'  // NOON
+stringdef vav     '{U+0648}'  // WAW
+stringdef heh     '{U+0647}'  // HEH
+stringdef ye      '{U+06CC}'  // FARSI YEH
+
+
+// Internal routines; each name listed here has a matching `define` in this file.
+routines (
+    Normalize
+    Prefix
+    Suffix_Noun
+    Suffix_Verb
+    Suffix_Adjective
+)
+
+
+externals ( stem )  // the routine exported to callers of the generated stemmer
+
+
+groupings ()  // empty: no character groupings are declared
+
+// Rewrite Arabic code points to their Persian forms across the whole word.
+define Normalize as (
+    do repeat (
+        (
+            [substring] among (
+                '{U+0643}' ( <- '{kaf}' )  // ARABIC KAF -> KEHEH
+                '{U+064A}' '{U+0626}' ( <- '{ye}' )  // YEH, YEH WITH HAMZA ABOVE -> FARSI YEH
+                '{U+0629}' ( <- '{he}' )  // TEH MARBUTA -> HEH
+                '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' )  // ALEF with madda/hamza -> ALEF
+                '{U+0624}' ( <- '{vav}' )  // WAW WITH HAMZA ABOVE -> WAW
+            )
+        ) or next  // no rewrite here: step over one character so the scan reaches the end
+    )
+)
+
+// Strip one leading prefix, but only if at least two letters would remain.
+define Prefix as (
+    [substring] among (
+        '{be}{alef}{ze}'  // baaz
+        '{be}{ye}'        // bii
+        '{be}{re}'        // bar
+        '{pe}{ye}{shin}'  // pish
+        '{pe}{re}'        // por
+        '{pe}{sin}'       // pas
+        '{dal}{re}'       // dar
+        '{ze}{ye}{re}'    // ziir
+        '{sin}{re}'       // sar
+        '{fe}{re}{alef}'  // faraa
+        '{he}{mim}'       // ham
+        '{nun}{alef}' ( test hop 2 delete )  // naa; this action is shared by every string above
+    )
+)
+
+// Strip a plural ending; `backwards` makes the match start at the END of the word.
+define Suffix_Noun as backwards (
+    [substring] among (
+        '{he}{alef}'   // haa
+        '{alef}{nun}'  // aan
+        '{alef}{te}' ( test hop 2 delete )  // aat; shared action, needs >= 2 letters before the ending
+    )
+)
+// Strip a personal verb ending; `backwards` makes the match start at the END of the word.
+define Suffix_Verb as backwards (
+    [substring] among (
+        // Past tense endings
+        '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}'
+        // Present tense endings not already listed above (each string may appear only once)
+        '{mim}' '{ye}' '{dal}' '{ye}{dal}' ( test hop 2 delete )  // shared action, keeps >= 2 letters
+    )
+)
+// Strip a comparative or superlative ending from the END of the word (backward mode).
+define Suffix_Adjective as backwards (
+    [substring] among (
+        '{te}{re}'  // tar (comparative)
+        '{te}{re}{ye}{nun}' ( test hop 2 delete )  // tarin (superlative); shared action, keeps >= 2 letters
+    )
+)
+// Entry point: normalise, strip one prefix, then apply the first suffix routine that succeeds.
+define stem as (
+    do Normalize
+    do Prefix
+    do ( Suffix_Noun or Suffix_Verb or Suffix_Adjective )  // parenthesised so `do` covers the whole `or` chain
+)
diff --git a/python/modules.txt b/python/modules.txt
@@ -0,0 +1,62 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8, and also with
+# the most commonly used encoding.
+
+arabic          UTF_8                   arabic,ar,ara
+armenian        UTF_8                   armenian,hy,hye,arm
+basque          UTF_8,ISO_8859_1        basque,eu,eus,baq
+catalan         UTF_8,ISO_8859_1        catalan,ca,cat
+danish          UTF_8,ISO_8859_1        danish,da,dan
+dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
+english         UTF_8,ISO_8859_1        english,en,eng
+estonian        UTF_8                   estonian,et,est
+finnish         UTF_8,ISO_8859_1        finnish,fi,fin
+french          UTF_8,ISO_8859_1        french,fr,fre,fra
+german          UTF_8,ISO_8859_1        german,de,ger,deu
+greek           UTF_8                   greek,el,gre,ell
+hindi           UTF_8                   hindi,hi,hin
+hungarian       UTF_8,ISO_8859_2        hungarian,hu,hun
+indonesian      UTF_8,ISO_8859_1        indonesian,id,ind
+irish           UTF_8,ISO_8859_1        irish,ga,gle
+italian         UTF_8,ISO_8859_1        italian,it,ita
+lithuanian      UTF_8                   lithuanian,lt,lit
+nepali          UTF_8                   nepali,ne,nep
+norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
+persian         UTF_8                   persian,fa,per,fas
+portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
+romanian        UTF_8                   romanian,ro,rum,ron
+russian         UTF_8,KOI8_R            russian,ru,rus
+serbian         UTF_8                   serbian,sr,srp
+spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
+swedish         UTF_8,ISO_8859_1        swedish,sv,swe
+tamil           UTF_8                   tamil,ta,tam
+turkish         UTF_8                   turkish,tr,tur
+yiddish         UTF_8                   yiddish,yi,yid
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8,ISO_8859_1        porter			english
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is not fully supported.  These
+# algorithms are:
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann	dutch
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8,ISO_8859_1        lovins		english