Skip to content
117 changes: 117 additions & 0 deletions algorithms/persian.sbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Persian Stemming Algorithm
* Author: https://saeiddrv.com
*/

// Enable {...} escapes inside string literals. Without this directive the
// braces below are ordinary characters, so '{U+0627}' would be the literal
// eight-character text rather than the code point U+0627, and '{alef}' would
// not expand to the stringdef.
stringescapes { }

// Letter names, listed in Persian alphabet order.
stringdef alef    '{U+0627}'
stringdef be      '{U+0628}'
stringdef pe      '{U+067E}'
stringdef te      '{U+062A}'
stringdef se      '{U+062B}'  // theh; previously duplicated sin (U+0633)
stringdef jim     '{U+062C}'
stringdef che     '{U+0686}'
// NOTE(review): `he` and `heh` are both U+0647. Prefix ("ham") and
// Suffix_Noun ("haa") use {he} for this code point, so it is kept as is.
// U+062D (hah) has no stringdef of its own - confirm whether one is wanted.
stringdef he      '{U+0647}'
stringdef khe     '{U+062E}'
stringdef dal     '{U+062F}'
stringdef zal     '{U+0630}'
stringdef re      '{U+0631}'
stringdef ze      '{U+0632}'
stringdef zhe     '{U+0698}'
stringdef sin     '{U+0633}'
stringdef shin    '{U+0634}'
stringdef sad     '{U+0635}'
stringdef zad     '{U+0636}'
stringdef ta      '{U+0637}'
stringdef za      '{U+0638}'
stringdef ain     '{U+0639}'
stringdef ghain   '{U+063A}'
stringdef fe      '{U+0641}'
stringdef ghaf    '{U+0642}'
stringdef kaf     '{U+06A9}'
stringdef gaf     '{U+06AF}'
stringdef lam     '{U+0644}'
stringdef mim     '{U+0645}'
stringdef nun     '{U+0646}'
stringdef vav     '{U+0648}'
stringdef heh     '{U+0647}'
stringdef ye      '{U+06CC}'


// Internal routines. Every name listed here is defined further down in this
// file (Post_Normalize was declared but never defined or used, so it has
// been dropped).
routines (
    Normalize
    Prefix
    Suffix_Noun
    Suffix_Verb
    Suffix_Adjective
)


// Entry point of the stemmer.
externals ( stem )


groupings ()


// Rewrite Arabic-script variant letters to the code points used by the
// stringdefs, scanning the whole word from left to right.
define Normalize as (
    do repeat (
        (
            // Each variant appears exactly once. Letters that are already in
            // their target form are not listed: they are simply skipped by
            // the `next` alternative below.
            [substring] among (
                '{U+0643}'                       ( <- '{kaf}' )  // Arabic kaf
                '{U+064A}' '{U+0626}'            ( <- '{ye}' )   // Arabic yeh, yeh with hamza above
                '{U+0629}'                       ( <- '{he}' )   // teh marbuta
                '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // alef with madda / hamza above / hamza below
                '{U+0624}'                       ( <- '{vav}' )  // waw with hamza above
            )
        )
        // No variant at the cursor: step over one character so the loop
        // keeps going instead of stopping at the first ordinary letter.
        or next
    )
)


// Remove one leading prefix, if the word starts with any of the strings
// below. All entries share the single (delete) action at the end of the list.
define Prefix as (
    [substring] among (
        '{be}{alef}{ze}'    // baaz
        '{be}{ye}'          // bii
        '{be}{re}'          // bar
        '{pe}{ye}{shin}'    // pish
        '{pe}{re}'          // por
        '{pe}{sin}'         // pas
        '{dal}{re}'         // dar
        '{ze}{ye}{re}'      // ziir
        '{sin}{re}'         // sar
        '{fe}{re}{alef}'    // faraa
        '{he}{mim}'         // ham
        '{nun}{alef}'       // naa
            (delete)
    )
)


// The suffix routines are defined in backward mode so that `substring`
// matches against the END of the word. In forward mode they would be tested
// at the start of the word and behave like extra prefix rules.
backwardmode (

    // Noun plural endings.
    define Suffix_Noun as (
        [substring] among (
            '{he}{alef}'    // haa
            '{alef}{nun}'   // aan
            '{alef}{te}'    // aat
                (delete)
        )
    )

    // Verb personal endings. The past- and present-tense lists of the
    // original overlapped ('{ye}{mim}' and '{nun}{dal}' were listed twice);
    // each string now appears once.
    // NOTE(review): '{ye}{ye}' and '{ye}{ye}{dal}' are kept from the original
    // list unchanged - confirm they are intended.
    define Suffix_Verb as (
        [substring] among (
            '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}'
            '{mim}' '{ye}' '{dal}' '{ye}{dal}'
                (delete)
        )
    )

    // Adjective degree endings.
    define Suffix_Adjective as (
        [substring] among (
            '{te}{re}'              // tar (comparative)
            '{te}{re}{ye}{nun}'     // tarin (superlative)
                (delete)
        )
    )
)

// Entry point: normalize letters, strip one prefix, then strip one suffix.
define stem as (
    do Normalize
    do Prefix
    backwards (
        // The parentheses are required: `do A or B or C` groups as
        // `(do A) or B or C`, and since `do` always succeeds the verb and
        // adjective routines would never be tried.
        do (Suffix_Noun or Suffix_Verb or Suffix_Adjective)
    )
)
62 changes: 62 additions & 0 deletions python/modules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# This file contains a list of stemmers to include in the distribution.
# The format is a set of lines of space-separated items - on each line:
# First item is name of stemmer.
# Second item is comma separated list of character sets.
# Third item is comma separated list of names to refer to the stemmer by.
#
# Lines starting with a #, or blank lines, are ignored.

# List all the main algorithms for each language, in UTF-8, and also with
# the most commonly used encoding.

arabic UTF_8 arabic,ar,ara
armenian UTF_8 armenian,hy,hye,arm
basque UTF_8,ISO_8859_1 basque,eu,eus,baq
catalan UTF_8,ISO_8859_1 catalan,ca,cat
danish UTF_8,ISO_8859_1 danish,da,dan
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
english UTF_8,ISO_8859_1 english,en,eng
estonian UTF_8 estonian,et,est
finnish UTF_8,ISO_8859_1 finnish,fi,fin
french UTF_8,ISO_8859_1 french,fr,fre,fra
german UTF_8,ISO_8859_1 german,de,ger,deu
greek UTF_8 greek,el,gre,ell
hindi UTF_8 hindi,hi,hin
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
indonesian UTF_8,ISO_8859_1 indonesian,id,ind
irish UTF_8,ISO_8859_1 irish,ga,gle
italian UTF_8,ISO_8859_1 italian,it,ita
lithuanian UTF_8 lithuanian,lt,lit
nepali UTF_8 nepali,ne,nep
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
persian UTF_8 persian,fa,fas,pers
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
romanian UTF_8 romanian,ro,rum,ron
russian UTF_8,KOI8_R russian,ru,rus
serbian UTF_8 serbian,sr,srp
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
swedish UTF_8,ISO_8859_1 swedish,sv,swe
tamil UTF_8 tamil,ta,tam
turkish UTF_8 turkish,tr,tur
yiddish UTF_8 yiddish,yi,yid

# Also include the traditional porter algorithm for english.
# The porter algorithm is included in the libstemmer distribution to assist
# with backwards compatibility, but for new systems the english algorithm
# should be used in preference.
porter UTF_8,ISO_8859_1 porter english

# Some other stemmers in the snowball project are not included in the standard
# distribution. To compile a libstemmer with them in, add them to this list,
# and regenerate the distribution. (You will need a full source checkout for
# this.) They are included in the snowball website as curiosities, but are not
# intended for general use, and use of them is not fully supported. These
# algorithms are:
#
# kraaij_pohlmann - This is a different dutch stemmer.
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
#
# lovins - This is an english stemmer, but fairly outdated, and
# only really applicable to a restricted type of input text
# (keywords in academic publications).
#lovins UTF_8,ISO_8859_1 lovins english