-
Notifications
You must be signed in to change notification settings - Fork 187
create persian.sbl #194
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
saeiddrv
wants to merge
16
commits into
snowballstem:master
Choose a base branch
from
saeiddrv:add-persian-stemmer
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
create persian.sbl #194
Changes from 1 commit
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
44057d0
create persian.sbl
f8cfec9
fix errors
0248279
add exception section
90c083c
fix exception section
40f59b3
add Suffix_Normalize
bfc5974
fix modules.txt
27975e6
define arabic characters
be3c55d
fix language code
saeiddrv 84ac8a4
fix using next statement
saeiddrv 1645e2d
add more exceptions
saeiddrv bb9b69f
improve steps
saeiddrv 9aad969
fix Normalize_Nouns
saeiddrv 24d0231
fix Exceptions
saeiddrv a1e92cc
update header comment
saeiddrv f536e2e
Add explanatory comment for U200C half-space usage in Persian
saeiddrv e5e980d
update persian algorithm based on the HPS
saeiddrv File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
/* | ||
* Persian Stemming Algorithm | ||
* Author: https://saeiddrv.com | ||
*/ | ||
|
||
// Named code points for the letters referenced by the routines below. | ||
stringdef alef '{U+0627}' | ||
stringdef be '{U+0628}' | ||
stringdef pe '{U+067E}' | ||
stringdef te '{U+062A}' | ||
stringdef se '{U+062B}' // THEH; U+0633 is SEEN and is already named `sin` | ||
stringdef jim '{U+062C}' | ||
stringdef che '{U+0686}' | ||
// NOTE(review): `he` and `heh` are the same code point (U+0647). Prefix | ||
// ("ham") and Suffix_Noun ("haa") use `he` with this value, so it is kept; | ||
// confirm whether `he` was meant to be U+062D before changing it. | ||
stringdef he '{U+0647}' | ||
stringdef khe '{U+062E}' | ||
stringdef dal '{U+062F}' | ||
stringdef zal '{U+0630}' | ||
stringdef re '{U+0631}' | ||
stringdef ze '{U+0632}' | ||
stringdef zhe '{U+0698}' | ||
stringdef sin '{U+0633}' | ||
stringdef shin '{U+0634}' | ||
stringdef sad '{U+0635}' | ||
stringdef zad '{U+0636}' | ||
stringdef ta '{U+0637}' | ||
stringdef za '{U+0638}' | ||
stringdef ain '{U+0639}' | ||
stringdef ghain '{U+063A}' | ||
stringdef fe '{U+0641}' | ||
stringdef ghaf '{U+0642}' | ||
stringdef kaf '{U+06A9}' | ||
stringdef gaf '{U+06AF}' | ||
stringdef lam '{U+0644}' | ||
stringdef mim '{U+0645}' | ||
stringdef nun '{U+0646}' | ||
stringdef vav '{U+0648}' | ||
stringdef heh '{U+0647}' | ||
stringdef ye '{U+06CC}' | ||
|
||
|
||
// Every routine named here must have a matching `define` in this file. | ||
// Post_Normalize was declared without a definition, so it is not listed. | ||
routines ( | ||
    Normalize | ||
    Prefix | ||
    Suffix_Noun | ||
    Suffix_Verb | ||
    Suffix_Adjective | ||
) | ||
|
||
|
||
externals ( stem ) | ||
|
||
|
||
groupings () | ||
|
||
|
||
// Rewrite Arabic-script variant letters to the forms the other routines | ||
// match on. Scans the whole word: at each position either a variant is | ||
// replaced, or `next` steps over one character; `repeat` ends when `next` | ||
// fails at the end of the word, and `do` then restores the cursor. | ||
define Normalize as ( | ||
    do repeat ( | ||
        ( | ||
            [substring] among ( | ||
                // Only variants are listed. A string may appear once per | ||
                // among, and letters already in target form need no rule. | ||
                '{U+0643}' ( <- '{kaf}' ) // Arabic Kaf -> Kaf | ||
                '{U+064A}' '{U+0626}' ( <- '{ye}' ) // Arabic Ye, Ye with Hamza above -> Ye | ||
                '{U+0629}' ( <- '{heh}' ) // Teh Marbuta -> Heh | ||
                '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Alef variants -> Alef | ||
                '{U+0624}' ( <- '{vav}' ) // Waw with Hamza above -> Waw | ||
            ) | ||
        ) or next | ||
    ) | ||
) | ||
|
||
|
||
// Remove one leading prefix, if the word starts with any of those listed. | ||
// `substring` picks the longest matching string; all share one action. | ||
define Prefix as ( | ||
    [substring] among ( | ||
        '{be}{alef}{ze}' // baaz | ||
        '{be}{ye}' // bii | ||
        '{be}{re}' // bar | ||
        '{pe}{ye}{shin}' // pish | ||
        '{pe}{re}' // por | ||
        '{pe}{sin}' // pas | ||
        '{dal}{re}' // dar | ||
        '{ze}{ye}{re}' // ziir | ||
        '{sin}{re}' // sar | ||
        '{fe}{re}{alef}' // faraa | ||
        '{he}{mim}' // ham | ||
        '{nun}{alef}' // naa | ||
        (delete) | ||
    ) | ||
) | ||
|
||
|
||
// The suffix routines examine the END of the word, so they are defined in | ||
// backward mode and are only called from inside `backwards` in `stem`. | ||
backwardmode ( | ||
    // Remove a plural ending. | ||
    define Suffix_Noun as ( | ||
        [substring] among ( | ||
            '{he}{alef}' (delete) // haa | ||
            '{alef}{nun}' (delete) // aan | ||
            '{alef}{te}' (delete) // aat | ||
        ) | ||
    ) | ||
    // Remove a personal verb ending. Each string is listed once, because | ||
    // among does not accept repeated strings; the past and present lists | ||
    // shared '{ye}{mim}' and '{nun}{dal}'. | ||
    // NOTE(review): '{ye}{ye}' and '{ye}{ye}{dal}' are kept from the | ||
    // original past-tense list; confirm they are intended. | ||
    define Suffix_Verb as ( | ||
        [substring] among ( | ||
            '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}' | ||
            '{mim}' '{ye}' '{dal}' '{ye}{dal}' | ||
            (delete) | ||
        ) | ||
    ) | ||
    // Remove a comparative or superlative ending. | ||
    define Suffix_Adjective as ( | ||
        [substring] among ( | ||
            '{te}{re}' (delete) // tar (comparative) | ||
            '{te}{re}{ye}{nun}' (delete) // tarin (superlative) | ||
        ) | ||
    ) | ||
) | ||
// Entry point: normalize letters, strip one prefix, then strip one suffix. | ||
define stem as ( | ||
    do Normalize | ||
    do Prefix | ||
    backwards ( | ||
        // Parenthesized so `do` covers the whole alternation; `do X or Y` | ||
        // binds as `(do X) or Y`, and `do` always succeeds. | ||
        do ( Suffix_Noun or Suffix_Verb or Suffix_Adjective ) | ||
    ) | ||
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# This file contains a list of stemmers to include in the distribution. | ||
# The format is a set of space separated lines - on each line: | ||
# First item is name of stemmer. | ||
# Second item is comma separated list of character sets. | ||
# Third item is comma separated list of names to refer to the stemmer by. | ||
# | ||
# Lines starting with a #, or blank lines, are ignored. | ||
|
||
# List all the main algorithms for each language, in UTF-8, and also with | ||
# the most commonly used encoding. | ||
|
||
arabic UTF_8 arabic,ar,ara | ||
armenian UTF_8 armenian,hy,hye,arm | ||
basque UTF_8,ISO_8859_1 basque,eu,eus,baq | ||
catalan UTF_8,ISO_8859_1 catalan,ca,cat | ||
danish UTF_8,ISO_8859_1 danish,da,dan | ||
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld | ||
english UTF_8,ISO_8859_1 english,en,eng | ||
estonian UTF_8 estonian,et,est | ||
finnish UTF_8,ISO_8859_1 finnish,fi,fin | ||
french UTF_8,ISO_8859_1 french,fr,fre,fra | ||
german UTF_8,ISO_8859_1 german,de,ger,deu | ||
greek UTF_8 greek,el,gre,ell | ||
hindi UTF_8 hindi,hi,hin | ||
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun | ||
indonesian UTF_8,ISO_8859_1 indonesian,id,ind | ||
irish UTF_8,ISO_8859_1 irish,ga,gle | ||
italian UTF_8,ISO_8859_1 italian,it,ita | ||
lithuanian UTF_8 lithuanian,lt,lit | ||
nepali UTF_8 nepali,ne,nep | ||
norwegian UTF_8,ISO_8859_1 norwegian,no,nor | ||
persian UTF_8 persian,fa,fas,per | ||
ojwb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
portuguese UTF_8,ISO_8859_1 portuguese,pt,por | ||
romanian UTF_8 romanian,ro,rum,ron | ||
russian UTF_8,KOI8_R russian,ru,rus | ||
serbian UTF_8 serbian,sr,srp | ||
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa | ||
swedish UTF_8,ISO_8859_1 swedish,sv,swe | ||
tamil UTF_8 tamil,ta,tam | ||
turkish UTF_8 turkish,tr,tur | ||
yiddish UTF_8 yiddish,yi,yid | ||
|
||
# Also include the traditional porter algorithm for english. | ||
# The porter algorithm is included in the libstemmer distribution to assist | ||
# with backwards compatibility, but for new systems the english algorithm | ||
# should be used in preference. | ||
porter UTF_8,ISO_8859_1 porter english | ||
|
||
# Some other stemmers in the snowball project are not included in the standard | ||
# distribution. To compile a libstemmer with them in, add them to this list, | ||
# and regenerate the distribution. (You will need a full source checkout for | ||
# this.) They are included in the snowball website as curiosities, but are not | ||
# intended for general use, and use of them is not fully supported. These | ||
# algorithms are: | ||
# | ||
# kraaij_pohlmann - This is a different dutch stemmer. | ||
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch | ||
# | ||
# lovins - This is an english stemmer, but fairly outdated, and | ||
# only really applicable to a restricted type of input text | ||
# (keywords in academic publications). | ||
#lovins UTF_8,ISO_8859_1 lovins english |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.