I have Python code that takes an Arabic word, extracts its root, and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".
This is my code:
and this is the test code:
and the output is: ب
This is my code:
# -*- coding=utf-8 -*-
# Helpers that strip diacritics/tatweel from an Arabic word, normalize its
# spelling variants and, on request, reduce it to a root via a light stemmer.
import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum

# Search modes; normalize_text() compares its ``searchtype`` argument against
# a member's ``.index``.
# NOTE(review): the positional-names call and the ``.index`` attribute look
# like the legacy PyPI ``enum`` package API rather than the standard-library
# ``enum`` module -- confirm which one is installed.
search_type = Enum('unvoc_word', 'voc_word', 'root_word')

# Character-class patterns built from letter constants that are expected to
# come from the wildcard imports above.  Plain u"" literals are used instead
# of ur"" because the patterns contain no backslashes, so the text is the same
# and the source also parses on Python 3.
HARAKAT_pat = re.compile(
    u"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA,
                     SUKUN, SHADDA]) + u"]")
HAMZAT_pat = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")
ALEFAT_pat = re.compile(
    u"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
                     HAMZA_ABOVE, HAMZA_BELOW]) + u"]")
LAMALEFAT_pat = re.compile(
    u"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
                     LAM_ALEF_MADDA_ABOVE]) + u"]")


#--------------------------------------
def strip_tashkeel(w):
    "Return w with every character matched by HARAKAT_pat removed."
    return HARAKAT_pat.sub('', w)


#--------------------------------------
def strip_tatweel(w):
    "Return w with every TATWEEL character removed."
    return re.sub(u'[%s]' % TATWEEL, '', w)


#--------------------------------------
def normalize_hamza(w):
    "Replace ALEFAT_pat matches with ALEF, then HAMZAT_pat matches with HAMZA."
    w = ALEFAT_pat.sub(ALEF, w)
    return HAMZAT_pat.sub(HAMZA, w)


#--------------------------------------
def normalize_lamalef(w):
    "Replace each character matched by LAMALEFAT_pat with LAM followed by ALEF."
    return LAMALEFAT_pat.sub(u'%s%s' % (LAM, ALEF), w)


#--------------------------------------
def normalize_spellerrors(w):
    "Replace TEH_MARBUTA with HEH, then ALEF_MAKSURA with YEH."
    w = re.sub(u'[%s]' % TEH_MARBUTA, HEH, w)
    return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, w)


def normalize_text(word, searchtype):
    """Normalize word and, for a root search, reduce it to its root.

    The word goes through strip_tashkeel, strip_tatweel, normalize_lamalef,
    normalize_hamza and normalize_spellerrors, in that order.  When
    searchtype equals search_type.root_word.index the normalized word is
    passed to an ArabicLightStemmer and replaced by the value of its
    get_root().  The result is printed and then returned.
    """
    word = strip_tashkeel(word)
    word = strip_tatweel(word)
    word = normalize_lamalef(word)
    word = normalize_hamza(word)
    word = normalize_spellerrors(word)
    if searchtype == search_type.root_word.index:
        ArListem = ArabicLightStemmer()
        # The stemmer is run for its effect on the stemmer object; the value
        # it returns was never used, so it is no longer bound to a local.
        # NOTE(review): the method name is kept exactly as written; confirm
        # it matches the installed Tashaphyne API.
        ArListem.lightStm(word)
        # NOTE(review): the root is taken from the stemmer unchanged, so any
        # over-stripping (e.g. a one-letter root) presumably originates in
        # the stemmer's affix handling -- verify against its documentation.
        word = ArListem.get_root()
    # NOTE(review): the original line layout was lost; this print is assumed
    # to sit at function level rather than inside the ``if`` above.
    print(word)
    return word
#---------------------------------------------
and this is the test code:
from task import normalize_text

# 2 is presumably search_type.root_word.index (TODO confirm); the return
# value is discarded here.
normalize_text(u'كاتب', 2)
and the output is: ب