
    kh                     	   d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 e
efZ	 ej                  j                  ej                  j                  e            Zd\  ZZZZZZZZdodZdodZeZeZd	 Z G d
 de      Z  G d de!      Z"dZ#d\  Z$Z%Z&Z'Z(Z)Z*Z+Z,Z-Z.Z/Z0Z1d Z2 ejf                  d      Z4dxZ5Z6 e7d      xZ8Z9 ejf                  d      Z: ejf                  d      Z; ejf                  ddjy                  d      z   dz         Z= e7d       e7d       e7d       e7d       e7d       e7d       e7d        e7d!       e7d"      d#	Z>e>j                         D  cg c]8  } | D ]1  }d$jy                  |D cg c]  } ej                  |       c}      3 : c}}} ZA ejf                  d%djy                  eA      z        ZA ejf                  d&      ZBd'd(d)d*d+d,d-d.ZCd/ZDe5e8eCd0fd1ZEdpd2ZF G d3 d4e       ZG G d5 d6      ZH G d7 d8e"eH      ZI G d9 d:e"eH      ZJ ejf                  d;      ZK ejf                  d<      ZL ejf                  d=      ZM G d> d?e eH      ZNd@ZOdAZPdB\  Z$Z%ZQZR ejf                  dC      ZSdD ZT G dE dFeU      ZV G dG dHe       ZW ejf                  dI      ZXdqdJZY	 	 	 	 	 	 	 	 drdKZZdLZ[dMZ\dNZ]dOZ^dPZ_dQ ejf                  dRe\z   dSz   e_z   dz   e^z   dTz   e\z   dUz         fdV ejf                  dWe_z   dTz   e]z   dXz         fdV ejf                  dY      fdZ ejf                  d[      fd\ ejf                  d]e_z   dz   e^z   dTz   e^z   dUz         fd^ ejf                  dRe_z   d_z         fgdQ ejf                  dRe\z   dSz   e_z   dz   e^z   dTz   e\z   d`z   e_z   dz   e^z   daz         fdV ejf                  dWe_z   dTz   e]z   d`z   e_z   dbz         fdV ejf                  dY      fdZ ejf                  d[      fd\ ejf                  d]e_z   dz   e^z   dTz   e^z   dUz         fd^ ejf                  dRe_z   d_z         fggZ`e`d   j                  dce`d   j                  dd             e`dc   j                  dce`dc   j                  dd             dsdeZcdf ZddgxZeZf G dh di      ZgdjZh G dk dle
      Zi G dm dne       Zjy#  dZY ]xY wc c}w c c}}} w )tznThis file is adapted from the pattern library.

URL: http://www.clips.ua.ac.be/pages/pattern-web
Licence: BSD
    N)chain)ElementTree )&slash;wordpart-of-speechchunkprepositionrelationanchorlemmac                     t        |t              r|ffdz   }t        | t              r|D ]  }	  | j                  | c S  | S t	        |       S #  Y 'xY w)z:Returns the given value as a Unicode string (if possible).)zwindows-1252)utf-8ignore)
isinstance
basestringbytesdecodestrvencodinges      J/opt/mcp/mcp-sentiment/venv/lib/python3.12/site-packages/textblob/_text.pydecode_stringr   "   sc    (J'K>$LL!U 	Aqxx|#	
 q6M   AAc                     t        |t              r|ffdz   }t        | t              r|D ]  }	  | j                  | c S  | S t        |       S #  Y 'xY w)z>Returns the given value as a Python byte string (if possible).r   )r   r   r   encoder   s      r   encode_stringr    0   sc    (J'K>$LL!S 	Aqxx|#	
 q6Mr   c                 :    	 t        |        y# t        $ r Y yw xY w)NFT)float
ValueError)strgs    r   	isnumericr%   B   s'    d   s    	c                   l    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zy)lazydictc                      y N selfs    r   loadzlazydict.loadP        	    c           
          t         j                  |       dk(  r?| j                          t        | |t	        j
                  t        t         |      |               t        t         |      | g| S )zIf the dictionary is empty, calls lazydict.load().
        Replaces lazydict.method() with dict.method() and calls it.
        r   )dict__len__r-   setattrtypes
MethodTypegetattrr,   methodargss      r   _lazyzlazydict._lazyU   W     <<"IIKD&%"2"2743H$"OP$wtV$T1D11r/   c                 $    | j                  d      S N__repr__r:   r+   s    r   r>   zlazydict.__repr__^       zz*%%r/   c                 $    | j                  d      S Nr2   r?   r+   s    r   r2   zlazydict.__len__a       zz)$$r/   c                 $    | j                  d      S N__iter__r?   r+   s    r   rF   zlazydict.__iter__d   r@   r/   c                 (     | j                   dg| S N__contains__r?   r,   r9   s     r   rI   zlazydict.__contains__g       tzz.0400r/   c                 (     | j                   dg| S )N__getitem__r?   rJ   s     r   rM   zlazydict.__getitem__j       tzz-/$//r/   c                 (     | j                   dg| S )N__setitem__r?   rJ   s     r   rP   zlazydict.__setitem__m   rN   r/   c                 (     | j                   dg| S )N
setdefaultr?   rJ   s     r   rR   zlazydict.setdefaultp   s    tzz,...r/   c                 (     | j                   dg| S )Ngetr?   r,   r9   kwargss      r   rT   zlazydict.gets       tzz%'$''r/   c                 $    | j                  d      S )Nitemsr?   r+   s    r   rY   zlazydict.itemsv   s    zz'""r/   c                 $    | j                  d      S )Nkeysr?   r+   s    r   r[   zlazydict.keysy   s    zz&!!r/   c                 $    | j                  d      S )Nvaluesr?   r+   s    r   r]   zlazydict.values|   s    zz(##r/   c                 (     | j                   dg| S )Nupdater?   rJ   s     r   r_   zlazydict.update       tzz(*T**r/   c                 (     | j                   dg| S Npopr?   rJ   s     r   rc   zlazydict.pop   rW   r/   c                 (     | j                   dg| S )Npopitemr?   rJ   s     r   re   zlazydict.popitem   s    tzz)+d++r/   N)__name__
__module____qualname__r-   r:   r>   r2   rF   rI   rM   rP   rR   rT   rY   r[   r]   r_   rc   re   r*   r/   r   r'   r'   O   sR    
2&%&100/(#"$+(,r/   r'   c                   N    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zy)lazylistc                      y r)   r*   r+   s    r   r-   zlazylist.load   r.   r/   c           
          t         j                  |       dk(  r?| j                          t        | |t	        j
                  t        t         |      |               t        t         |      | g| S )zyIf the list is empty, calls lazylist.load().
        Replaces lazylist.method() with list.method() and calls it.
        r   )listr2   r-   r3   r4   r5   r6   r7   s      r   r:   zlazylist._lazy   r;   r/   c                 $    | j                  d      S r=   r?   r+   s    r   r>   zlazylist.__repr__   r@   r/   c                 $    | j                  d      S rB   r?   r+   s    r   r2   zlazylist.__len__   rC   r/   c                 $    | j                  d      S rE   r?   r+   s    r   rF   zlazylist.__iter__   r@   r/   c                 (     | j                   dg| S rH   r?   rJ   s     r   rI   zlazylist.__contains__   rK   r/   c                 (     | j                   dg| S )Ninsertr?   rJ   s     r   rs   zlazylist.insert   r`   r/   c                 (     | j                   dg| S )Nappendr?   rJ   s     r   ru   zlazylist.append   r`   r/   c                 (     | j                   dg| S )Nextendr?   rJ   s     r   rw   zlazylist.extend   r`   r/   c                 (     | j                   dg| S )Nremover?   rJ   s     r   ry   zlazylist.remove   r`   r/   c                 (     | j                   dg| S rb   r?   rJ   s     r   rc   zlazylist.pop   rW   r/   N)rf   rg   rh   r-   r:   r>   r2   rF   rI   rs   ru   rw   ry   rc   r*   r/   r   rj   rj      s9    
2&%&1++++(r/   rj   	universal)NNVBJJRBPRDTPPr   NOCJUHPT.Xc                    |j                  d      r*| dj                  t        |j                  d      d         fS |dv r| t        fS |dv r| t        fS |dv r| t
        fS |dv r| t        fS |d	v r| t        fS |d
v r| t        fS |dv r| t        fS |dv r| t        fS |dv r| t        fS |dv r| t        fS |dv r| t        fS |dv r| t        fS | t        fS )zLReturns a (token, tag)-tuple with a simplified universal part-of-speech tag.)zNNP-zNNPS-z{}-{}-)r|   NNSNNPNNPSNP)MDr}   VBDVBGVBNVBPVBZ)r~   JJRJJS)r   RBRRBSWRB)PRPzPRP$WPzWP$)r   PDTWDTEX)IN)CD)CC)r   )POSRPTO)SYMLSr   !?,:()"#$)
startswithformatNOUNsplitVERBADJADVPRONDETPREPNUMCONJINTJPRTPUNCr   tokentags     r   penntreebank2universalr      s   
~~'(w~~dCIIcN2,>?@@
00t}
==t}
""s|
))s|
**t}
((s|
g~t}
g~s|
g~t}
g~t}
!!s|
MMt}1:r/   z(\S+)\sz.,;:!?()[]{}`''"@#$^&*+-|=~_)%za.zadj.zadv.zal.za.m.zc.zcf.zcomp.zconf.zdef.zed.ze.g.zesp.zetc.zex.zf.zfig.zgen.zid.zi.e.zint.zl.zm.zMed.zMil.zMr.zn.zn.q.zorig.zpl.zpred.zpres.zp.m.zref.zv.zvs.zw/z^[A-Za-z]\.$z^([A-Za-z]\.)+$z^[A-Z][|bcdfghjklmnpqrstvwxzz]+.$)z<3u   ♥)
z>:Dz:-Dz:Dz=-Dz=DzX-Dzx-DXDxDz8-D)
z>:Pz:-Pz:Pz:-pz:pz:-bz:bz:c)z:o)z:^))z>:)z:-)z:)z=)z=]z:]z:}z:>z:3z8)z8-))	z>;]z;-)z;)z;-]z;]z;Dz;^)z*-)z*))	z>:oz:-Oz:Oz:oz:-oo_Ozo.Ou   °O°u   °o°)z>:/z:-/z:/z:\z>:\z:-.z:-sz:sz:Sz:-Sz>.>)z>:[z:-(z:(z=(z:-[z:[z:{z:-<z:cz:-cz=/)z:'(z:'''(z;'()	)love      ?)grinr   )tauntg      ?)smileg      ?)winkg      ?)gaspg?)worryg      п)frowng      )cry      z ?z
(%s)($|\s)z
\( ?\! ?\)z 'dz 'mz 'sz 'llz 'rez 'vez n't)z'dz'mz'sz'llz'rez'ven'tzEND-OF-SENTENCE\n{2,}c                     t        |j                  dd            }t        |j                               D ]  \  }}t	        j
                  |||       }  t        | t              rkt        |       j                  dd      j                  dd      j                  dd      j                  d	d
      j                  dd      j                  dd      } t	        j
                  dd|       } t	        j
                  |dt        z  |       } t	        j
                  dd|       } g }t        j                  | dz         D ]  }t        |      dkD  sg }	|j                  |      rD||vr@|j                  |      r|j                  |d          |dd }|j                  |      r||vr@|j                  |dz         r||vr|j                  |      r|	j                  |d          |dd }|j                  d      r%|	j                  d       |dd j                  d      }|j                  d      r]||v s?t         j#                  |      *t$        j#                  |      t&        j#                  |      n2|	j                  |d          |dd }|j                  |dz         r||vr|dk7  r|j                  |       |j)                  t+        |	              g gdd}}}
|t        |      k  r||   ddddt        fv r|t        |      k  rc||   dddd	dddddt        f
v rN||   dv r|
d   j-                  ||         dz  dk(  rn)|dz  }|t        |      k  r||   dddd	dddddt        f
v rN|
d   j)                  d  ||| D               |
j                  g        |}|dz  }|t        |      k  r|
d   j)                  |||        d! |
D        }
d" |
D        }
|
D cg c]  }t.        j                  d# |       }
}|
S c c}w )$a<  Returns a list of sentences. Each sentence is a space-separated string of tokens (words).
    Handles common cases of abbreviations (e.g., etc., ...).
    Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence.
    Headings without an ending period are inferred by line breaks.
    r   r   u   “u    “ u   ”u    ” u   ‘u    ‘ u   ’u    ’ 'z ' r   z " z

z %s z\s+ r      N)r   r   z...r   r   r   )r   r      c              3   4   K   | ]  }|t         k7  s|  y wr)   )EOS).0ts     r   	<genexpr>zfind_tokens.<locals>.<genexpr>  s      Dq18 Ds   c              3   \   K   | ]$  }t        |      d kD  sdj                  |       & yw)r   r   N)lenjoinr   ss     r   r   zfind_tokens.<locals>.<genexpr>  s"     >3q6A:!>s   ,,c              3   H   K   | ]  }t         j                  d |        yw)(!)N)
RE_SARCASMsubr   s     r   r   zfind_tokens.<locals>.<genexpr>  s     =aq)=s    "c                 h    | j                  d      j                  dd      | j                  d      z   S )Nr   r   r   r   )groupreplace)ms    r   <lambda>zfind_tokens.<locals>.<lambda>  s(    1771:#5#5c2#>#K r/   )tupler   rm   rY   rer   r   r   r   TOKENfindallr   r   ru   endswithrstripRE_ABBR1matchRE_ABBR2RE_ABBR3rw   reversedcountRE_EMOTICONS)stringpunctuationabbreviationsr   	linebreakabtokensr   tail	sentencesijr   s                 r   find_tokensr  ^  s    ++C45KW]]_% &11f%& &#KWUG$WUG$WUG$WUG$WS% WS%  	 VVFD&)FVVIv|V4FVVFC(FF]]6C<( *q6A:D,,{+0@<<,MM!A$'!"A	 ,,{+0@
 **[612q7G::k*KK"&#2A::e$KK&#2c*A::c?]*#>>!,8#>>!,8#>>!,8AbE*crF) **[612q7G* Bwa MM(4.)?*@ dAq!qI
c&k/!9S#s33c&k/fQi4 ' !9
*y}/B/B6!9/MPQ/QUV/VQ c&k/fQi4 ' bM   DF1QK DDR A	Q+ c&k/, bM!%>i>I=9=I  	KQOI  	s   O;c              #   $  K   | r	t        | t              r-t        j                  j	                  |       rt        | d      }nNt        | t              r| j                         }n-t        | d      r| j                         j                         }n| }t        |      D ]p  \  }}|dk(  r/t        |t              r|j                  t        j                        n|}|j                         }t        |      }|r|r|j                  |      rm| r yw)zReturns an iterator over the lines in the file at the given path,
    stripping comments and decoding each line to Unicode.
    r   )r   readr   N)r   r   ospathexistsopen
splitlineshasattrr
  	enumerater   stripcodecsBOM_UTF8decode_utf8r   )r  r   commentfr  lines         r   _readr    s      dJ'BGGNN4,@TG,Aj)!AT6"		&&(AA | 
	GAt 6ju5 

6??+ 
 ::<Dt$DG(@J
	 s   DDc                   F    e Zd Z	 	 	 	 	 	 ddZd Zed        Zed        Zy)LexiconNc                     || _         || _        t        | |      | _        t	        | |      | _        t        | ||      | _        y)zA dictionary of words and their part-of-speech tags.
        For unknown words, rules for word morphology, context and named entities can be used.
        )r  )r  r   N)_path	_language
Morphology
morphologyContextcontextEntitiesentities)r,   r  r   r"  r$  r   languages          r   __init__zLexicon.__init__  s>     
!$T
;t'2 H#>r/   c                 d    t         j                  | d t        | j                        D               y )Nc              3   f   K   | ])  }|j                         s|j                  d       dd  + yw)r   Nr   )r  r   r   xs     r   r   zLexicon.load.<locals>.<genexpr>  s'     T!'')1773<+Ts   11)r1   r_   r  r  r+   s    r   r-   zLexicon.load  s    DTU4::5FTUr/   c                     | j                   S r)   r  r+   s    r   r  zLexicon.path       zzr/   c                     | j                   S r)   r  r+   s    r   r%  zLexicon.language      ~~r/   )r   NNNr   N)rf   rg   rh   r&  r-   propertyr  r%  r*   r/   r   r  r    sI     ?$V    r/   r  c                       e Zd ZddZd Zy)RulesNc                 2    |i }|i }||c| _         | _        y r)   )lexiconcmd)r,   r5  r6  s      r   r&  zRules.__init__  s&    ;C?G!(#dhr/   c                     |S )z6Applies the rule to the given token or list of tokens.r*   r,   r*  s     r   applyzRules.apply  s    r/   NN)rf   rg   rh   r&  r9  r*   r/   r   r3  r3    s    .r/   r3  c                   H    e Zd Zd	dZed        Zd Zd
dZddZd Z	ddZ
y)r  Nc                     |i }d}t         j                  |d      }|j                  d t        |j	                               D               t
        j                  | ||       || _        y)z:A list of rules based on word morphology (prefix, suffix).N)	charhasprefhassufaddprefaddsuf
deletepref	deletesufgoodleft	goodrightTc              3   0   K   | ]  \  }}d |z   |f  yw)r  Nr*   r   kr   s      r   r   z&Morphology.__init__.<locals>.<genexpr>-  s     >DAqC!GQ<>   )r1   fromkeysr_   rm   rY   r3  r&  r  r,   r5  r  r6  s       r   r&  zMorphology.__init__  sY    ?G

 mmC&

>D,=>>tWc*
r/   c                     | j                   S r)   r,  r+   s    r   r  zMorphology.path1  r-  r/   c                 d    t         j                  | d t        | j                        D               y )Nc              3   <   K   | ]  }|j                           y wr)   r   r)  s     r   r   z"Morphology.load.<locals>.<genexpr>7       @1779@   rm   rw   r  r  r+   s    r   r-   zMorphology.load5      D@eDJJ.?@Ar/   c                    |d   }| D ]}  }|d   | j                   v r+t        d      |d   |d   |d   j                         f\  }}}}	|d   | j                   v r:t        d      |d   |d   |d   j                         j                  d      f\  }}}}	r|d   |d   k7  r	dk(  r|v s|	dk(  r|j	                        s|	dk(  r|j                        s|	d	k(  r|z   | j                  v s|	d
k(  r|z   | j                  v s|	dk(  r+|j	                        r|t        |      d | j                  v sO|	dk(  r,|j                        r|dt        |        | j                  v s|	dk(  r|d   k(  s|	dk(  so|d   k(  sy|d<    |S )zFApplies lexical rules to the given token, which is a [word, tag] list.r   r   r   r  r=  r>  r?  r@  rA  rB  NrC  rD  rE  )r6  boollowerlstripr   r   r5  r   )
r,   r   previousnextwrr  r*  posr6  s
             r   r9  zMorphology.apply9  s   !H 	Attxx!%a!A$"qtzz|!C1c3ttxx!%a!A$"qtzz|7J7J37O!O1c3U1X1%169$a8O

19$Q$,,)>8OA(=<'Q#a&(t||3 ;&

1)SVG4:%!tAw,;&1+;a5	6 r/   c                 F   |j                  d      r|j                  d      r|dd d}}|j                  d      r|dd d}}|j                  d      r|dd d}}|r||d|j                  d      z   |d	g}n||j                  d      |d	g}t        j	                  | ||       y
)zInserts a new rule that assigns the given tag to words with the given affix,
        e.g., Morphology.append("RB", "-ly").
        r   r   r   r=  r   r?  r>  r  r*  N)r   r   rX  rj   rs   )r,   r  r   affixr6  taggedr\  s          r   rs   zMorphology.insertY  s     C U^^C%8r"v3EC r"x3E>>#r"y3Ecjjo 5sC@A

3c2Aa#r/   c                 H     | j                   t        |       dz
  g|i | y Nr   rs   r   rU   s      r   ru   zMorphology.appendi  "    CIM3D3F3r/   c                 8    |g }|D ]  } | j                   |   y r)   ru   r,   rulesr\  s      r   rw   zMorphology.extendl  (    =E 	ADKKO	r/   Nr   r:  r:  )r?  Nr)   rf   rg   rh   r&  r1  r  r-   r9  rs   ru   rw   r*   r/   r   r  r    s6    (  B@$ 4r/   r  c                   F    e Zd Zd	dZed        Zd Zd Zd
dZd Z	ddZ
y)r!  Nc                 t    |i }d}t         j                  | |t        j                  |d             || _        y)zAA list of rules based on context (preceding and following words).N)prevtagnexttagprev2tagnext2tagprev1or2tagnext1or2tagprev1or2or3tagnext1or2or3tagsurroundtagcurwdprevwdnextwd
prev1or2wd
next1or2wdnext1or2or3wdprev1or2or3wd	prevwdtag	nextwdtag	wdprevtag	wdnexttag	wdand2aftwdand2tagbfrwdand2tagaftlbigramrbigram
prevbigram
nextbigramT)r3  r&  r1   rJ  r  rK  s       r   r&  zContext.__init__y  s7    ?G
: 	tWdmmC&>?
r/   c                     | j                   S r)   r,  r+   s    r   r  zContext.path  r-  r/   c                 d    t         j                  | d t        | j                        D               y )Nc              3   <   K   | ]  }|j                           y wr)   rO  r)  s     r   r   zContext.load.<locals>.<genexpr>  rP  rQ  rR  r+   s    r   r-   zContext.load  rS  r/   c                    dgdz  }||z   |z   }t        |      D ]U  \  }}| D ]I  }|d   dk(  r|d   |d   k7  r	|d   dk7  r!|d   |d   t        |      dkD  r|d   nd	}	}}|j                         }|d
k(  r|||dz
     d   k(  s|dk(  r|||dz      d   k(  s|dk(  r|||dz
     d   k(  s|dk(  r|||dz      d   k(  s|dk(  r|||dz
     d   ||dz
     d   fv s||dk(  r|||dz      d   ||dz      d   fv s^|dk(  r#|||dz
     d   ||dz
     d   ||dz
     d   fv s6|dk(  r#|||dz      d   ||dz      d   ||dz      d   fv s|dk(  r|||dz
     d   k(  r|	||dz      d   k(  s|dk(  r|||dz      d   k(  s|dk(  r|||dz
     d   k(  s|dk(  r|||dz      d   k(  s|dk(  r|||dz
     d   ||dz
     d   fv s|dk(  r|||dz      d   ||dz      d   fv st|dk(  r|||dz
     d   k(  r|	||dz
     d   k(  sR|dk(  r|||dz      d   k(  r|	||dz      d   k(  s0|dk(  r|||dz
     d   k(  r|	||dz      d   k(  s|dk(  r|||dz      d   k(  r|	||dz      d   k(  s|dk(  r|||dz      d   k(  r|	||dz      d   k(  s|dk(  r|||dz
     d   k(  r|	||dz      d   k(  s|dk(  r|||dz      d   k(  r|	||dz      d   k(  s|dk(  r|||dz
     d   k(  r|	||dz      d   k(  si|d k(  r|||dz      d   k(  r|	||dz      d   k(  sH|d!k(  r|||dz
     d   k(  r|	||dz
     d   k(  s'|d"k(  s|||dz      d   k(  s*|	||dz      d   k(  s:||   d   |d   g||<   L X |t        |      t        |        S )#znApplies contextual rules to the given list of tokens,
        where each token is a [word, tag] list.
        )STAARTr     r   r  r   *r      r   ro  rp  rq  rr  rs  rt  ru  rv  rw  rx  ry  rz  r{  r|  r  r  r  r  r  r  r  r  r  r  r  )r  r   rW  )
r,   r  or   r  r   r\  r6  r*  ys
             r   r9  zContext.apply  s    ""Q&JN!! )	+HAu (+8x'8qt#!aD!A$A
!QiikI%!qQx{*:y(Q!AE(1+-=z)a1QU8A;.>z)a1QU8A;.>},qQx{Aa!eHQK6P1P},qQx{Aa!eHQK6P1P//!AE(1+qQx{Aa!eHQK!HH //!AE(1+qQx{Aa!eHQK!HH},aAhqk1Aa1QQRU8TU;FVw1!a%+;xA1q5!,<xA1q5!,<|+aAhqk1QU8A;5O0O|+aAhqk1QU8A;5O0O{*qAa!eHQK/?A1q5RSDT{*qAa!eHQK/?A1q5RSDT{*qAa!eHQK/?A1q5RSDT{*qAa!eHQK/?A1q5RSDT{*qAa!eHQK/?A1q5RSDT~-!qQx{2BqAaRSeHUVKGW~-!qQx{2BqAaRSeHUVKGWy(Q!AE(1+-=!qQxPQ{BRy(Q!AE(1+-=!qQxPQ{BR|+Qq1uXa[0@Q!APQE(ST+EU|+Qq1uXa[0@Q!APQE(ST+EUaDGQqT?AaDQ(+)	+T Q3q6'""r/   c           
          d|v r|s|s|j                  d      \  }}d}d|v r|s|s|j                  d      \  }}d}t        j                  | |||||xs d|xs dg       y)zInserts a new rule that updates words with tag1 to tag2,
        given constraints x and y, e.g., Context.append("TO < NN", "VB")
        z < ro  z > rp  r   N)r   rj   rs   )r,   r  tag1tag2r6  r*  r  s          r   rs   zContext.insert  sk     D=1jj'GD!CD=1jj'GAtCa$c17AG!DEr/   c                 H     | j                   t        |       dz
  g|i | y rb  rc  rU   s      r   ru   zContext.append  rd  r/   c                 8    |g }|D ]  } | j                   |   y r)   rf  rg  s      r   rw   zContext.extend  ri  r/   rj  )ro  NNr)   rl  r*   r/   r   r!  r!  x  s8    "H  B0#d
F4r/   r!  z^http://z#^www\..*?\.[com|org|net|edu|de|uk]$z#^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$c                   >    e Zd ZddZed        Zd Zd Zd	dZd Z	y)
r#  Nc                 Z    |i }d}t         j                  | ||       || _        || _        y)zA dictionary of named entities and their labels.
        For domain names and e-mail adresses, regular expressions are used.
        N)perslocorg)r3  r&  r  r   )r,   r5  r  r   r6  s        r   r&  zEntities.__init__  s5     ?G

 	tWc*
r/   c                     | j                   S r)   r,  r+   s    r   r  zEntities.path  r-  r/   c                     t        | j                        D ]W  }|j                         D cg c]  }|j                          c}}t        j                  | |d   g       j                  |       Y y c c}w Nr   )r  r  r   rW  r1   rR   ru   r8  s     r   r-   zEntities.load  sZ     tyy! 	6A$%GGI.q.AOOD!A$+2215	6.s   A1c                    d}|t        |      k  r?||   d   j                         }t        j                  |      s*t        j                  |      st
        j                  |      r| j                  ||   d<   || v r| |   D ]  }|d   | j                  v r|dd d|d   j                         z   fn|df\  }}d}t        |      D ]6  \  }}||z   t        |      k\  s|||z      d   j                         |k7  s4d} n |s|||z   dz    D ]%  }|d   d	k(  xr |d   xs | j                  |z   |d<   ' ||z  } n |dz  }|t        |      k  r?|S )
zyApplies the named entity recognizer to the given list of tokens,
        where each token is a [word, tag] list.
        r   r   r   Nr   r   TFr   )
r   rW  
RE_ENTITY1r   
RE_ENTITY2
RE_ENTITY3r   r6  upperr  )	r,   r  r  r[  r   r   r  r  r   s	            r   r9  zEntities.apply  s    #f+oq	!""$A"j&6&6q&9Z=M=Ma=P#xxq	!Dya A :;2$((9J3Bqu{{}!45QRTVPW As A )! "1q5CK/6!a%=3C3I3I3Kq3P %A!" %+AA	%: $E %aF 2 ?uQx K488 #($E!H$ Q!" FA- #f+o. r/   c                     |j                  d      |gz   D cg c]  }|j                          }}| j                  |d   g       j                  |       yc c}w )zbAppends a named entity to the lexicon,
        e.g., Entities.append("Hooloovoo", "PERS")
        r   r   N)r   rW  rR   ru   )r,   entitynamer   r   s        r   ru   zEntities.append/  sO     !'S 1TF :;1QWWY;;!b!((+ <s   Ac                 <    |D ]  \  }}| j                  ||        y r)   rf  )r,   r$  r  r  s       r   rw   zEntities.extend6  s#    $ 	&LFDKK%	&r/   )Nr   r   )r  )
rf   rg   rh   r&  r1  r  r-   r9  ru   rw   r*   r/   r   r#  r#    s0      6@,&r/   r#  moodirony)r|   r}   r~   r   z^[acdnrv][-_][0-9]+$c                 J    t        |       t        t        |       xs d      z  S rb  )sumr"   r   )rm   s    r   avgr  V  s    t9uSY^!,,,r/   c                       e Zd ZddZddZy)ScoreNc                 :    |g }t         j                  | ||g      S )z>A (polarity, subjectivity)-tuple with an assessments property.)r   __new__r,   polaritysubjectivityassessmentss       r   r  zScore.__new__[  s#    K}}THl#;<<r/   c                     |g }|| _         y r)   )r  r  s       r   r&  zScore.__init__a  s    K&r/   r)   )rf   rg   rh   r  r&  r*   r/   r   r  r  Z  s    ='r/   r  c                   p    e Zd ZddZed        Zed        Zed        ZddZe	fdZ
ddZdd	Z	 dd
Zy)	SentimentNc                    || _         d| _        d| _        || _        i | _        i | _        |j                  dt              | _        |j                  dd      | _	        |j                  dd      | _
        |j                  dd       | _        y)	a,  A dictionary of words (adjectives) and polarity scores (positive/negative).
        The value for each word is a dictionary of part-of-speech tags.
        The value for each word POS-tag is a tuple with values for
        polarity (-1.0-1.0), subjectivity (0.0-1.0) and intensity (0.5-2.0).
        N	tokenizer	negations)nonotr   never	modifiers)r   modifierc                 $    | j                  d      S )Nly)r   r[  s    r   r   z$Sentiment.__init__.<locals>.<lambda>w  s    D9I r/   )r  r  _confidence_synset_synsetslabelerrT   r  r  r  r  r  )r,   r  r%  synset
confidencerV   s         r   r&  zSentiment.__init__h  sy     
K=K1NOK9

:/IJr/   c                     | j                   S r)   r,  r+   s    r   r  zSentiment.pathy  r-  r/   c                     | j                   S r)   r/  r+   s    r   r%  zSentiment.language}  r0  r/   c                     | j                   S r)   )r  r+   s    r   r  zSentiment.confidence  s    r/   c           
         |s| j                   }t        j                  j                  |      syi i i }}}t	        j
                  |      }|j                         }|j                  d      D ]  }| j                  3| j                  t        |j                  j                  dd            k  sC|j                  j                  d      |j                  j                  d      |j                  j                  dd      |j                  j                  dd      |j                  j                  d	d
      |j                  j                  d      |j                  j                  | j                        f\  }}}}	}
}}t        |      t        |	      t        |
      f}|r1|j                  |i       j                  |g       j                  |       |r|r|||<   |sp|j                  |g       j                  |        |j                  j                  d| j                        | _        |D ](  }t!        d ||   j#                         D              ||<   * t%        |j#                               D ]8  \  }}t'        |j)                          D cg c]  }t+        |       c}||   d<   : |j#                         D ]'  \  }}t'        | D cg c]  }t+        |       c}||<   ) t         j-                  | |       t         j-                  | j.                  |       t         j-                  | j0                  |       yc c}w c c}w )zLoads the XML-file (with sentiment annotations) from the given path.
        By default, Sentiment.path is lazily loaded.
        Nr   r          formr]  r  r  	intensityr   labelr%  c              3   n   K   | ](  \  }}|t        | D cg c]  }t        |       c}f * y c c}w wr)   )zipr  )r   r]  psieachs       r   r   z!Sentiment.load.<locals>.<genexpr>  s7      C S#Y7Ts4y787s   505)r  r  r  r  r   parsegetrootr   r  r"   attribrT   r  rR   ru   r  r1   rY   rm   r  r]   r  r_   r  r  )r,   r  wordssynsetslabelsxmlr[  r]  pr   r  r  r  r  r  ids                   r   r-   zSentiment.load  s    ::Dww~~d#!#Rw%kkmV$ 	?A'4+;+;u\3/@ , HHLL(HHLL'HHLLS1HHLL5HHLLc2HHLL)HHLL.2.31a Qxq584$$Q+66sB?FFsK %F1I&&vr299#>'	?( 
DNNC 	A  %a 0 E!H	 5;;=) 	HFAs474FGDc$iGE!HTN	H }} 	<GB14c;3t9;GBK	<D% DLL&)DMM7+ H <s   L<Mc                 h   t        |      j                  d      }|j                  d      s8|t        k(  rd|z   }|t        k(  rd|z   }|t
        k(  rd|z   }|t        k(  rd|z   }t        j                  |       dk(  r| j                          t        | j                  j                  |d      d	d
       S )zReturns a (polarity, subjectivity)-tuple for the given synset id.
        For example, the adjective "horrible" has id 193480 in WordNet:
        Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0, 1.0).
           )n-v-a-r-r  r  r  r  r   )r  r  Nr   )r   zfillr   r   r   	ADJECTIVEADVERBr1   r2   r-   r   r  rT   )r,   r  r]  s      r   r  zSentiment.synset  s    
 W]]1}}56d{BYd{BYiBYf}BY<<"IIKT]]&&r:6r:;;r/   c                 2   d fd}t        d      r>j                  d   f| j                  j                  j                        z   dz   g}nRt        t              r_t        j                        rJt        d      r>j                  d   f| j                  j                  j                        z   dz   g}nt        t              rG| j                  d d	j                  | j                              j                         D        |      }nt        d
      r.| j                  d t        j                        D        |      }nRt        d      r%| j                  d j                  D        |      }n!t        d      rI| j                  j                   xs j"                  j%                         j                  dd ff|      }nt        d      rB| j                  t        j                  d D              |      }|j'                  dfd       n~t        t(              rB| j                  t        j                  d D              |      }|j'                  dfd       n,t        t*              r| j                  d D        |      }ng }|j-                  dd       }t/         ||D 	cg c]  \  }}}}	||f c}	}}}|       ||D 	cg c]  \  }}}}	||f c}	}}}|      |      S c c}	}}}w c c}	}}}w )aq  Returns a (polarity, subjectivity)-tuple for the given sentence,
        with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0.
        The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector.
        An optional weight parameter can be given,
        as a function that takes a list of words and returns a weight.
        c                      yrb  r*   r  s    r   r   z$Sentiment.__call__.<locals>.<lambda>      r/   c                 n    d\  }}| D ]  \  }} ||      }|||z  z  }||z  } |t        |xs d      z  S )N)r   r   r   )r"   )r  weightedr   nr  scorer[  s          r   r  zSentiment.__call__.<locals>.avg  sR    DAq + uUOQYQ uQV!}$$r/   glossr   )r]  r)   synonymsc              3   @   K   | ]  }|j                         d f  y wr)   rW  r   r[  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>  s     Pq!'')T"Ps   r   r  c              3      K   | ];  }|j                   xs |j                  j                         |j                  d d f = y wNr   r   r   rW  r]  r  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>  s;       WW0 0!%%)<   AAlemmatac              3      K   | ];  }|j                   xs |j                  j                         |j                  d d f = y wr  r  r  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>  s3     Ka!''-QXX^^-quuRay9Kr  r   Nr   termsc              3   (   K   | ]
  }|d fdf  y wNr:  r*   r  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>       #I!aY$=#I   weightc                 (    j                   | d      S r  )r  r[  r   s    r   r   z$Sentiment.__call__.<locals>.<lambda>  s    !''!A$- r/   c              3   (   K   | ]
  }|d fdf  y wr  r*   r  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>	  r  r  c                     | d      S r  r*   r  s    r   r   z$Sentiment.__call__.<locals>.<lambda>  s    !AaD' r/   c              3   $   K   | ]  }|d f 
 y wr)   r*   r  s     r   r   z%Sentiment.__call__.<locals>.<genexpr>  s     !71d)!7s   c                      yrb  r*   r  s    r   r   z$Sentiment.__call__.<locals>.<lambda>  r  r/   )r  r  r  )r  r  r  r  r]  r   r   	RE_SYNSETr   r  r   r  r   r   from_iterabler  r   r   rW  rR   r1   rm   rT   r  )
r,   r   negationrV   r  r  r   r[  r  r*  s
    `        r   __call__zSentiment.__call__  s    '2 	% 1g**Q-!DKK!%%K$@@7JKA
 q*%)//!*<JAW**Q-!DKK!%%K$@@7JKA :&  PCHHT^^A5F,G,M,M,OPA
 Q$  "003 A Q	"  K177KXA Q   177#>ahhnn.>bq	"J!LhWA
 Q   ###Iq#II8A h(?@4   ###Iq#II8A h(9:4   !7Q!7BAAHk2155ZQ1a1a&5v>q99Aq!q!f96B
 	
59s   L	(Lc                    |g }g }d}d}|D ]  \  }}|
|| v rk|| |   v rc| |   |   \  }}	}
|:|j                  t        |g||	|
d| j                  j                  |                   ||d   d   j                  |       t	        dt        ||d   d   z  d            |d   d	<   t	        dt        |	|d   d   z  d            |d   d
<   |
|d   d<   | j                  j                  |      |d   d<   |1|d   d   j                  d|       d|d   d   z  |d   d<   d|d   d<   d}d}|r|| j                  v s,t        t        | |   j                  | j                              r||f}|sf|| j                  v sv|}z|r|| j                  v r|}n!|rt        |j                  d            dkD  rd}|F|D|| j                  v s| j                  |d         r"|d   d   j                  |       d|d   d<   d}n|rt        |      dkD  rd}|dk(  rJt        |      dkD  r<|d   d   j                  d       t	        dt        |d   d	   dz  d            |d   d	<   |dk(  r%|j                  t        |gddddt                     |j!                         du st        |      dk  s|t"        vst$        j'                         D ]?  \  \  }}}|t        d |      v s|j                  t        |g|dddt(                        t+        t        |            D ]=  }
||
   d   }||
   d	   }||
   d
   }	||
   d   }||
   d   }||dk  r|dz  n||	|f||
<   ? |S )zReturns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words:
        where chunk is a list of successive words: a known word optionally
        preceded by a modifier ("very good") or a negation ("not good").
        Nr   )r[  r  r   r  r  r*  r   r[  r   r  r   r  r   r*  r   r  r   r   r   g      ?r   r  F   c                 "    | j                         S r)   r  )r   s    r   r   z'Sentiment.assessments.<locals>.<lambda>c  s    aggi r/   g      )ru   r1   r  rT   maxminrs   r  anymaprI   r  r   r  r  IRONYisalphaPUNCTUATION	EMOTICONSrY   MOODrange)r,   r  r	  r  r   r  r[  r]  r  r   r  _typer   r*  s                 r   r  zSentiment.assessments  s   
 =E C	"FAs yDySDG^q'#,1a9HHTQC1Q!t||?O?OPQ?RST=bE#J%%a(!$T3q1R5:~t+D!EAbE#J!$T3q1R5:~t+D!EAbE#J!"AbE#J!%!1!1!!4AbE#J=bE#J%%a+!$quSz!1AbE#J!#AbE#J t~~-3tAw33T^^DECAT^^ 3A T^^ 3A3qwws|,q0A M.$--!2EbE#J%%a(!#AbE#JA3q6A:A8A
bE#J%%c*!$T3quSzD/@$+G!HAbE#J:HHTQC3#UKL IIK5(SVq[Qk=Q)2): "
A$7 ;;HHTQC1sa4%PQ!"AC	"H s1v 	7A!S	A!S	A!S	A!S	A!S	A1q5q4xaA6AaD	7 r/   c                 f    | j                  |i       }|||fx||<   |d<   |r|| j                  |<   yy)zAnnotates the given word with polarity, subjectivity and intensity scores,
        and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)").
        N)rR   r  )r,   r   r]  r  r  r  r  r[  s           r   annotatezSentiment.annotatep  sC     OOD"%$lI>>#4!&DLL r/   )r   NNNr)   )T)NT)Nr  r  r   N)rf   rg   rh   r&  r1  r  r%  r  r-   r  r  r
  r  r  r*   r/   r   r  r  g  sn    K"        0,d ' <&L
\Vr TX	'r/   r  z^[0-9\-\,\.\:\/\%\$]+$c                 N   t        | t        t        f      r| \  } }| j                  d      rd}| j                  d      rd}| j                  d      r| j                  d      sd}| j                  d      sd	| v rd
}| j                  d      rd}| j                  d      rd}| |gS )zHDefault morphological tagging rules for English, based on word suffixes.ingr   r  r   r   )isousssr   )
ablealfulibleientishivelessticr  r   r~   edr   )ateifyiseizer   )r   rm   r   r   r   s     r   _suffix_rulesr.    s    %$'
s~~e~~d~~c5>>2E#FU	
 %<~~d~~233<r/   c	                 x   |i }g }
t        |       D ]P  \  }}|
j                  ||j                  ||dk(  xr |j                  |j                               xs d      g       R t        |
      D ]  \  }\  }}d\  }}|dkD  r|
|dz
     }|t	        |
      dz
  k  r|
|dz      }|||duxr |j
                  xs dv sO||j                  |dg||      |
|<   j|j                         r|dk7  r||d   g|
|<   t        j                  |      ||d   g|
|<   ||j                  ||d   g||      |
|<   |dk(  rt        ||d   g      |
|<   ||d   g|
|<    |||j                  |
      }
||j                  |
      }
|-|
D cg c]   \  }}t         |||            xs ||d   g" }
}}|
S c c}}w )	a  Returns a list of [token, tag]-items for the given list of tokens:
    ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]]
    Words are tagged using the given lexicon of (word, tag)-items.
    Unknown words are tagged NN by default.
    Unknown words that start with a capital letter are tagged NNP (unless language="de").
    Unknown words that consist only of digits and punctuation marks are tagged CD.
    Unknown words are then improved with morphological rules.
    All words are improved with contextual rules.
    If a model is given, uses model for unknown words instead of morphology and context.
    If map is a function, it is applied to each (token, tag) after applying all rules.
    Nr   rk  r   r*   der   en)r  ru   rT   rW  r   unknownr9  istitler   r   r.  rm   )r  r5  modelr   r"  r$  defaultr%  r  rV   r`  r  r   r   prevrZ  s                   r   	find_tagsr7    s   . Ff% 
5GKKqAv'L'++ekkm2L'TPTUV	


 %V, 0<E3/
dq5!a%=Ds6{Q!a%=D;%E$5$G%--$M2N !KKtTBq	X%5"GAJ/q	%,"GAJ/q	'&,,eWQZ-@$Mq	T!)5'!**=>q	 #GAJ/q	104 u}v&'
QWX:5#$s5#'>E71:+>>XXM Ys   %F6/z.NN|NNS|NNP|NNPS|NNPS?\-[A-Z]{3,4}|PR|PRP|PRP\$zVB|VBD|VBG|VBN|VBP|VBZz
JJ|JJR|JJSz(?<!W)RB|RBR|RBSr   z((z)/)*((DT|CD|CC|CJ)/)*((z)/)*((z)/)+VPz(((MD|z)/)+)+z((MD)/)r   z((IN|PP|TO)/)+ADJPz((CC|CJ|ADVPz|WRB)/)+z)/)+((z)/)*z)/)*)+r   r  c                    | D cg c]  }| }}dj                  d | D              }t        t        |dv          D ]  \  }}|j                  |      D ]  }|j	                         }|d| j                  t              }	|j                  d      j                  t              }
t        |	|	|
z         D ]o  }t        ||         dk(  rt        ||         dk  s'||	k(  r||   d   dv r|	dz  }	<||	k(  r||   j                  d	|z          Y||   j                  d
|z          q   t        d |      D ]  }|j                  d        t        |      D ]d  \  }\  }}}|j                  d      s|dk(  s"|t        |      dz
  k  s4||dz      d   j                  d      rOd||dz      d<   d||dz      d<   f |S c c}w )zThe input is a list of [token, tag]-items.
    The output is a list of [token, tag, chunk]-items:
    The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. =>
    The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O
    r   c              3   6   K   | ]  \  }}| t            y wr)   )	SEPARATOR)r   r   r   s      r   r   zfind_chunks.<locals>.<genexpr>+  s     @:5#cU9+&@s   )caesptfritrA  roNr   r  r   )r   r   KONzConj(neven)zB-zI-c                     t        |       dk  S )Nr  )r   )r*  s    r   r   zfind_chunks.<locals>.<lambda>D  s    #a&1* r/   Or   zB-NPr~   zB-ADVPr   )r   CHUNKSintfinditerstartr   r>  r   r  r   ru   filterr  r   )r`  r%  r*  chunkedtagsr   ruler   r  r  r  rH  chink_wordr	   s                  r   find_chunksrR  $  s    !!Qq!G!77@@@DHBBC 6	T t$ 	6A 	ARay)A
  +A1a!e_ 6wqz?a'wqz?Q&Av'!*Q-3U"UQa
))$*5  
))$*56	66. ,g6 S #,G"4 +E3>>$EVO3w<!##GAEN1,=,H,H,N$,Aq!$*Aq!+ NG "s   	F>c                    | D ]  }|j                  d        t        |       D ]  \  }}|d   j                  d      s|d   dk(  s$|t        |       dz
  k  s6| |dz      d   j                  d      s| |dz      d   dv s^d|d<   d	}| |dz   d
 D ]V  }|d   j                  d      s	|d   dv s |d   j                  d      r|rd|d<   |d   j                  d      rPd|d<   d}X  | S )zThe input is a list of [token, tag, chunk]-items.
    The output is a list of [token, tag, chunk, preposition]-items.
    PP-chunks followed by NP-chunks make up a PNP-chunk.
    rG  r   r   r   r   )r   r   )r   r   zB-PNPTNzI-PNPF)ru   r  r   r   )rM  chr  r	   pps        r   find_prepositionsrV  P  s     
		#g& #58T"uRyC'73w<!##Aq!**<81q5>!$6#b	!!a%'* #BqENN<8BqE^<S!u~~d+!(2a5>>$/!(2"##" Nr/   pennc                   L    e Zd Zd
dZd Zd Zd Zd Zd Zd Z		 	 	 	 	 	 dd	Z
y)ParserNc                 6    |i }|| _         || _        || _        y)a  A simple shallow parser using a Brill-based part-of-speech tagger.
        The given lexicon is a dictionary of known words and their part-of-speech tag.
        The given default tags are used for unknown words.
        Unknown words that start with a capital letter are tagged NNP (except for German).
        Unknown words that contain only digits and punctuation are tagged CD.
        The given language can be used to discern between
        Germanic and Romance languages for phrase chunking.
        N)r5  r5  r%  )r,   r5  r5  r%  s       r   r&  zParser.__init__  s#     ?G r/   c           	          t        t        |      |j                  dt              |j                  dt              |j                  dt
              d      S )z~Returns a list of sentences from the given string.
        Punctuation marks are separated from each word by a space.
        r   r   r   r   )r   r   r   r   )r  r   rT   r  ABBREVIATIONSreplacements)r,   r   rV   s      r   r  zParser.find_tokens  sE    
 K

=+> **_mDJJy,7
 	
r/   c           
          t        ||j                  d| j                        |j                  d| j                        |j                  d| j                        |j                  dd            S )zAnnotates the given list of tokens with part-of-speech tags.
        Returns a list of tokens, where each token is now a [word, tag]-list.
        r%  r5  r5  r  N)r%  r5  r5  r  )r7  rT   r%  r5  r5  r,   r  rV   s      r   r7  zParser.find_tags  sX    
 ZZ
DMM:JJy$,,7JJy$,,7

5$'
 	
r/   c           	      b    t        t        ||j                  d| j                                    S )zAnnotates the given list of tokens with chunk tags.
        Several tags can be added, for example chunk + preposition tags.
        r%  r%  )rV  rR  rT   r%  r_  s      r   rR  zParser.find_chunks  s*     !J)NO
 	
r/   c                     t        |      S )zGAnnotates the given list of tokens with prepositional noun phrase tags.)rV  r_  s      r   rV  zParser.find_prepositions  s     ((r/   c                     t        |      S )z<Annotates the given list of tokens with verb/predicate tags.)find_relationsr_  s      r   find_labelszParser.find_labels  s    f%%r/   c                 V    |D cg c]  }||d   j                         gz    c}S c c}w )z5Annotates the given list of tokens with word lemmata.r   r  )r,   r  rV   r   s       r   find_lemmatazParser.find_lemmata  s)    8>?uq)**???s   &c                    |r | j                   |fi |}t        |t        t        f      r5|D cg c])  }t        |t              xr |j                  d      xs |+ c}}t        |t              r.|j                  d      D cg c]  }|j                  d       c}}t        t        |            D ]  }	t        t        ||	               D ]1  }
t        ||	   |
   t              st        ||	   |
   |      ||	   |
<   3 |s|s|s|r | j                  ||	   fi |||	<   n||	   D cg c]  }|g c}||	<   |s|r | j                  ||	   fi |||	<   |r | j                  ||	   fi |||	<   |s | j                  ||	   fi |||	<    |j                  dd      r|j                  dd      r|S dg}|r|j                  d       |r|j!                  d	       |r|j                  d
       |r|j                  d       t        t        |            D ]v  }	t        t        ||	               D ]C  }
||	   |
   d   j#                  dd      ||	   |
   d<   dj%                  ||	   |
         ||	   |
<   E dj%                  ||	         ||	<   x dj%                  |      }t'        t)        |      ||j                  d| j*                              }|S c c}w c c}w c c}w )ap  Takes a string (sentences) and returns a tagged Unicode string (TaggedString).
        Sentences in the output are separated by newlines.
        With tokenize=True, punctuation is split from words and sentences are separated by 
.
        With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
        With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
        With relations=True, semantic role labels are parsed (SBJ, OBJ).
        With lemmata=True, word lemmata are parsed.
        Optional parameters are passed to
        the tokenizer, tagger, chunker, labeler and lemmatizer.
        r   r   collapseTr   Fr   r   )r	   r
   r   r   r   r8  r   r%  ra  )r  r   rm   r   r   r   r  r   r   r   r7  rR  re  rg  rT   ru   rw   r   r   TaggedStringr   r%  )r,   r   tokenizerN  chunks	relationsr  r   rV   r  r  r[  r   s                r   r  zParser.parse  s   ,    -f-Aa$'JKLQAz*;qwws|@q@LAa$'(wwt}5!5As1v 	9A3qt9% ?ad1gu-+AaDGX>AaDG? vg%t~~ad5f5!%&qT**!'t''!77!'t''!77!(t((188!#	9* zz*d+vzz'5/IH MM*+MM23MM*%MM'" s1v 	"A3qt9% ,qT!WQZ//Y?!Q
((1Q47+!Q, 88AaD>AaD		"
 IIaLFFVZZ
DMM%J
 e M5 +s   .KK*
K )Nr|   r   r   N)TTTFFr   )rf   rg   rh   r&  r  r7  rR  rV  re  rg  r  r*   r/   r   rY  rY    s@    !


)&@ Kr/   rY  r  c                       e Zd ZddZefdZy)rj  Nc                    |dg}t        |t              r$t        |d      r|j                  |j                  }}t        |t
              rY|D cg c]2  }|D cg c]"  }|D cg c]  }|j                  dd       c}$ c}}4 }}}}dj                  d |D              }t        j                  | |      }t        |      |_        ||_        |S c c}w c c}}w c c}}}w )zUnicode string with tags and language attributes.
        For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).
        r   rN  r8  r   r   c              3   L   K   | ]  }d j                  d |D                yw)r   c              3   >   K   | ]  }d j                  |        yw)r8  Nr   )r   r   s     r   r   z1TaggedString.__new__.<locals>.<genexpr>.<genexpr>6  s     'GE'Gs   Nrs  r   s     r   r   z'TaggedString.__new__.<locals>.<genexpr>6  s     XAsxx'GQ'GGXs   "$)	r   r   r  rN  r%  rm   r   r   r  )r,   r   rN  r%  r   r   r*  s          r   r  zTaggedString.__new__'  s    
 <8Dfc"wvv'>#[[&//(Dfd#    JKKU;!))C+;KF  YYXQWXXFKKf%d
 <Ks*   
C	C#C<CCCCc                 l   |t         k7  rt        j                  | |      S t        |       dk(  rg S t        j                  | d      D cg c]P  }|j                  d      D cg c]1  }|j                  d      D cg c]  }|j	                  dd       c}3 c}}R c}}}S c c}w c c}}w c c}}}w )zReturns a list of sentences, where each sentence is a list of tokens,
        where each token is a list of word + tags.
        r   r   r   r8  r   )TOKENSr   r   r   r   )r,   sepsentencer   r*  s        r   r   zTaggedString.split<  s     &=99T3''t9>I  IIdD1
 

  &^^C0 5:KK4DEq9c*E
 	
E
s*   
B/#B);B$B)B/$B))B/r:  )rf   rg   rh   r  ru  r   r*   r/   r   rj  rj  &  s    *  
r/   rj  c                   j    e Zd ZdZddZd Zed        Zed        Ze	dd       Z
d Zd Zdd
Zd Zy	)Spellingabcdefghijklmnopqrstuvwxyzc                     || _         y r)   r,  )r,   r  s     r   r&  zSpelling.__init__T  s	    
r/   c           	          t        | j                        D ]8  }|j                         }t        j	                  | |d   t        |d                : y )Nr   r   )r  r  r   r1   rP   rI  r8  s     r   r-   zSpelling.loadW  sC    tzz" 	4A	AT1Q4QqT3	4r/   c                     | j                   S r)   r,  r+   s    r   r  zSpelling.path\  r-  r/   c                     | j                   S r)   r/  r+   s    r   r%  zSpelling.language`  r0  r/   c                 @   i }t        j                  d|j                               D ]  }||v xr ||   dz   xs d||<    d t        |j	                               D        }dj                  |      }t        |d      }|j                  |       |j                          y)zCounts the words in the given string and saves the probabilities at the given path.
        This can be used to generate a new model for the Spelling() constructor.
        z[a-z]+r   c              3   0   K   | ]  \  }}| d |   yw)r   Nr*   rG  s      r   r   z!Spelling.train.<locals>.<genexpr>l  s     >1A3as>rI  r   r[  N)	r   r   rW  sortedrY   r   r  writeclose)r,   r   r  r4  r[  r  s         r   trainzSpelling.traind  s    
 Haggi0 	8AEz2eAhl7aE!H	8>u{{}(=>		% sO			r/   c                 H   t        t        |      dz         D cg c]  }|d| ||d f }}|D cg c]  \  }}|s	||dd z    c}}|D cg c](  \  }}t        |      dkD  s||d   z   |d   z   |dd z   * c}}|D cg c](  \  }}t        j                  D ]  }|s||z   |dd z    * c}}}|D cg c]%  \  }}t        j                  D ]  }||z   |dd z    ' c}}}f\  }}}	}
t	        ||z   |	z   |
z         S c c}w c c}}w c c}}w c c}}}w c c}}}w )z@Returns a set of words with edit distance 1 from the given word.r   Nr   r   )r  r   ry  ALPHAset)r,   r[  r  r   r  r  cdelete	transposer   rs   s              r   _edit1zSpelling._edit1r  s.    */s1vz):;A!BQ%12;;#(.41aAQ12Y.16EA#a&1*Q1X!_qu$E',JJtq!hnnJQUQqrU]J]J',EEtq!hnnEQUQqrU]E]E	.
*	7F 6I%/&899 <.EJEs-   D
D

D
D&DD#D<*Dc                 J     t         fd j                  |      D              S )z?Returns a set of words with edit distance 2 from the given wordc              3   X   K   | ]!  }j                  |      D ]  }|v s|  # y wr)   )r  )r   e1e2r,   s      r   r   z"Spelling._edit2.<locals>.<genexpr>  s,     W"RW2BRVJ2W2Ws   *	*)r  r  )r,   r[  s   ` r   _edit2zSpelling._edit2  s     WAWWWr/   Nc                 4     |g }t         fd|D              S )z8Returns the given list of words filtered by known words.c              3   ,   K   | ]  }|v s|  y wr)   r*   )r   r[  r,   s     r   r   z"Spelling._known.<locals>.<genexpr>  s     1qDy11s   	)r  )r,   r  s   ` r   _knownzSpelling._known  s    =E1e111r/   c                    t        |       dk(  r| j                          t        |      dk(  r|dfgS |t        v r|dfgS |t        j                  v r|dfgS |j                  dd      j                         r|dfgS | j                  |g      xsG | j                  | j                  |            xs% | j                  | j                  |            xs |g}|D cg c]  }| j                  |d      |f }}t        t        d |D              xs d      t        fd|D        d	
      }|j                         r&|D cg c]  \  }}|j                         |f }}}|S |D cg c]	  \  }}||f }}}|S c c}w c c}}w c c}}w )zReturn a list of (word, confidence) spelling corrections for the given word,
        based on the probability of known words with edit distance 1-2 from the given word.
        r   r   r   r   r   r  c              3   &   K   | ]	  \  }}|  y wr)   r*   )r   r  r   s      r   r   z#Spelling.suggest.<locals>.<genexpr>  s     2GAta2s   c              3   2   K   | ]  \  }}|z  |f  y wr)   r*   )r   r  r   r   s      r   r   z#Spelling.suggest.<locals>.<genexpr>  s     Ewq$a!eT]Es   T)reverse)r   r-   r  r   
whitespacer   isdigitr  r  r  rT   r"   r  r  r3  title)r,   r[  
candidatesr  r  r   r   s         @r   suggestzSpelling.suggest  s    t9>IIKq6Q;H:H:!!!H:99S"%%'H:KK {{4;;q>*{{4;;q>* s	 	 6@@txx3'+@
@#2z227a8E*EtT
99;;EF44::<+FJF  4>>44)>J> A G>s   E;F (F)r   )zspelling.txtr)   )rf   rg   rh   r  r&  r-   r1  r  r%  classmethodr  r  r  r  r  r*   r/   r   ry  ry  Q  sb    (E4
      :X2r/   ry  )r   )r   z;;;)r|   )NNNNNrn  r1  N)r1  )k__doc__r  r  r   r   r4   	itertoolsr   	xml.etreer   r   r   r   r  dirnameabspath__file__MODULESLASHWORDr   CHUNKPNPRELANCHORLEMMAr   r    r  encode_utf8r%   r1   r'   rm   rj   	UNIVERSALr   r   r   r   r   r   r   ADPr   r   r   r   r   r   r   compiler   r  r   r  r\  r   r   r   r   r   r  r]   escaper   r   r]  r   r  r  r  r3  r  r!  r  r  r  r#  r  r  r  r  r  r  r   r  r  r   r.  r7  r>  r|   r}   r~   r   rH  rs   rc   rR  rV  PTBPENNrY  ru  rj  ry  )r   r   r  s   000r   <module>r     s  
  	 	    !5\
WW__RWW__X67F	4 0tS%c65 7,t 7,t((t ((p 	M IdCdCsCtS$$B 	

: < ;k !$&(! (T 2::o&2::()2::	hh 	 'J K M TUZ[S O /0'	. :C9I9I9K 45UVPQEJJA.D			$.// rzz-#((<*@@A RZZ&
 

 
 _Z<h L
 
T5 Txrh rn RZZ$
RZZ>?
RZZ>?
D&x D&t  6 dIvBJJ./	-
'E 
'R' R'p RZZ)*4 
BN 	6 BJJ+,  	
    
	
 
zrzz)b.83b88CDE	zrzz*%&	zrzz+,-	K",s2R7(BRG&PQR	EBJ345)2 BJJ+,  	
     	 
  	
$ 
zrzz)b.83b88CbH8STU	zrzz*%&	zrzz+,-	K",s2R7(BRG&PQR	EBJ3451/1
j q	  F1IMM!$ % q	  F1IMM!$ %)Xz  dK Kf 
$
3 $
VUx U1Fj	 /s#   4R2 #S<R<S2R9<S