
    kh"                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
  G d dej                        Z G d	 d
e      Z G d de      Zd Zd Zy)zVarious noun phrase extractors.    N)BaseNPExtractor)requires_nltk_corpus)PatternTagger)filter_insignificanttree2strc                   (    e Zd Zd Zed        Zd Zy)ChunkParserc                     d| _         y NF_trainedselfs    U/opt/mcp/mcp-sentiment/venv/lib/python3.12/site-packages/textblob/en/np_extractors.py__init__zChunkParser.__init__   	        c                 z   t         j                  j                  j                  ddg      D cg c]9  }t         j                  j                  |      D cg c]
  \  }}}||f c}}}; }}}}}t        j                  |      }t        j                  ||      | _        d| _	        yc c}}}w c c}}}}w )z+Train the Chunker on the ConLL-2000 corpus.z	train.txtNP)chunk_typesbackoffTN)
nltkcorpus	conll2000chunked_sentschunktree2conlltagsUnigramTaggerBigramTaggertaggerr   )r   sent_tc
train_dataunigram_taggers          r   trainzChunkParser.train   s    
 --;;$ < 
 
 $(::#<#<T#BCC1aaVC

 
 ++J7''
NK D
s   (B5
B.)B5
.B5
c                    | j                   s| j                          |D cg c]  \  }}|	 }}}| j                  j                  |      }|D cg c]  \  }}|	 }}}t	        ||      D cg c]  \  \  }}}|||f }}}}t
        j                  j                  j                  |      S c c}}w c c}}w c c}}}w )z'Return the parse tree for the sentence.)	r   r(   r!   tagzipr   r   utilconlltags2tree)	r   sentencewordpospos_tagstagged_pos_tagschunktag	chunktags	conlltagss	            r   parsezChunkParser.parse   s    }}JJL+34KT3C44++//(35DE/3XE	E ,/x+C
 
'$h 3!
	 
 zz--i88 5E
s   B/B50B;N)__name__
__module____qualname__r   r   r(   r6    r   r   r	   r	      s      
 
9r   r	   c                   J    e Zd ZdZ e       ZddddddZg dZddZd	 Z	d
 Z
y)ConllExtractorzeA noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    NNPNNIJJ)r=   r=   )NNrA   )r>   rA   )r?   r?   )r?   rA   )DTCCzPRP$PRPNc                 6    |st               | _        y || _        y N)r	   parser)r   rG   s     r   r   zConllExtractor.__init__>   s    +1kmvr   c           
          t         j                  j                  |      }g }|D ]  }| j                  |      }|D cg c]  }t	        |t         j
                  j                        r`|j                         dk(  rMt        t        |            dk\  r6t        || j                        rt        t        || j                               }}|D cg c]  }t        |       }	}|j                  |	        |S c c}w c c}w )9Return a list of noun phrases (strings) for body of text.r      )cfg)r   tokenizesent_tokenize_parse_sentence
isinstancetreeTreelabellenr   	_is_matchCFG_normalize_tagsINSIGNIFICANT_SUFFIXESr   extend)
r   text	sentencesnoun_phrasesr.   parsedeachphrasesphrasenpss
             r   extractzConllExtractor.extractA   s    MM//5	! 	%H))(3F
 #dDIINN3JJLD(,T23q8d1   4T4;V;V WXG  3::8F#:C:$	%  ;s   BC6C;c                 n    | j                   j                  |      }| j                  j                  |      S )z4Tag and parse a sentence (a plain, untagged string).)
POS_TAGGERr*   rG   r6   )r   r.   taggeds      r   rN   zConllExtractor._parse_sentenceU   s+    $$X.{{  ((r   rF   )r7   r8   r9   __doc__r   rc   rU   rW   r   ra   rN   r:   r   r   r<   r<   *   s?     J C 9>()r   r<   c                   B    e Zd ZdZddddddZd Zed        Zd Zd	 Z	y
)FastNPExtractorzA fast and simple noun phrase extractor.

    Credit to Shlomi Babluk. Link to original blog post:

        http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    r=   r>   r?   r@   c                     d| _         y r   r   r   s    r   r   zFastNPExtractor.__init__l   r   r   c                     t         j                  j                  j                  d      }t        j                  g d      }t        j
                  ||      }t        j                  ||      | _        d| _        y )Nnews)
categories))z^-?[0-9]+(.[0-9]+)?$CD)z(-|:|;)$:)z\'*$MD)z(The|the|A|a|An|an)$AT)z.*able$r?   )z	^[A-Z].*$r=   )z.*ness$rA   )z.*ly$RB)z.*s$NNS)z.*ing$VBG)z.*ed$VBD)z.*rA   r   T)	r   r   browntagged_sentsRegexpTaggerr   r    r!   r   )r   r&   regexp_taggerr'   s       r   r(   zFastNPExtractor.traino   sf    [[&&33v3F
))
  ++JN''
NKr   c                 0    t        j                  |      }|S )z+Split the sentence into single words/tokens)r   word_tokenize)r   r.   tokenss      r   _tokenize_sentencez"FastNPExtractor._tokenize_sentence   s    ##H-r   c                 2   | j                   s| j                          | j                  |      }| j                  j	                  |      }t        |      }d}|rd}t        dt        |      dz
        D ]  }||   }||dz      }|d   |d   f}	| j                  j                  |	d      }
|
s9d}|j                  |       |j                  |       |d    d|d    }|
}|j                  |||f        n |r|D cg c]  }|d   dv s|d    }}|S c c}w )rI   TFr   rJ     r=   r>   )r   r(   r{   r!   r*   rV   rangerS   rU   getpopinsert)r   r.   rz   rd   tagsmergext1t2keyvaluematchr0   r$   matchess                  r   ra   zFastNPExtractor.extract   s(   }}JJL((2(v&E1c$i!m, !W!a%[eRUlS"- EHHQKHHQK!!ugQr!ug.ECKKE3<0   "&@A1)?1Q4@@ As   <D	DN)
r7   r8   r9   re   rU   r   r   r(   r{   ra   r:   r   r   rg   rg   [   sB     C  .
r   rg   c                     g }| D ]  \  }}|dk(  s|dk(  r|j                  |df       $|j                  d      r|j                  ||dd f       L|j                  d      r|j                  ||dd f       t|j                  ||f        |S )	zBNormalize the corpus tags.
    ("NN", "NN-PL", "NNS") -> "NN"
    zNP-TLr   r=   z-TLNS)appendendswith)r   retr/   r*   s       r   rV   rV      s     C 
 	c'>SD[JJe}%<<JJc#2h'(<<JJc#2h'(

D#;
  Jr   c                    t        |       }d}|rd}t        t        |      dz
        D ]v  }||   ||dz      }}|d   |d   f}|j                  |d      }|s/d}|j	                  |       |j	                  |       |d    d|d    }	|}
|j                  ||	|
f        n |rt        |D cg c]	  }|d   dv  c}      }	|	S c c}w )zFReturn whether or not a tagged phrases matches a context-free grammar.TFrJ   Nr   r~   r   )listr   rS   r   r   r   any)tagged_phraserK   copyr   ifirstsecondr   r   r   r0   r$   s               r   rT   rT      s    DE
s4y1}% 	A GT!a%[6E(F1I%CGGC&E 8*AfQi[1As|,	  6A1'67EL 7s   .C)re   r   textblob.baser   textblob.decoratorsr   textblob.taggersr   textblob.utilsr   r   ChunkParserIr	   r<   rg   rV   rT   r:   r   r   <module>r      sO    %  ) 4 * 99$## 9>.)_ .)bJo J`&r   