"""Various tokenizer implementations.

.. versionadded:: 0.4.0
"""
from itertools import chain

import nltk

from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus
from textblob.utils import strip_punc


class WordTokenizer(BaseTokenizer):
    """NLTK's recommended word tokenizer (currently the TreebankWordTokenizer).
    Uses regular expressions to tokenize text. Assumes text has already been
    segmented into sentences.

    Performs the following steps:

    * split standard contractions, e.g. don't -> do n't
    * split commas and single quotes
    * separate periods that appear at the end of line
    """

    def tokenize(self, text, include_punc=True):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as
            separate tokens. Defaults to True.
        """
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        else:
            # Strip punctuation from each token, keeping contraction
            # fragments that begin with an apostrophe intact,
            # e.g. "Can't" => ["Ca", "n't"] and "home." => ["home"];
            # tokens that are nothing but punctuation are dropped.
            return [
                word if word.startswith("'") else strip_punc(word, all=False)
                for word in tokens
                if strip_punc(word, all=False)
            ]
__module____qualname____doc__r        r   r   r      s    	r   r   c                        e Zd ZdZed        Zy)SentenceTokenizerzNLTK's sentence tokenizer (currently PunktSentenceTokenizer).
    Uses an unsupervised algorithm to build a model for abbreviation words,
    collocations, and words that start sentences,
    then uses that to find sentence boundaries.
    """

    @requires_nltk_corpus
    def tokenize(self, text):
        """Return a list of sentences."""
        return nltk.tokenize.sent_tokenize(text)


#: Convenience function for tokenizing sentences
sent_tokenize = SentenceTokenizer().itokenize

_word_tokenizer = WordTokenizer()  # Singleton word tokenizer
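

# Usage sketch (illustrative; sentence boundaries come from NLTK's punkt
# model, so output depends on the installed corpora). ``sent_tokenize`` is
# bound to the lazy ``itokenize`` method, hence the ``list`` call:
#
#     >>> list(sent_tokenize("Hello world. Goodbye!"))
#     ['Hello world.', 'Goodbye!']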


def word_tokenize(text, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text will be
    tokenized to sentences before being tokenized to words.
    """
    words = chain.from_iterable(
        _word_tokenizer.itokenize(sentence, *args, **kwargs)
        for sentence in sent_tokenize(text)
    )
    return words
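

# Usage sketch (illustrative; exact tokens depend on the installed NLTK
# data). Extra arguments such as ``include_punc`` are forwarded through
# ``itokenize`` to ``WordTokenizer.tokenize``, and the result is a lazy
# iterator over all sentences' tokens:
#
#     >>> list(word_tokenize("Hello world. Goodbye!"))
#     ['Hello', 'world', '.', 'Goodbye', '!']
#     >>> list(word_tokenize("Hello world. Goodbye!", include_punc=False))
#     ['Hello', 'world', 'Goodbye']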
   ' 4 % M  F
1 
1 "#--/
r   