
    &thR.                    \   d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ  ej        dej                  Z ej        dej                  Z ej        dej                  Z ej        dej        ej        z            Z ej        dej        ej        z            Z ej        dej                  ZdZ	 	 	 d4d5dZd6d7d Zd8d9d#Z ej        d$ej                  Z d6d:d%Z!	 	 	 d;d<d'Z"	 d=d>d(Z#	 	 	 d?d@d+Z$	 	 	 dAdBd,Z%	 dCdDd.Z&	 	 	 dEdFd2Z'dGd3Z(dS )Hz(
Functions for dealing with markup text
    )annotationsN)Iterable)name2codepoint)MatchPattern)urljoin)
StrOrBytes)safe_url_string)
to_unicodezI&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)z<[a-zA-Z\/!].*?>z5<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']z}<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)z<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refreshz<((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))z 	
 Tutf-8textr	   keepIterable[str]remove_illegalboolencodingstrreturnc                f    dfd}t                               |t          | |                    S )u  Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    m
Match[str]r   r   c                   |                                  }d }|                    d          rt          |d         d          }n|                    d          rt          |d         d          }n|                    d          rm|d         }|                                v r|                     d          S t          j        |          p%t          j        |                                          }|Z	 d|cxk    rdk    r&n n#t          |f                              d	          S t          |          S # t          t          f$ r Y nw xY wr|                    d
          rdn|                     d          S )Ndec
   hex   namedr         cp1252	semicolon )	groupdictgetintlowergroupr   bytesdecodechr
ValueErrorOverflowError)r   groupsnumberentity_namer   r   s       ^/var/www/html/mycamper/aliexpress-site/backend/venv/lib/python3.11/site-packages/w3lib/html.pyconvert_entityz(replace_entities.<locals>.convert_entityH   s   ::e 
	++FFZZ 	++FFZZ   	 /K  ""d**wwqzz!#'44 8J!!##9 9F 
6))))T))))) &++228<<<6{{".    $O

;(?(?OrrQWWQZZOs   52D7 (D7 7E
Er   r   r   r   )_ent_resubr   )r   r   r   r   r2   s    ``  r1   replace_entitiesr6   '   sO    BP P P P P P P8 ;;~z$'A'ABBB    
str | Nonec                l    t          t                              t          | |                              S N)r   r4   searchr   )r   r   s     r1   has_entitiesr<   g   s&    z$99::;;;r7   r#   tokenc                T    t                               |t          | |                    S )ac  Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags('This text contains <a>some tag</a>')
    'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    ' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    )_tag_rer5   r   )r   r=   r   s      r1   replace_tagsr@   k   s"    ( ;;ujx88999r7   z<!--.*?(?:-->|$)c                X    t          | |          }t                              d|          S )zRemove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'
    >>>

    r#   )r   _REMOVECOMMENTS_REr5   )r   r   utexts      r1   remove_commentsrD      s)     tX&&E!!"e,,,r7   
which_onesc                   rrt          d          d D             d D             dfddfd}d}t          j        |t          j        t          j        z            }|                    |t          | |                    S )a;  Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    '<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    '<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    z#Cannot use both which_ones and keepc                6    h | ]}|                                 S r   r'   .0tags     r1   	<setcomp>zremove_tags.<locals>.<setcomp>   s     444##))++444r7   c                6    h | ]}|                                 S r   rH   rI   s     r1   rL   zremove_tags.<locals>.<setcomp>   s     (((CCIIKK(((r7   rK   r   r   r   c                @    |                                  } r| v S | vS r:   rH   )rK   r   rE   s    r1   will_removez remove_tags.<locals>.will_remove   s-    iikk 	%*$$$r7   r   r   c                r    |                      d          } |          rdn|                      d          S )N   r#   r   )r(   )r   rK   rO   s     r1   
remove_tagzremove_tags.<locals>.remove_tag   s5    ggajj [%%5rr1771::5r7   z</?([^ >/]+).*?>)rK   r   r   r   r3   )r,   recompileDOTALL
IGNORECASEr5   r   )r   rE   r   r   rR   regexretagsrO   s    ``    @r1   remove_tagsrY      s    b  @d @>???44444J((4(((D      6 6 6 6 6 6 EZry2=899F::j*T8"<"<===r7   c                    t          | |          }|rad                    d |D                       }t          j        |t          j        t          j        z            }|                    d|          }|S )a  Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    |c           	     &    g | ]}d | d| d| dS )<z\b.*?</z>|<z\s*/>r   rI   s     r1   
<listcomp>z,remove_tags_with_content.<locals>.<listcomp>   s4    SSS<c<<#<<#<<<SSSr7   r#   )r   joinrS   rT   rU   rV   r5   )r   rE   r   rC   tagsrX   s         r1   remove_tags_with_contentra      sm      tX&&E &xxSS
SSSTTD")bm";<<

2u%%Lr7   
	
replace_byc                x    t          | |          }|D ]&}|                    |t          ||                    }'|S )a$  Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    )r   replace)r   rE   rf   r   rC   ecs         r1   replace_escape_charsrj      sH      tX&&E D Db*Z"B"BCCLr7   c                    dd}t          | |          }d} ||t                    D ]E}t          |t                    r|t	          |||	          z  }-||                    d
          z  }F|S )a`  
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    txtr   patternPattern[str]r   Iterable[str | Match[str]]c              3     K   d}|                     |           D ],}|                    d          \  }}| ||         V  |V  |}-| |d          V  d S )Nr   rQ   )finditerspan)rl   rm   offsetmatchmatch_smatch_es         r1   _get_fragmentsz&unquote_markup.<locals>._get_fragments  s{      %%c** 	 	E$zz!}}GWfWn%%%%KKKFF&''lr7   r#   )r   r   cdata_d)rl   r   rm   rn   r   ro   )r   	_cdata_re
isinstancer   r6   r(   )r   r   r   r   rw   rC   ret_textfragments           r1   unquote_markupr}     s    "    tX&&EH"N5)44 2 2h$$ 	2(tN   HH
 y111HHOr7   baseurlc                    t          | |          }t                              |          x}r?t          t	          |          t	          |                    d          |                    S t	          |          S )zReturn the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    )r   rQ   )rD   _baseurl_rer;   r   r
   r(   )r   r~   r   rC   r   s        r1   get_base_urlr   0  sw     !999Eu%%%q 
G$$oaggajj8&T&T&T
 
 	
 7###r7   scriptnoscriptignore_tags%tuple[None, None] | tuple[float, str]c                   	 t          | |          }n# t          $ r t          |             w xY wt          ||          }t	          t          |                    }t                              |          pt                              |          x}rlt          |
                    d                    }t          |
                    d                              d          |          }t          ||          }||fS dS )aX  Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    r&   urlz "')NN)r   UnicodeDecodeErrorprintra   rD   r6   _meta_refresh_rer;   _meta_refresh_re2floatr(   r
   stripr   )r   r~   r   r   rC   r   intervalr   s           r1   get_meta_refreshr   B  s    4**   d %UK88E,U3344E##E**M.?.F.Fu.M.MMq ((aggenn226::HEEgs##}:s    .c                6    |                      t                    S )a  
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    )r   HTML5_WHITESPACE)r   s    r1   strip_html5_whitespacer   `  s     ::&'''r7   )r   Tr   )
r   r	   r   r   r   r   r   r   r   r   r:   )r   r	   r   r8   r   r   )r#   N)r   r	   r=   r   r   r8   r   r   )r   r	   r   r8   r   r   )r   r   N)
r   r	   rE   r   r   r   r   r8   r   r   )r   N)r   r	   rE   r   r   r8   r   r   )rb   r#   N)
r   r	   rE   r   rf   r	   r   r8   r   r   )r   TN)
r   r	   r   r   r   r   r   r8   r   r   )r#   r   )r   r	   r~   r	   r   r   r   r   )r#   r   r   )
r   r	   r~   r   r   r   r   r   r   r   )r   r   r   r   ))__doc__
__future__r   rS   collections.abcr   html.entitiesr   r   r   urllib.parser   w3lib._typesr	   	w3lib.urlr
   
w3lib.utilr   rT   rV   r4   rU   r?   Ir   r   r   ry   r   r6   r<   r@   rB   rD   rY   ra   rj   r}   r   r   r   r   r7   r1   <module>r      s    # " " " " " 				 $ $ $ $ $ $ ( ( ( ( ( (                     # # # # # # % % % % % % ! ! ! ! ! !
"*PM  "*(")
4
4bjQSUSWXX2: EI   BJ LI  
 BJCRY 	 ! 
 	=C =C =C =C =C@< < < < <: : : : :.  RZ 2BI>> - - - - -  !#	D> D> D> D> D>P NR    4 !3	    0 	% % % % %R AH$ $ $ $ $( !7	    <( ( ( ( ( (r7   