��Yf?E�@sdZddlZddlZddlZddlmZdgZejd�Zejd�Z ejd�Z ejd�Z ejd �Z ejd �Z ejd �Zejd �Zejd �Zejdej�Zejd �Zejd�ZGdd�dej�ZdS)zA parser for HTML and XHTML.�N)�unescape� HTMLParserz[&<]z &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z <[a-zA-Z]�>z--\s*>z$([a-zA-Z][^ />]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c@sWeZdZdZd:Zdddd�Zdd �Zd d �Zd d �ZdZ dd�Z dd�Z dd�Z dd�Z dd�Zddd�Zdd�Zdd�Zd d!�Zd"d#�Zd$d%�Zd&d'�Zd(d)�Zd*d+�Zd,d-�Zd.d/�Zd0d1�Zd2d3�Zd4d5�Zd6d7�Zd8d9�ZdS);raEFind tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. �script�style�convert_charrefsTcCs||_|j�dS)z�Initialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. N)r�reset)�selfr�r �0/opt/alt/python35/lib64/python3.5/html/parser.py�__init__Ws zHTMLParser.__init__cCs8d|_d|_t|_d|_tjj|�dS)z1Reset this instance. Loses all unprocessed data.�z???N)�rawdata�lasttag�interesting_normal� interesting� cdata_elem� _markupbase� ParserBaser)r r r r r`s     zHTMLParser.resetcCs!|j||_|jd�dS)z�Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). rN)r�goahead)r �datar r r �feedhszHTMLParser.feedcCs|jd�dS)zHandle any buffered data.�N)r)r r r r �closeqszHTMLParser.closeNcCs|jS)z)Return full source of start tag: '<...>'.)�_HTMLParser__starttag_text)r r r r �get_starttag_textwszHTMLParser.get_starttag_textcCs2|j�|_tjd|jtj�|_dS)Nz </\s*%s\s*>)�lowerr�re�compile�Ir)r �elemr r r �set_cdata_mode{szHTMLParser.set_cdata_modecCst|_d|_dS)N)rrr)r r r r �clear_cdata_modes zHTMLParser.clear_cdata_modec Cs�|j}d}t|�}x||kr+|jr�|j r�|jd|�}|dkr�|jdt||d��}|dkr�tjd�j ||� r�P|}n:|j j ||�}|r�|j �}n|jr�P|}||kr<|jr%|j r%|j t |||���n|j |||��|j||�}||kr[P|j}|d|�rtj||�r�|j|�} n�|d|�r�|j|�} n�|d|�r�|j|�} nm|d|�r�|j|�} nL|d |�r|j|�} n+|d |krE|j d�|d } nP| dkr�|sYP|jd |d �} | dkr�|jd|d �} | dkr�|d } n | d 7} |jr�|j r�|j t ||| ���n|j ||| ��|j|| �}q|d |�r�tj||�}|r�|j�d d�} |j| �|j�} |d| d �s�| d } |j|| �}qq(d||d�kr�|j |||d ��|j||d �}Pq|d|�rtj||�}|rj|jd �} |j| �|j�} |d| d �sU| d } |j|| �}qtj||�}|r�|r�|j�||d�kr�|j�} | |kr�|} |j||d �}Pq(|d |kr|j d�|j||d �}q(Pqdstd��qW|r�||kr�|j r�|jr{|j r{|j t |||���n|j |||��|j||�}||d�|_dS)Nr�<�&�"z[\s;]z</z<!--z<?z<!rrz&#��;zinteresting.search() lied���)r�lenrr�find�rfind�maxrr�searchr�start� handle_datarZ updatepos� startswith� starttagopen�match�parse_starttag� parse_endtag� parse_comment�parse_pi�parse_html_declaration�charref�group�handle_charref�end� entityref�handle_entityref� incomplete�AssertionError) r r;r�i�n�jZampposr2r0�k�namer r r r�s�                        "    zHTMLParser.goaheadcCs�|j}|||d�dks/td��|||d�dkrV|j|�S|||d�dkr}|j|�S|||d�j�d kr�|jd |d�}|d kr�d S|j||d|��|d S|j|�SdS)Nr&z<!z+unexpected call to parse_html_declaration()�z<!--�z<![� z <!doctyperrr(r()rr?r5Zparse_marked_sectionrr*� handle_decl�parse_bogus_comment)r r@r�gtposr r r r7s &    z!HTMLParser.parse_html_declarationrcCs~|j}|||d�dks/td��|jd|d�}|dkrUd S|rv|j||d|��|dS) Nr&�<!�</z"unexpected call to parse_comment()rr)rKrLr(r()rr?r*�handle_comment)r r@Zreportr�posr r r rIs & zHTMLParser.parse_bogus_commentcCs�|j}|||d�dks/td��tj||d�}|sOdS|j�}|j||d|��|j�}|S)Nr&z<?zunexpected call to parse_pi()rr()rr?�picloser-r.� handle_pir;)r r@rr2rBr r r r6!s &  zHTMLParser.parse_picCs�d|_|j|�}|dkr(|S|j}|||�|_g}tj||d�}|srtd��|j�}|jd�j�|_ }x�||kr�t j||�}|s�P|jddd�\} } } | s�d} np| dd�dko| d d�knsO| dd�dkoJ| d d�knr_| dd �} | rqt | �} |j | j�| f�|j�}q�W|||�j �} | dkrC|j�\} }d |jkr| |jjd �} t|j�|jjd �}n|t|j�}|j|||��|S| jd �re|j||�n,|j||�||jkr�|j|�|S)Nrrz#unexpected call to parse_starttag()r&rF�'�"r�/>� r(r(r()rrS)r�check_for_whole_start_tagr�tagfind_tolerantr2r?r;r9rr�attrfind_tolerantr�append�stripZgetpos�countr)r+r/�endswith�handle_startendtag�handle_starttag�CDATA_CONTENT_ELEMENTSr!)r r@�endposr�attrsr2rC�tag�mZattrname�restZ attrvaluer;�lineno�offsetr r r r3-sR     00    zHTMLParser.parse_starttagcCs�|j}tj||�}|r�|j�}|||d�}|dkrU|dS|dkr�|jd|�r{|dS|jd|�r�d S||kr�|S|dS|dkr�d S|dkr�d S||kr�|S|dStd��dS) Nrr�/z/>r&r z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!r(r(r()r�locatestarttagend_tolerantr2r;r0r?)r r@rrbrB�nextr r r rU`s.        z$HTMLParser.check_for_whole_start_tagcCs�|j}|||d�dks/td��tj||d�}|sOdS|j�}tj||�}|s*|jdk r�|j|||��|St j||d�}|s�|||d�dkr�|dS|j |�S|j d�j �}|j d|j��}|j|�|dS|j d�j �}|jdk rx||jkrx|j|||��|S|j|j ��|j�|S) Nr&z</zunexpected call to parse_endtagrrFz</>rr()rr?� endendtagr-r;� endtagfindr2rr/rVrIr9rr*� handle_endtagr")r r@rr2rJZ namematchZtagnamer r r r r4�s8 &    zHTMLParser.parse_endtagcCs!|j||�|j|�dS)N)r]rk)r rar`r r r r\�szHTMLParser.handle_startendtagcCsdS)Nr )r rar`r r r r]�szHTMLParser.handle_starttagcCsdS)Nr )r rar r r rk�szHTMLParser.handle_endtagcCsdS)Nr )r rDr r r r:�szHTMLParser.handle_charrefcCsdS)Nr )r rDr r r r=�szHTMLParser.handle_entityrefcCsdS)Nr )r rr r r r/�szHTMLParser.handle_datacCsdS)Nr )r rr r r rM�szHTMLParser.handle_commentcCsdS)Nr )r Zdeclr r r rH�szHTMLParser.handle_declcCsdS)Nr )r rr r r rP�szHTMLParser.handle_picCsdS)Nr )r rr r r � unknown_decl�szHTMLParser.unknown_declcCs tjdtdd�t|�S)NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.� stacklevelr&)�warnings�warn�DeprecationWarningr)r �sr r r r�s  zHTMLParser.unescape)rr)�__name__� __module__� __qualname__�__doc__r^r rrrrrr!r"rr7rIr6r3rUr4r\r]rkr:r=r/rMrHrPrlrr r r r r?s8       z  3 " (          )rurrnrZhtmlr�__all__rrr>r<r8r1rOZ commentcloserVrW�VERBOSErgrirjrrr r r r �<module>s(