B � f�"�@s\dZddlZddlZddlZdgZe�dd�ZGdd�d�ZGdd�d�Z Gd d �d �Z dS) a% robotparser.py Copyright (C) 2000 Bastian Kleineidam You can choose between two licenses when using this package: 1) GNU GPLv2 2) PSF license for Python 2.2 The robots.txt Exclusion Protocol is implemented as specified in http://www.robotstxt.org/norobots-rfc.txt �N�RobotFileParser� RequestRatezrequests secondsc@sjeZdZdZddd�Zdd�Zdd�Zd d �Zd d �Zd d�Z dd�Z dd�Z dd�Z dd�Z dd�ZdS)rzs This class provides a set of methods to read, parse and answer questions about a single robots.txt file. �cCs,g|_d|_d|_d|_|�|�d|_dS)NFr)�entries� default_entry� disallow_all� allow_all�set_url� last_checked)�self�url�r �7/opt/alt/python37/lib64/python3.7/urllib/robotparser.py�__init__s  zRobotFileParser.__init__cCs|jS)z�Returns the time the robots.txt file was last fetched. This is useful for long-running web spiders that need to check for new robots.txt files periodically. )r )r r r r�mtime$szRobotFileParser.mtimecCsddl}|��|_dS)zYSets the time the robots.txt file was last fetched to the current time. rN)�timer )r rr r r�modified-szRobotFileParser.modifiedcCs&||_tj�|�dd�\|_|_dS)z,Sets the URL referring to a robots.txt file.��N)r �urllib�parse�urlparseZhost�path)r r r r rr 5szRobotFileParser.set_urlc Cs�ytj�|j�}WnRtjjk rd}z0|jdkr:d|_n|jdkrT|jdkrTd|_Wdd}~XYnX|� �}|� |� d�� ��dS)z4Reads the robots.txt URL and feeds it to the parser.)i�i�Ti�i�Nzutf-8) rZrequestZurlopenr �errorZ HTTPError�coderr�readr�decode� splitlines)r �f�err�rawr r rr:s zRobotFileParser.readcCs,d|jkr|jdkr(||_n |j�|�dS)N�*)� useragentsrr�append)r �entryr r r� _add_entryGs  zRobotFileParser._add_entrycCs6d}t�}|���x|D�]�}|sT|dkr8t�}d}n|dkrT|�|�t�}d}|�d�}|dkrr|d|�}|��}|s�q|�dd�}t|�dkr|d����|d<tj � |d���|d<|ddk�r|dkr�|�|�t�}|j � |d�d}q|ddk�r4|dk�r|j � t|dd ��d}q|dd k�rh|dk�r|j � t|dd ��d}q|dd k�r�|dk�r|d�����r�t|d�|_d}q|dd kr|dkr|d�d�}t|�dk�r|d�����r|d�����rtt|d�t|d��|_d}qW|dk�r2|�|�dS)z�Parse the input lines from a robots.txt file. We allow that a user-agent: line is not preceded by one or more blank lines. rr��#N�:z user-agentZdisallowFZallowTz crawl-delayz request-rate�/)�Entryrr%�find�strip�split�len�lowerrr�unquoter"r#� rulelines�RuleLine�isdigit�int�delayr�req_rate)r �lines�stater$�line�iZnumbersr r rrPsd             zRobotFileParser.parsecCs�|jr dS|jrdS|jsdStj�tj�|��}tj�dd|j|j |j |j f�}tj� |�}|sfd}x"|j D]}|�|�rn|�|�SqnW|jr�|j�|�SdS)z=using the parsed robots.txt decide if useragent can fetch urlFTrr))rrr rrrr0� urlunparserZparamsZqueryZfragment�quoter� applies_to� allowancer)r � useragentr Z parsed_urlr$r r r� can_fetch�s$    zRobotFileParser.can_fetchcCs>|��s dSx|jD]}|�|�r|jSqW|jr:|jjSdS)N)rrr=r5r)r r?r$r r r� crawl_delay�s   zRobotFileParser.crawl_delaycCs>|��s dSx|jD]}|�|�r|jSqW|jr:|jjSdS)N)rrr=r6r)r r?r$r r r� request_rate�s   zRobotFileParser.request_ratecCs0|j}|jdk r||jg}d�tt|��dS)N� )rr�join�map�str)r rr r r�__str__�s  zRobotFileParser.__str__N)r)�__name__� __module__� __qualname__�__doc__rrrr rr%rr@rArBrGr r r rrs    C  c@s(eZdZdZdd�Zdd�Zdd�ZdS) r2zoA rule line is a single "Allow:" (allowance==True) or "Disallow:" (allowance==False) followed by a path.cCs<|dkr|sd}tj�tj�|��}tj�|�|_||_dS)NrT)rrr;rr<rr>)r rr>r r rr�s  zRuleLine.__init__cCs|jdkp|�|j�S)Nr!)r� startswith)r �filenamer r rr=�szRuleLine.applies_tocCs|jr dndd|jS)NZAllowZDisallowz: )r>r)r r r rrG�szRuleLine.__str__N)rHrIrJrKrr=rGr r r rr2�sr2c@s0eZdZdZdd�Zdd�Zdd�Zdd �Zd S) r*z?An entry has one or more user-agents and zero or more rulelinescCsg|_g|_d|_d|_dS)N)r"r1r5r6)r r r rr�szEntry.__init__cCs�g}x|jD]}|�d|���q W|jdk r@|�d|j���|jdk rj|j}|�d|j�d|j���|�tt|j ��|�d�d� |�S)Nz User-agent: z Crawl-delay: zRequest-rate: r)rrC) r"r#r5r6ZrequestsZseconds�extendrErFr1rD)r Zret�agentZrater r rrG�s    z Entry.__str__cCsF|�d�d��}x.|jD]$}|dkr*dS|��}||krdSqWdS)z2check if this entry applies to the specified agentr)rr!TF)r-r/r")r r?rOr r rr=�s zEntry.applies_tocCs$x|jD]}|�|�r|jSqWdS)zZPreconditions: - our agent applies to this entry - filename is URL decodedT)r1r=r>)r rMr9r r rr>�s   zEntry.allowanceN)rHrIrJrKrrGr=r>r r r rr*�s   r*) rK� collectionsZ urllib.parserZurllib.request�__all__� namedtuplerrr2r*r r r r�<module> s 6