3
tY]"                 @   s\   d Z ddlZddlZddlZdgZejddZG dd dZG dd dZ	G d	d
 d
Z
dS )a%   robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
    NRobotFileParserRequestRatezrequests secondsc               @   sj   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd ZdS )r   zs This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

     c             C   s,   g | _ d | _d| _d| _| j| d| _d S )NFr   )entriesdefault_entrydisallow_all	allow_allset_urllast_checked)selfurl r   #lib/python3.6/urllib/robotparser.py__init__   s    
zRobotFileParser.__init__c             C   s   | j S )zReturns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        )r
   )r   r   r   r   mtime$   s    zRobotFileParser.mtimec             C   s   ddl }|j  | _dS )zYSets the time the robots.txt file was last fetched to the
        current time.

        r   N)timer
   )r   r   r   r   r   modified-   s    zRobotFileParser.modifiedc             C   s&   || _ tjj|dd \| _| _dS )z,Sets the URL referring to a robots.txt file.      N)r   urllibparseurlparsehostpath)r   r   r   r   r   r	   5   s    zRobotFileParser.set_urlc             C   s   yt jj| j}W nR t jjk
rd } z2|jdkr:d| _n|jdkrT|jdk rTd| _W Y dd}~X nX |j	 }| j
|jdj  dS )	z4Reads the robots.txt URL and feeds it to the parser.    Ti  i  Nzutf-8)r   r   )r   ZrequestZurlopenr   errorZ	HTTPErrorcoder   r   readr   decode
splitlines)r   ferrrawr   r   r   r   :   s    
zRobotFileParser.readc             C   s,   d|j kr| jd kr(|| _n| jj| d S )N*)
useragentsr   r   append)r   entryr   r   r   
_add_entryG   s    

zRobotFileParser._add_entryc             C   s6  d}t  }| j  x|D ]}|sT|dkr8t  }d}n|dkrT| j| t  }d}|jd}|dkrr|d| }|j }|sq|jdd}t|dkr|d j j |d< tj	j
|d j |d< |d dkr |dkr| j| t  }|jj|d  d}q|d dkr4|dkr|jjt|d d	 d}q|d d
krh|dkr|jjt|d d d}q|d dkr|dkr|d j j rt|d |_d}q|d dkr|dkr|d jd}t|dkr|d j j r|d j j rtt|d t|d |_d}qW |dkr2| j| dS )zParse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        r   r      #N:z
user-agentZdisallowFZallowTzcrawl-delayzrequest-rate/)Entryr   r(   findstripsplitlenlowerr   r   unquoter%   r&   	rulelinesRuleLineisdigitintdelayr   req_rate)r   linesstater'   lineiZnumbersr   r   r   r   P   sd    






 
zRobotFileParser.parsec             C   s   | j r
dS | jrdS | jsdS tjjtjj|}tjjdd|j|j	|j
|jf}tjj|}|sfd}x"| jD ]}|j|rn|j|S qnW | jr| jj|S dS )z=using the parsed robots.txt decide if useragent can fetch urlFTr   r,   )r   r   r
   r   r   r   r3   
urlunparser   ZparamsZqueryZfragmentquoter   
applies_to	allowancer   )r   	useragentr   Z
parsed_urlr'   r   r   r   	can_fetch   s$    
zRobotFileParser.can_fetchc             C   s4   | j  sd S x| jD ]}|j|r|jS qW | jjS )N)r   r   r@   r8   r   )r   rB   r'   r   r   r   crawl_delay   s    

zRobotFileParser.crawl_delayc             C   s4   | j  sd S x| jD ]}|j|r|jS qW | jjS )N)r   r   r@   r9   r   )r   rB   r'   r   r   r   request_rate   s    

zRobotFileParser.request_ratec             C   s0   | j }| jd k	r|| jg }djtt|d S )N
)r   r   joinmapstr)r   r   r   r   r   __str__   s    
zRobotFileParser.__str__N)r   )__name__
__module____qualname____doc__r   r   r   r	   r   r(   r   rC   rD   rE   rJ   r   r   r   r   r      s   
		Cc               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r5   zoA rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.c             C   s>   |dkr| rd}t jjt jj|}t jj|| _|| _d S )Nr   T)r   r   r>   r   r?   r   rA   )r   r   rA   r   r   r   r      s
    zRuleLine.__init__c             C   s   | j dkp|j| j S )Nr$   )r   
startswith)r   filenamer   r   r   r@      s    zRuleLine.applies_toc             C   s   | j r
dndd | j S )NZAllowZDisallowz: )rA   r   )r   r   r   r   rJ      s    zRuleLine.__str__N)rK   rL   rM   rN   r   r@   rJ   r   r   r   r   r5      s   r5   c               @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )r-   z?An entry has one or more user-agents and zero or more rulelinesc             C   s   g | _ g | _d | _d | _d S )N)r%   r4   r8   r9   )r   r   r   r   r      s    zEntry.__init__c             C   s   g }x| j D ]}|jd|  qW | jd k	r@|jd| j  | jd k	rj| j}|jd|j d|j  |jtt| j	 |jd dj
|S )NzUser-agent: zCrawl-delay: zRequest-rate: r,   r   rF   )r%   r&   r8   r9   ZrequestsZsecondsextendrH   rI   r4   rG   )r   ZretagentZrater   r   r   rJ      s    


zEntry.__str__c             C   sF   |j dd j }x.| jD ]$}|dkr*dS |j }||krdS qW dS )z2check if this entry applies to the specified agentr,   r   r$   TF)r0   r2   r%   )r   rB   rR   r   r   r   r@      s    zEntry.applies_toc             C   s$   x| j D ]}|j|r|jS qW dS )zZPreconditions:
        - our agent applies to this entry
        - filename is URL decodedT)r4   r@   rA   )r   rP   r<   r   r   r   rA      s    

zEntry.allowanceN)rK   rL   rM   rN   r   rJ   r@   rA   r   r   r   r   r-      s
   r-   )rN   collectionsZurllib.parser   Zurllib.request__all__
namedtupler   r   r5   r-   r   r   r   r   <module>   s    2