ó
Û³Y]c           @   sU   d  Z  d d l m Z d d l m Z d Z d Z d „  Z d e f d „  ƒ  YZ	 d	 S(
   sC   
A library written in CUDA Python for generating reduction kernels
iÿÿÿÿ(   t   division(   t
   from_dtypei    i   c            sÜ   d d l  m ‰  ˆ  j d t ƒ |  ƒ ‰ t d ‰ t t ‰ ˆ  j d t ƒ ‡  ‡ f d †  ƒ ‰ ˆ  j d t ƒ ‡  ‡ ‡ f d †  ƒ ‰ ˆ  j d t ƒ ‡  ‡ ‡ f d †  ƒ ‰ ‡  ‡ ‡ ‡ ‡ ‡ ‡ f d †  } ˆ  j | ƒ S(	   Niÿÿÿÿ(   t   cudat   devicei   c            s™   ˆ  j  j } | t } | t } |  | d d … f } | | | <t d } xH | r” | | k  r‡ | | } ˆ | | | | ƒ | | <n  | d } qM Wd S(   s8   
        Compute reduction within a single warp
        Ni   (   t	   threadIdxt   xt	   _WARPSIZE(   t   sm_partialst   initt   tidt   warpidt   laneidt   sm_thist   widtht   old(   R   t	   reduce_op(    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   inner_warp_reduction   s    



	
c            s  ˆ  j  j } ˆ  j j } ˆ  j j } ˆ  j j } | | | } |  j } | | }	 |  | }
 x1 t | |	 | |	 ƒ D] } ˆ |
 |  | ƒ }
 qr Wˆ  j ƒ  ˆ | |
 ƒ ˆ  j ƒ  | d k  rð ˆ | | d f | | d d f ƒ | | d f <n  | d k rˆ | d | d ƒ | | <n  d S(   sœ  
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm goes like:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (size of a threadblock).
        The threadblocks consumes the chunks in roundrobin scheduling.
        First, a threadblock loads a chunk into temp memory.  Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed.  Inner-block reduction is performed
        on the temp memory.  So that, there will just be one scalar result
        per block.  The result from each block is stored to `partials` at
        the dedicated slot.
        i   i    i   N(   i    i    (   i   i    (   R   R   t   blockIdxt   blockDimt   gridDimt   sizet   ranget   syncthreads(   t   arrt   partialsR   R	   t   blkidt   blkszt   gridszt   startt   stopt   stept   tmpt   i(   R   R   R   (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   device_reduce_full_block(   s$    	



$c            ss  ˆ  j  j } ˆ  j j } ˆ  j j } | t } | t } |  j } ˆ  j  j } |  | }	 |	 | | | f <ˆ  j ƒ  | d t | k  r• ˆ | |	 ƒ nh | d k rý | | d d … f }
 | t } x9 t d | | ƒ D]! } ˆ |
 d |
 | ƒ |
 d <qÕ Wn  ˆ  j ƒ  | d k ro| t d t } | d } x0 t d | ƒ D] } ˆ | | | d f ƒ } q?W| | | <n  d S(   sÂ   
        This computes reduction on `arr`.
        This device function must be used by 1 threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
        i   i    N(   i    i    (   R   R   R   R   R   R   R   R   (   R   R   R   R	   R   R   R
   R   R   t   valueR   t   baseR    t   num_active_warpst   result(   R   R   R   (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   device_reduce_partial_block^   s0    

	


"

c            s¡   ˆ  j  j } ˆ  j j t ˆ f d ˆ ƒ} ˆ  j j ˆ k rO ˆ |  | | ƒ n ˆ |  | | ƒ | r | d k r ˆ  j j d k r ˆ | d | ƒ | d <n  d S(   sI  
        Perform reductions on *arr* and writing out partial reduction result
        into *partials*.  The length of *partials* is determined by the
        number of threadblocks. The initial value is set with *init*.

        Launch config:

        Blocksize must be mutiple of warpsize and it is limited to 4 warps.
        t   dtypei    N(   R   R   t   sharedt   arrayt	   _NUMWARPSR   R   (   R   R   R   t   use_initR	   R   (   R   R!   R&   t   inner_sm_sizet   max_blocksizet   nbtypeR   (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   gpu_reduce_block_stridedŠ   s    
	$(   t   numbaR   t   jitt   TrueR   R*   (   t   fnR.   R/   (    (   R   R!   R&   R,   R   R-   R.   R   s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   _gpu_reduce_factory   s    

$'6',!t   Reducec           B   s5   e  Z i  Z d  „  Z d „  Z d d d d d „ Z RS(   c         C   s   | |  _  d S(   sí  Create a reduction object that reduces values using a given binary
        function. The binary function is compiled once and cached inside this
        object. Keeping this object alive will prevent re-compilation.

        :param binop: A function to be compiled as a CUDA device function that
                    will be used as the binary operation for reduction on a
                    CUDA device. Internally, it is compiled using
                    ``cuda.jit(device=True)``.
        N(   t   _functor(   t   selft   functor(    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   __init__¦   s    
c         C   sW   |  j  | f } | |  j k r. |  j | } n% t |  j  t | ƒ ƒ } | |  j | <| S(   N(   R6   t   _cacheR4   R   (   R7   R'   t   keyt   kernel(    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   _compile²   s    i    c         C   s½  d d l  m } | j d k r. t d ƒ ‚ n  | d	 k	 rG | |  } n  | j j | ƒ } | j d k  rl | S|  j | j ƒ } t	 t
 } | j | | }	 | j |	 }
 t |	 | t
 d ƒ } | } |
 rÖ | d 7} n  | j d | d | j ƒ } |	 r"| | | | f | |	  | |  | t ƒ n  |
 rT| d |
 | f | |	 | | | | ƒ n  | j d k r†| d | | f | | | t ƒ n  | d	 k	 r±| d  j | d  d | ƒd	 S| d
 Sd	 S(   s  Performs a full reduction.

        :param arr: A host or device array. If a device array is given, the
                    reduction is performed inplace and the values in the array
                    are overwritten. If a host array is given, it is copied to
                    the device automatically.
        :param size: Optional integer specifying the number of elements in
                    ``arr`` to reduce. If this parameter is not specified, the
                    entire array is reduced.
        :param res: Optional device array into which to write the reduction
                    result to. The result is written into the first element of
                    this array. If this parameter is specified, then no
                    communication of the reduction output takes place from the
                    device to the host.
        :param init: Optional initial value for the reduction, the type of which
                    must match ``arr.dtype``.
        :param stream: Optional CUDA stream in which to perform the reduction.
                    If no stream is specified, the default stream of 0 is
                    used.
        :return: If ``res`` is specified, ``None`` is returned. Otherwise, the
                result of the reduction is returned.
        iÿÿÿÿ(   R   i   s   only support 1D arrayi   t   shapeR'   t   streamNi    (   R0   R   t   ndimt	   TypeErrort   NoneR'   t   typeR   R=   R*   R   t   mint   device_arrayR2   t   Falset   copy_to_device(   R7   R   R   t   resR   R?   R   R<   t	   blocksizet	   size_fullt   size_partialt   full_blockctt   partials_sizeR   (    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   __call__»   sB    

#N(   t   __name__t
   __module__R:   R9   R=   RB   RN   (    (    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyR5   £   s   			N(
   t   __doc__t
   __future__R    t   numba.numpy_supportR   R   R*   R4   t   objectR5   (    (    (    s;   lib/python2.7/site-packages/numba/cuda/kernels/reduction.pyt   <module>   s   	–