U
    /e}$                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZmZ dd Zdd Zdd ZdddZG dd deZdS )    )defaultdict)IntegralN)	is_scalar)partition_all)compute_as_if_collectiontokenize)methods)Accessor)categorical_dtypecategorical_dtype_dispatchis_categorical_dtype)AttributeNotImplementedErrorclear_known_categorieshas_known_categoriesc                 C   s   |   } | D ]N\}}t| | r:| | j|| |< qt| | |dd}| | || |< q|dk	rt| jr| j|}nt| j|dd}| jj|d}| jj|_|| _| S )zCategorize a dataframe with given categories

    df: DataFrame
    categories: dict mapping column name to iterable of categories
    F)meta
categoriesorderedN)dtype)	copyitemsr   catset_categoriesr
   Zastypeindexname)dfr   r   colvalsZ	cat_dtypeind r   >/tmp/pip-unpacked-wheel-dbjnr7gq/dask/dataframe/categorical.py_categorize_block   s$    
  
r    c                 C   sv   i }|D ]8}| | }t |r0||jj||< q|  ||< q|rnt | jr\|| jjfS || j  fS |d fS )N)r   Z_constructorr   r   dropnadrop_duplicatesr   )r   columnsr   resr   xr   r   r   _get_categories1   s    
r&   c                 C   s   t t}g }| D ]6}|d  D ]\}}|| | q ||d  qdd | D }|d d krn|d fS ||d |dd   fS )Nr      c                 S   s$   i | ]\}}|t j|d d qS )T)Zignore_index)r   concatr"   .0kvr   r   r   
<dictcomp>G   s    z'_get_categories_agg.<locals>.<dictcomp>)r   listr   appendr"   )partsr$   Zres_indpr+   r,   r   r   r   _get_categories_agg@   s    r2   c                    s  | j dkr$tddgjntr2gfddD dk	rxtjrdtj ndkrxjjt	kt
sdkr| S |dkrd}n*|dkr| j}nt|tr|dk rtd	t| |}d
|   fddt|  D }d| }| j}d}	||krt|t|	 }
tt|t|D ](\}}t fdd|D f||
|f< q2|d }|
 |	d7 }	q
t fddt|D f||df< || j t| j||dff|\}dd | D }| t|S )a0  Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    Nobjectcategoryc                    s(   g | ] }t  | r t | s|qS r   )r   r   )r*   c)r   r   r   
<listcomp>k   s    zcategorize.<locals>.<listcomp>F      z#split_every must be an integer >= 2zget-categories-chunk-c                    s"   i | ]\}} |ft |fqS r   )r&   )r*   ikey)ar#   r   r   r   r-      s    zcategorize.<locals>.<dictcomp>zget-categories-agg-r   c                    s   g | ]} |fqS r   r   r*   r9   r;   r   r   r6      s     r'   c                    s   g | ]} |fqS r   r   r<   r=   r   r   r6      s     c                 S   s   i | ]\}}||  qS r   )Zsort_valuesr)   r   r   r   r-      s      )_metar.   Zselect_dtypesr#   r   r   r   r   r   r3   lenZnpartitions
isinstancer   
ValueErrorr   	enumerateZ__dask_keys__strr   ranger2   updateZdaskr   	__class__r   map_partitionsr    )r   r#   r   Zsplit_everykwargstokenZdskprefixr+   depthbZpart_iZindsr   r   )r;   r#   r   r   r   
categorizeP   s^    



""  rM   c                   @   sd   e Zd ZdZdZdZdZedd Zdd Z	d	d
 Z
edd Zedd Zedd Zdd ZdS )CategoricalAccessora  
    Accessor object for categorical properties of the Series values.

    Examples
    --------
    >>> s.cat.categories  # doctest: +SKIP

    Notes
    -----
    Attributes that depend only on metadata are eager

    * categories
    * ordered

    Attributes depending on the entire dataset are lazy

    * codes
    * ...

    So `df.a.cat.categories` <=> `df.a._meta.cat.categories`
    So `df.a.cat.codes` <=> `df.a.map_partitions(lambda x: x.cat.codes)`
    r   )Zadd_categoriesZ
as_orderedZas_unorderedZremove_categoriesZrename_categoriesZreorder_categoriesr   r   c                 C   s
   t | jS )z&Whether the categories are fully known)r   _seriesselfr   r   r   known   s    zCategoricalAccessor.knownc                 K   s.   | j r| jS | d jf |}| |jS )aL  Ensure the categories in this series are known.

        If the categories are known, this is a no-op. If unknown, the
        categories are computed, and a new series with known categories is
        returned.

        Parameters
        ----------
        kwargs
            Keywords to pass on to the call to `compute`.
        r   )rR   rO   _property_mapuniquecomputer   values)rQ   rH   r   r   r   r   as_known   s    zCategoricalAccessor.as_knownc                 C   s&   | j s| jS | j }t|j|_|S )z0Ensure the categories in this series are unknown)rR   rO   r   r   r>   )rQ   outr   r   r   
as_unknown   s
    
zCategoricalAccessor.as_unknownc                 C   s   |  | jjddS )z3Whether the categories have an ordered relationshipr   r   )_delegate_propertyrO   r>   rP   r   r   r   r      s    zCategoricalAccessor.orderedc                 C   s$   | j sd}t|| | jjddS )zZThe categories of this categorical.

        If categories are unknown, an error is raisedz`df.column.cat.categories` with unknown categories is not supported.  Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriesr   r   )rR   r   rZ   rO   r>   rQ   msgr   r   r   r      s
    zCategoricalAccessor.categoriesc                 C   s   | j sd}t|| dS )zUThe codes of this categorical.

        If categories are unknown, an error is raisedz`df.column.cat.codes` with unknown categories is not supported.  Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriescodes)rR   r   rS   r[   r   r   r   r]      s
    zCategoricalAccessor.codesc              	   C   s   | j   }t| }t| j jtjr6| j j}n
| j jj	}|
|j\}}|dkr^| j S ||dk }|j||jd}| j j| jdddd|i|dd	S )
z
        Removes categories which are not used

        Notes
        -----
        This method requires a full scan of the data to compute the
        unique values, which can be expensive.
        N)r   r   r   r   new_categorieszcat-set_categories)r   rI   )rO   r!   rT   pdZIndexrU   r@   r>   ZCategoricalIndexr   Zreindexr   r   r   rG   Z_delegate_method)rQ   ZpresentZmeta_catr   maskr_   r   r   r   r   remove_unused_categories  s&    


z,CategoricalAccessor.remove_unused_categoriesN)__name__
__module____qualname____doc__Z_accessor_nameZ_accessor_methodsZ_accessor_propertiespropertyrR   rW   rY   r   r   r]   rb   r   r   r   r   rN      s   	



rN   )NNN)collectionsr   Znumbersr   Zpandasr`   Zpandas.api.typesr   Ztlzr   Z	dask.baser   r   Zdask.dataframer   Zdask.dataframe.accessorr	   Zdask.dataframe.dispatchr
   r   r   Zdask.dataframe.utilsr   r   r   r    r&   r2   rM   rN   r   r   r   r   <module>   s   
S