U
    /eT-                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZ erxenejZddd	dd	d	efd
dZdddZdddZdS )    N)is_list_like	is_scalar)methods)PANDAS_GT_200)	DataFrameSeriesapply_concat_applymap_partitionshas_known_categories)M_Fc              
      s$  t  tjtjfr4tj f|||||||d|S d}	d}
t  trjt sXt|	t st|
nxt  tr|dkr j	dk
 rt|	 jjdgdj}nt fdd	|D st|	t fd
d	|D st|
 jjjdd }tj| j}t| f|||||||d|S )a
  
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.

    Parameters
    ----------
    data : Series, or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix.`
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not.  Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.

    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 2 graph layers
    >>> dd.get_dummies(s).compute()  # doctest: +ELLIPSIS
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    )prefix
prefix_sepdummy_nacolumnssparse
drop_firstdtypez`get_dummies` with non-categorical dtypes is not supported. Please use `df.categorize()` beforehand to convert to categorical dtype.z`get_dummies` with unknown categories is not supported. Please use `column.cat.as_known()` or `df.categorize()` beforehand to ensure known categoriesNobjectcategory)includec                 3   s   | ]}t  | V  qd S N)r   is_categorical_dtype.0cdata :/tmp/pip-unpacked-wheel-dbjnr7gq/dask/dataframe/reshape.py	<genexpr>   s     zget_dummies.<locals>.<genexpr>c                 3   s   | ]}t  | V  qd S r   r
   r   r   r   r    r!      s     .r   )
isinstancepdr   r   get_dummiesr   r   NotImplementedErrorr   dtypesany_metaZselect_dtypesr   all	__class__
__module__splitsysmodulesr	   )r   r   r   r   r   r   r   r   kwargsZnot_cat_msgZunknown_cat_msgpackage_nameZdummiesr   r   r    r%      s`    Z	




r%   meanc                 C   sj  t |r|dkrtdt |r(|dkr0tdt| | sFtdt| | sZtdt|rttdd |D st |stdd	d
dddg}t |r||krtdddd |D  tj	| | j
j|d}t |r|}ntjjt||fd|gd}|dkrzt |r6tj|| | jt| j| d}nBtj|t| j| d}|D ]"}	||	 | | j|	 ||	< qTntj|tjt| j| d}|||d}
|dkrt| gtjtj|d|
d}|dkrt| gtjtj|d|
d}|d
kr|S |dkr|S |d	kr|| S |dkr>t| gtjtj|d|
dS |dkrbt| gtjtj|d|
dS tdS )a  
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, and ``aggfunc`` must be all scalar.
    ``values`` can be scalar or list-like.

    Parameters
    ----------
    df : DataFrame
    index : scalar
        column to be index
    columns : scalar
        column to be columns
    values : scalar or list(scalar)
        column(s) to aggregate
    aggfunc : {'mean', 'sum', 'count', 'first', 'last'}, default 'mean'

    Returns
    -------
    table : DataFrame

    See Also
    --------
    pandas.DataFrame.pivot_table
    Nz.'index' must be the name of an existing columnz0'columns' must be the name of an existing columnz 'columns' must be category dtypezs'columns' must have known categories. Please use `df[columns].cat.as_known()` beforehand to ensure known categoriesc                 S   s   g | ]}t |qS r   )r   )r   vr   r   r    
<listcomp>   s     zpivot_table.<locals>.<listcomp>z4'values' must refer to an existing column or columnsr2   sumcountfirstlastzaggfunc must be either z, c                 s   s   | ]}d | d V  qdS )'Nr   )r   xr   r   r    r!      s     zpivot_table.<locals>.<genexpr>)name)names)r7   r8   )r   r   index)r   r=   )r=   r   values)r5   r2   Zpivot_table_sum)chunkZ	aggregatemetatokenZchunk_kwargs)r6   r2   Zpivot_table_countZpivot_table_firstZpivot_table_last)r   
ValueErrorr   r   r   r   r*   joinr$   ZCategoricalIndexcat
categoriesZ
MultiIndexZfrom_productsortedr   r   ZIndexr)   Zastyper'   npZfloat64r   Z	pivot_sumZ	pivot_aggZpivot_countZpivot_firstZpivot_agg_firstZ
pivot_lastZpivot_agg_last)Zdfr=   r   r>   ZaggfuncZavailable_aggfuncsZcolumns_contentsZnew_columnsr@   Z	value_colr0   Zpv_sumZpv_countr   r   r    pivot_table   s    
 

"  
	
	




	rH   valuec              
   C   s(   ddl m} | jtj||||||ddS )a  
    Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set.

    This function is useful to massage a DataFrame into a format where one or more columns are identifier variables
    (``id_vars``), while all other columns, considered measured variables (``value_vars``), are "unpivoted" to the row
    axis, leaving just two non-identifier columns, 'variable' and 'value'.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or string, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    pandas.DataFrame.melt
    r   )
no_defaultmelt)r@   id_vars
value_varsvar_name
value_name	col_levelrA   )dask.dataframe.corerJ   r	   r   rK   )framerL   rM   rN   rO   rP   rJ   r   r   r    rK   <  s    )rK   )NNNr2   )NNNrI   N)r.   ZnumpyrG   Zpandasr$   Zpandas.api.typesr   r   Zdask.dataframer   Zdask.dataframe._compatr   rQ   r   r   r   r	   Zdask.dataframe.utilsr   Z
dask.utilsr   boolZuint8Z_get_dummies_dtype_defaultr%   rH   rK   r   r   r   r    <module>   s4   
 
      