U
    /ea                    @  s  d dl mZ d dlZd dlZd dlmZmZmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlZd dlZd d	lmZmZmZmZ d d
l m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZAmBZBmCZCmDZD d dlEmFZFmGZGmHZH d dlImJZJmKZK d dlLmMZMmNZNmOZOmPZP d dlQmRZR d dlSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZgmhZhmiZi d dljmkZk d dllmmZm d dlnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZ d dlmZ e5de5d ZdZeBrdnd Zed!d" d#d$ Zdd%d&Zd'd( ZG d)d* d*e2eqZdd+d,ZG d-d. d.e2eqZd/d0 ZG d1d2 d2eZG d3d4 d4eZG d5d6 d6eZejejejejejejejejejejejejejejejejejejejfD ]Zee ee qd7D ]6ZeejeZeee eejeZeee qd8D ]6ZeejeZeee eejeZeee qd9d: Zedd d;d<d=Zd>d? Zd@dA ZddBdCZdDdE ZdFdG ZddHdIZe\dddedddddddddd"fdJdKZeZddLdMZd"dNdOdPZe\ed d d dQdRdSZ/dTdU ZdVdW ZdXdY ZdZd[ Zd\d] Zdd_d`ZddadbZddcddZddedfZddhdiZddjdkZddldmZdndo ZddpdqZddrdsZdtdu Zdvdw ZddxdyZdzd{ Zd|d} Zd~d ZdddZdddZdddZdddZdddZdddZdddZdddZdd Zdd Zdd Zdd Ze
ejڃdddZe
ejۃdddZeedr4e
ej݃dd Zdd Zdd ZdddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )    )annotationsN)HashableIteratorSequence)partialwraps)IntegralNumbergetitem)pformat)AnyCallableClassVarLiteralMapping)is_bool_dtypeis_datetime64_any_dtypeis_numeric_dtypeis_timedelta64_dtype)firstmergepartition_allremoveunique)core)Arraynormalize_argmap_partitions)DaskMethodsMixindont_optimizeis_dask_collectionnamed_schedulerstokenize)	BlockwiseBlockwiseDepBlockwiseDepDict	blockwise)globalmethod)methods)PANDAS_GT_140PANDAS_GT_150PANDAS_GT_200check_numeric_only_deprecation)CachedAccessorDatetimeAccessorStringAccessor)CategoricalAccessor
categorize)get_parallel_typegroup_split_dispatchhash_object_dispatchmeta_nonempty)optimize)PANDAS_GT_110PANDAS_GT_120AttributeNotImplementedErrorcheck_matching_columnsclear_known_categoriesdrop_by_shallow_copyhas_known_categoriesindex_summaryinsert_meta_param_descriptionis_categorical_dtypeis_dataframe_likeis_index_likeis_series_like	make_metameta_frame_constructormeta_series_constructorraise_on_meta_errorvalid_divisions)Delayeddelayedunpack_collections)HighLevelGraph)DataFrameTreeReduction)IndexCallableMOperatorMethodMixin_deprecatedapplyderived_fromfuncnamehas_keywordis_arraylikeiter_chunks	key_splitmaybe_pluralizememory_reprparse_bytespartial_by_orderpseudorandom	put_linesrandom_state_datatypename)get_templatethreadssync__no_default__Tzcompute.use_numexprFc                   s   t   fdd}|S )z6Decorator for methods that accept a numeric_only kwargc                   s>   | ddkrtdn| ddkr.|  }  | f||S )Nnumeric_onlyFz0'numeric_only=False' is not implemented in Dask.T)getNotImplementedError_get_numeric_data)selfargskwargsfunc 7/tmp/pip-unpacked-wheel-dbjnr7gq/dask/dataframe/core.pywrapperm   s    z_numeric_only.<locals>.wrapper)r   )ro   rr   rp   rn   rq   _numeric_onlyj   s    
rs   c                 C  s   | s| S t tt| tjr*tj| S t| d s\zt	
| W S  tk
rZ   |  Y S X dd | D }|sv| d S tj|d|dS )Nr   c                 S  s   g | ]}t |r|qS rp   len.0irp   rp   rq   
<listcomp>   s      z_concat.<locals>.<listcomp>T)uniformignore_index)
isinstancer   r   flattennpndarraydaZconcatenate3has_parallel_typepdSeries	Exceptionr*   concat)rl   r{   args2rp   rp   rq   _concat{   s    

r   c                 C  s   t | S N)r   )resultsrp   rp   rq   finalize   s    r   c                   @  s   e Zd ZdZd4ddZdd Zdd Zd	d
 Zdd Ze	e
dedZeeZdd Zdd ZddddZedd Zedd Zdd Zedd Zdd Zd d! Zed"d# Zd$d% Zd&d' Zd(d) Zed*d+ Zed,d- Z ed5d/d0Z!d6d2d3Z"dS )7Scalarz*A Dask object to represent a pandas scalarNc                 C  sz   t |tstj||g d}|| _|| _tjdd| _t|| jd}t	|sZt
|sZt|rptdtt| || _d S )Ndependenciesfloat64dtypeparent_metaz%Expected meta to specify scalar, got )r|   rN   from_collectionsdask_namer   r   _parent_metarF   rC   rE   rD   	TypeErrorrb   type_metark   dsknamemeta	divisionsrp   rp   rq   __init__   s    
zScalar.__init__c                 C  s   | j S r   r   rk   rp   rp   rq   __dask_graph__   s    zScalar.__dask_graph__c                 C  s   | j gS r   keyr   rp   rp   rq   __dask_keys__   s    zScalar.__dask_keys__c                 C  s   | j S r   r   r   rp   rp   rq   __dask_tokenize__   s    zScalar.__dask_tokenize__c                 C  s   | j fS r   r   r   rp   rp   rq   __dask_layers__   s    zScalar.__dask_layers__dataframe_optimizer   Zfalseyc                 C  s   t dfS Nrp   )r   r   rp   rp   rq   __dask_postcompute__   s    zScalar.__dask_postcompute__c                 C  s
   | j dfS r   _rebuildr   rp   rp   rq   __dask_postpersist__   s    zScalar.__dask_postpersist__renamec                C  s(   | j }|r|||}t||| j| jS r   )r   rh   r   r   r   rk   r   r   r   rp   rp   rq   r      s    zScalar._rebuildc                 C  s   | j S r   r   r   rp   rp   rq   _meta_nonempty   s    zScalar._meta_nonemptyc                 C  s   | j jS r   r   r   r   rp   rp   rq   r      s    zScalar.dtypec                 C  s:   t tt| }|| j t| jds2|d t|S )Nr   	setdirr   update__dict__hasattrr   r   listrk   orp   rp   rq   __dir__   s
    
zScalar.__dir__c                 C  s   dS )z6Dummy divisions to be compat with Series and DataFrameNNrp   r   rp   rp   rq   r      s    zScalar.divisionsc                 C  s`   t | jdk r| jn| jd d d }t| jdr@d| jj }ndt| jj }d| | dS )	N
      ...r   z
, dtype=%sz	, type=%sz
dd.Scalar<>)ru   r   r   r   r   r   __name__)rk   r   extrarp   rp   rq   __repr__   s
    &zScalar.__repr__c                 C  s   t |  S r   )r~   asarraycomputer   rp   rp   rq   	__array__   s    zScalar.__array__c                 C  s   | j | j| jfS r   r   r   r   r   rp   rp   rq   _args   s    zScalar._argsc                 C  s   | j S r   r   r   rp   rp   rq   __getstate__   s    zScalar.__getstate__c                 C  s   |\| _ | _| _d S r   r   rk   staterp   rp   rq   __setstate__   s    zScalar.__setstate__c                 C  s   t d|  dd S )NzTrying to convert a   to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement.r   r   rp   rp   rq   __bool__   s    
zScalar.__bool__c                 C  s
   | j dfS Nr   r   r   rp   rp   rq   r      s    z
Scalar.keyc                   s    fdd}|S )Nc                   sR   t  d t|  }|df | jdffi} | j}tj||| gd}t|||S )N-r   r   )rV   r$   r   r   rN   r   r   )rk   r   r   r   graphoprp   rq   f   s
    
z%Scalar._get_unary_operator.<locals>.frp   )clsr   r   rp   r   rq   _get_unary_operator   s    zScalar._get_unary_operatorFc                   s    fddS )Nc                   s   t | | dS )N)inv)_scalar_binaryrk   otherr   r   rp   rq   <lambda>
      z-Scalar._get_binary_operator.<locals>.<lambda>rp   r   r   r   rp   r   rq   _get_binary_operator  s    zScalar._get_binary_operatorTc                 C  sR   |   }|  d }|rB| ||  }d| j }tj||dd}t| j||dS )zConvert into a ``dask.delayed`` object.

        Parameters
        ----------
        optimize_graph : bool, optional
            If True [default], the graph is optimized before converting into
            ``dask.delayed`` objects.
        r   delayed-rp   r   layer)	r   r   __dask_optimize__r   r   rN   r   rK   r   )rk   optimize_graphr   r   rp   rp   rq   
to_delayed  s    	
zScalar.to_delayed)N)F)T)#r   
__module____qualname____doc__r   r   r   r   r   r)   r8   r!   r   staticmethodDEFAULT_GET__dask_scheduler__r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   classmethodr   r   r   rp   rp   rp   rq   r      sH   
  



	


r   c                 C  s   t |  dt|| }|g}i }t|}t|trJ|| |jdf}nt|rVtS |}|rn| ||jdffn| |jdf|f||df< t	||j
d}	t|	}
|r| |
|j}n| |j|
}tj|||d}|tk	r|||||j |j gS t|||S d S )Nr   r   r   r   )rV   r$   r4   r|   r   appendr   r"   NotImplementedrF   r   r7   r   rN   r   indexminmax)r   rk   r   r   r   r   r   return_typeZ	other_keyZ
other_metaZother_meta_nonemptyr   r   rp   rp   rq   r     s*    

"
r   c                
   @  s~  e Zd ZdZdd Zdd Zdddd	Zd
d Zdd Ze	e
dedZeeZdd Zdd ZddddZedd Zedd Zejdd ZeddddZeeejd d! Zejd"d! Zed#d$ Zed%d& Zed'd( Zd)d* Zd+d, Z d d.d/Z!d!d0d1Z"d"d2d3Z#d4d5 Z$ed6d7 Z%d8d9 Z&ed:d; Z'd<d= Z(ed>d? Z)e)jd@d? Z)d#dAdBZ*edCdD Z+dEdF Z,d$dGdHZ-dIdJ Z.eejd%dLdMZ/dNdO Z0dPdQ Z1e1Z2dRdS Z3dTdU Z4dVdW Z5e5Z6dXdY Z7e8dZd[d\d] Z9e8dZd[d^d_ Z:d&dadbZ;e8dZd[dde<dddddfdcddZ=eejdedf Z>d'dgdhZ?d(djdkZ@dldm ZAd)dndoZBedpdq ZCdrds ZDedtdu ZEd*dvdwZFd+dxdyZGeejd,dzd{ZHeejd-d|d}ZIeejd.d~dZJd/ddZKeejd0ddZLd1ddZMd2ddZNdd ZOd3dddddddZPdd ZQd4ddZReSdd ZTeSd5ddZUd6ddZVeejd7ddZWeejd8ddZXd9ddZYeejdd ZZeejdd Z[eejdd Z\eejd:ddZ]eejd;ddZ^e_eejd<ddZ`e_eejd=ddZaeaZbe_eejd>ddZce_eejd?ddZdeejd@ddZeeejdAddZfe_eejdBddZgeejdCddZhe_eejdDddZidEddZjeejdFddZke_eejdGddĄZldHddƄZmdIddȄZndJddʄZodKdd̄Zpe_eejdLdd΄ZqddЄ Zre_eejdMddӄZsdNddՄZtdOddׄZue_eejdPddلZvdQddۄZwdRdd݄Zxe_eejdSdd߄Zye_dTddZzeejdUddZ{dVddZ|dWddZ}dXddZ~dYddZeejdZddZeejd[ddZeejd\ddZeejd]ddZeejejfddZeejejfddZeejdd Zeejdd Zeejdd Zeejdd  Zeejdd Zesveejd^ddZeeje<fddZeejd_dd	Zeejd`d
dZeejdd ZeSejfddZeejdaddZeejdd Zeejdd ZdbddZedd Zdd Zdd Zdd ZdS (c  _Framea
  Superclass for DataFrame and Series

    Parameters
    ----------
    dsk: dict
        The dask graph to compute this DataFrame
    name: str
        The key prefix that specifies which keys in the dask comprise this
        particular DataFrame / Series
    meta: pandas.DataFrame, pandas.Series, or pandas.Index
        An empty pandas object with names, dtypes, and indices matching the
        expected output.
    divisions: tuple of index values
        Values along which we partition our blocks on the index
    c                 C  sn   t |tstj||g d}|| _|| _t|}| |sZtdt| j	 dt
t| || _t|| _d S )Nr   zExpected meta to specify type z, got type )r|   rN   r   r   r   rF   _is_partition_typer   r   r   rb   r   tupler   r   rp   rp   rq   r   P  s    

z_Frame.__init__c                 C  s   | j S r   r   r   rp   rp   rq   r   ^  s    z_Frame.__dask_graph__zlist[Hashable])returnc                   s    fddt  jD S )Nc                   s   g | ]} j |fqS rp   r   rv   r   rp   rq   ry   b  s     z(_Frame.__dask_keys__.<locals>.<listcomp>)rangenpartitionsr   rp   r   rq   r   a  s    z_Frame.__dask_keys__c                 C  s   | j fS r   r   r   rp   rp   rq   r   d  s    z_Frame.__dask_layers__c                 C  s   | j S r   r   r   rp   rp   rq   r   g  s    z_Frame.__dask_tokenize__r   r   c                 C  s   t dfS r   )r   r   rp   rp   rq   r   o  s    z_Frame.__dask_postcompute__c                 C  s
   | j dfS r   r   r   rp   rp   rq   r   r  s    z_Frame.__dask_postpersist__Nr   c                C  s,   | j }|r|||}t| ||| j| jS r   )r   rh   r   r   r   r   rp   rp   rq   r   u  s    z_Frame._rebuildc                 C  s   t S r   )new_dd_objectr   rp   rp   rq   _constructor{  s    z_Frame._constructorc                 C  s   | j S )a  
        Tuple of ``npartitions + 1`` values, in ascending order, marking the
        lower/upper bounds of each partition's index. Divisions allow Dask
        to know which partition will contain a given value, significantly
        speeding up operations like `loc`, `merge`, and `groupby` by not
        having to search the full dataset.

        Example: for ``divisions = (0, 10, 50, 100)``, there are three partitions,
        where the index in each partition contains values [0, 10), [10, 50),
        and [50, 100], respectively. Dask therefore knows ``df.loc[45]``
        will be in the second partition.

        When every item in ``divisions`` is ``None``, the divisions are unknown.
        Most operations can still be performed, but some will be much slower,
        and a few may fail.

        It is uncommon to set ``divisions`` directly. Instead, use ``set_index``,
        which sorts and splits the data as needed.
        See https://docs.dask.org/en/latest/dataframe-design.html#partitions.
        )
_divisionsr   rp   rp   rq   r     s    z_Frame.divisionsc                 C  s   t |tstdt| drZt|t| jkrZt| j}td|d  d| dt| d |kr~tdd |D rtd	n8t| j	d
| j	j
}t|r|js|tt|krtd|| _d S )Nzdivisions must be a tupler  zThis dataframe has npartitions=   z(, divisions should be a tuple of length=z, got c                 s  s   | ]}|d k	V  qd S r   rp   rw   vrp   rp   rq   	<genexpr>  s     z#_Frame.divisions.<locals>.<genexpr>z;divisions may not contain a mix of None and non-None valuesr   zdivisions must be sorted)r|   r   r   r   ru   r  
ValueErroranygetattrr   r   rB   orderedsorted)rk   valuenZindex_dtyperp   rp   rq   r     s"    

intc                 C  s   t | jd S )zReturn number of partitionsr  ru   r   r   rp   rp   rq   r     s    z_Frame.npartitionsc                 C  s   | j jS r   )r   attrsr   rp   rp   rq   r    s    z_Frame.attrsc                 C  s   t || j_d S r   )dictr   r  )rk   r  rp   rp   rq   r    s    c                 C  s   | j tjtjdtddS )zSize of the Series or DataFrame as a Delayed object.

        Examples
        --------
        >>> series.size  # doctest: +SKIP
        dd.Scalar<size-ag..., dtype=int64>
        sizeFtokenr   split_every)	reductionr*   r  r~   sumr  r   rp   rp   rq   r    s    	    z_Frame.sizec                 C  s
   t | jS )z.A non-empty version of `_meta` with fake data.)r7   r   r   rp   rp   rq   r     s    z_Frame._meta_nonemptyc                 C  s   | j | j| j| jfS r   )r   r   r   r   r   rp   rp   rq   r     s    z_Frame._argsc                 C  s   | j S r   r   r   rp   rp   rq   r     s    z_Frame.__getstate__c                 C  s   |\| _ | _| _| _d S r   )r   r   r   r  r   rp   rp   rq   r     s    z_Frame.__setstate__Fc                 C  s&   |dk	rt dt| j| j| j| jS )a  Make a copy of the dataframe

        This is strictly a shallow copy of the underlying computational graph.
        It does not affect the underlying data

        Parameters
        ----------
        deep : boolean, default False
            The deep value must be `False` and it is declared as a parameter just for
            compatibility with third-party libraries like cuDF
        FzfThe `deep` value must be False. This is strictly a shallow copy of the underlying computational graph.)r  r   r   r   r   r   rk   deeprp   rp   rq   copy  s
    z_Frame.copyc                 K  s   |   | _t| j}|S r   )r   Z	_computedr~   array)rk   r   rm   xrp   rp   rq   r     s    
z_Frame.__array__c                 C  s   t d S r   ri   rk   r  contextrp   rp   rq   __array_wrap__  s    z_Frame.__array_wrap__c              
   O  s   | dd}|| D ]D}t|tjr2|jdkr2qqt|ttttt	j
t	jt	jfst  S q|dkr|jd k	rptS |jdkr~tS t|f||S ntS d S )Noutrp   __call__r  )rh   r|   r~   r   shaper	   r   r   r   r   	DataFramer   Indexr   	signatureZnoutelemwise)rk   Znumpy_ufuncmethodinputsrm   r   r  rp   rp   rq   __array_ufunc__  s      


z_Frame.__array_ufunc__c                 C  s   t S r   r&  r   rp   rp   rq   	_elemwise  s    z_Frame._elemwisec                 C  s   t d S r   r  r   rp   rp   rq   
_repr_data  s    z_Frame._repr_datac                 C  sB   d| j  }| jr$tj| j|d}ntjdg| j d  |d}|S )Nznpartitions=r    r  )r   known_divisionsr   r$  r   )rk   r   r   rp   rp   rq   _repr_divisions  s
    z_Frame._repr_divisionsc                 C  sn   |   jddd}d}t| jdkrD|dd dd	}d
| }|j| jj|t	| j
tt| jjddS )N   Fmax_rowsshow_dimensionsz:Dask {klass} Structure:
{data}
Dask Name: {name}, {layers}r   
r$  Z	DivisionsEmpty graph layer)klassdatar   layers)r,  	to_stringru   columns	partitionreplaceformat	__class__r   rZ   r   r[   r   r;  )rk   r:  Z_str_fmtrp   rp   rq   r   #  s    
z_Frame.__repr__c                 C  s"   | j tdt| jd | jjddS )zReturn dask Index instancer   z-indexF)r  r   enforce_metadata)r   r  rZ   r   r   r   r   rp   rp   rq   r   2  s    z_Frame.indexc                 C  s6   |j | _ ttj| |dd}|j| _|j| _|j| _d S NFrB  )r   r   r*   Zassign_indexr   r   r   )rk   r  resultrp   rp   rq   r   =  s       c                 C  s   | j tj|dd S )ay  Reset the index to the default index.

        Note that unlike in ``pandas``, the reset ``dask.dataframe`` index will
        not be monotonically increasing from 0. Instead, it will restart at 0
        for each partition (e.g. ``index1 = [0, ..., 10], index2 = [0, ...]``).
        This is due to the inability to statically know the full length of the
        index.

        For DataFrame with multi-level index, returns a new DataFrame with
        labeling information in the columns under the index names, defaulting
        to 'level_0', 'level_1', etc. if any are None. For a standard index,
        the index name will be used (if set), otherwise a default 'index' or
        'level_0' (if 'index' is already taken) will be used.

        Parameters
        ----------
        drop : boolean, default False
            Do not try to insert index into dataframe columns.
        F)droprB  )r   rQ   reset_indexclear_divisions)rk   rF  rp   rp   rq   rG  G  s
      z_Frame.reset_indexc                 C  s   t | jdko| jd dk	S )z#Whether divisions are already knownr   Nr  r   rp   rp   rq   r/  _  s    z_Frame.known_divisionsc                 C  s&   d| j d  }t| | j| j| j|S )zForget division informationr   r  )r   r   r   r   r   )rk   r   rp   rp   rq   rH  d  s    z_Frame.clear_divisionsc                 C  s,   |dkr| j r| jS ddlm} || |dS )a  Compute the current divisions of the DataFrame.

        This method triggers immediate computation. If you find yourself running this command
        repeatedly for the same dataframe, we recommend storing the result
        so you don't have to rerun it.

        If the column or index values overlap between partitions, raises ``ValueError``.
        To prevent this, make sure the data are sorted by the column or index.

        Parameters
        ----------
        col : string, optional
            Calculate the divisions for a non-index column by passing in the name of the column.
            If col is not specified, the index will be used to calculate divisions.
            In this case, if the divisions are already known, they will be returned
            immediately without computing.

        Examples
        --------
        >>> import dask
        >>> ddf = dask.datasets.timeseries(start="2021-01-01", end="2021-01-07", freq="1H").clear_divisions()
        >>> divisions = ddf.compute_current_divisions()
        >>> print(divisions)  # doctest: +NORMALIZE_WHITESPACE
        (Timestamp('2021-01-01 00:00:00'),
         Timestamp('2021-01-02 00:00:00'),
         Timestamp('2021-01-03 00:00:00'),
         Timestamp('2021-01-04 00:00:00'),
         Timestamp('2021-01-05 00:00:00'),
         Timestamp('2021-01-06 00:00:00'),
         Timestamp('2021-01-06 23:00:00'))

        >>> ddf.divisions = divisions
        >>> ddf.known_divisions
        True

        >>> ddf = ddf.reset_index().clear_divisions()
        >>> divisions = ddf.compute_current_divisions("timestamp")
        >>> print(divisions)  # doctest: +NORMALIZE_WHITESPACE
        (Timestamp('2021-01-01 00:00:00'),
         Timestamp('2021-01-02 00:00:00'),
         Timestamp('2021-01-03 00:00:00'),
         Timestamp('2021-01-04 00:00:00'),
         Timestamp('2021-01-05 00:00:00'),
         Timestamp('2021-01-06 00:00:00'),
         Timestamp('2021-01-06 23:00:00'))

        >>> ddf = ddf.set_index("timestamp", divisions=divisions, sorted=True)
        Nr   )compute_divisions)col)r/  r   dask.dataframe.shufflerI  )rk   rJ  rI  rp   rp   rq   compute_current_divisionsi  s    1z _Frame.compute_current_divisionsc                 C  s   d|  kr| j k rvn n\dt| d| j }| j||d  }|df| j|fi}tj||| gd}t||| j|S d| j  }t|dS )z=Get a dask DataFrame/Series representing the `nth` partition.r   zget-partition-r      r   zn must be 0 <= n < N)	r   strr   r   rN   r   r   r   r  )rk   r  r   r   r   r   msgrp   rp   rq   get_partition  s    z_Frame.get_partitionr  c           	      K  s   |d k	r4| j jf d|i| ||d< t}d|i}n| j jf | d  }}|dddkrbtdtj}t| f||| jd|||||d	|S )	NsubsetcolsZkeepTFzdrop_duplicates with keep=Falsezdrop-duplicates)	chunk	aggregater   r  r  	split_outsplit_out_setupsplit_out_setup_kwargsr{   )r   drop_duplicatessplit_out_on_colsrh   ri   rQ   acar   )	rk   rQ  r  rU  r{   rm   rV  rW  rS  rp   rp   rq   rX    s2    
z_Frame.drop_duplicatesc                 C  s   | j ttjdtdd S )Nru   Fr  )r  ru   r~   r  r  r   r   rp   rp   rq   __len__  s        z_Frame.__len__c                 C  s   t d| jj dd S )NzThe truth value of a z& is ambiguous. Use a.any() or a.all().)r  rA  r   r   rp   rp   rq   r     s    z_Frame.__bool__c                   s    fdd}|S )Nc                     s   t d  d S )Nzcannot convert the series to r   rp   	cast_typerp   rq   rr     s    z#_Frame._scalarfunc.<locals>.wrapperrp   )rk   r]  rr   rp   r\  rq   _scalarfunc  s    z_Frame._scalarfuncc                 C  s
   |  tS r   )r^  floatr   rp   rp   rq   	__float__  s    z_Frame.__float__c                 C  s
   |  tS r   )r^  r  r   rp   rp   rq   __int__  s    z_Frame.__int__c                 C  s
   |  tS r   )r^  complexr   rp   rp   rq   __complex__  s    z_Frame.__complex__   padc                 O  s   t || f||S )a  Apply Python function on each DataFrame partition.

        Note that the index and divisions are assumed to remain unchanged.

        Parameters
        ----------
        func : function
            The function applied to each partition. If this function accepts
            the special ``partition_info`` keyword argument, it will receive
            information on the partition's relative location within the
            dataframe.
        args, kwargs :
            Positional and keyword arguments to pass to the function.
            Positional arguments are computed on a per-partition basis, while
            keyword arguments are shared across all partitions. The partition
            itself will be the first positional argument, with all other
            arguments passed *after*. Arguments can be ``Scalar``, ``Delayed``,
            or regular Python objects. DataFrame-like args (both dask and
            pandas) will be repartitioned to align (if necessary) before
            applying the function; see ``align_dataframes`` to control this
            behavior.
        enforce_metadata : bool, default True
            Whether to enforce at runtime that the structure of the DataFrame
            produced by ``func`` actually matches the structure of ``meta``.
            This will rename and reorder columns for each partition,
            and will raise an error if this doesn't work,
            but it won't raise if dtypes don't match.
        transform_divisions : bool, default True
            Whether to apply the function onto the divisions and apply those
            transformed divisions to the output.
        align_dataframes : bool, default True
            Whether to repartition DataFrame- or Series-like args
            (both dask and pandas) so their divisions align before applying
            the function. This requires all inputs to have known divisions.
            Single-partition inputs will be split into multiple partitions.

            If False, all inputs must have either the same number of partitions
            or a single partition. Single-partition inputs will be broadcast to
            every partition of multi-partition inputs.
        $META

        Examples
        --------
        Given a DataFrame, Series, or Index, such as:

        >>> import pandas as pd
        >>> import dask.dataframe as dd
        >>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
        ...                    'y': [1., 2., 3., 4., 5.]})
        >>> ddf = dd.from_pandas(df, npartitions=2)

        One can use ``map_partitions`` to apply a function on each partition.
        Extra arguments and keywords can optionally be provided, and will be
        passed to the function after the partition.

        Here we apply a function with arguments and keywords to a DataFrame,
        resulting in a Series:

        >>> def myadd(df, a, b=1):
        ...     return df.x + df.y + a + b
        >>> res = ddf.map_partitions(myadd, 1, b=2)
        >>> res.dtype
        dtype('float64')

        Here we apply a function to a Series resulting in a Series:

        >>> res = ddf.x.map_partitions(lambda x: len(x)) # ddf.x is a Dask Series Structure
        >>> res.dtype
        dtype('int64')

        By default, dask tries to infer the output metadata by running your
        provided function on some fake data. This works well in many cases, but
        can sometimes be expensive, or even fail. To avoid this, you can
        manually specify the output metadata with the ``meta`` keyword. This
        can be specified in many forms, for more information see
        ``dask.dataframe.utils.make_meta``.

        Here we specify the output is a Series with no name, and dtype
        ``float64``:

        >>> res = ddf.map_partitions(myadd, 1, b=2, meta=(None, 'f8'))

        Here we map a function that takes in a DataFrame, and returns a
        DataFrame with a new column:

        >>> res = ddf.map_partitions(lambda df: df.assign(z=df.x * df.y))
        >>> res.dtypes
        x      int64
        y    float64
        z    float64
        dtype: object

        As before, the output metadata can also be specified manually. This
        time we pass in a ``dict``, as the output is a DataFrame:

        >>> res = ddf.map_partitions(lambda df: df.assign(z=df.x * df.y),
        ...                          meta={'x': 'i8', 'y': 'f8', 'z': 'f8'})

        In the case where the metadata doesn't change, you can also pass in
        the object itself directly:

        >>> res = ddf.map_partitions(lambda df: df.head(), meta=ddf)

        Also note that the index and divisions are assumed to remain unchanged.
        If the function you're mapping changes the index/divisions, you'll need
        to clear them afterwards:

        >>> ddf.map_partitions(func).clear_divisions()  # doctest: +SKIP

        Your map function gets information about where it is in the dataframe by
        accepting a special ``partition_info`` keyword argument.

        >>> def func(partition, partition_info=None):
        ...     pass

        This will receive the following information:

        >>> partition_info  # doctest: +SKIP
        {'number': 1, 'division': 3}

        For each argument and keyword arguments that are dask dataframes you will
        receive the number (n) which represents the nth partition of the dataframe
        and the division (the first index value in the partition). If divisions
        are not known (for instance if the index is not sorted) then you will get
        None as the division.
        r   )rk   ro   rl   rm   rp   rp   rq   r     s     z_Frame.map_partitionsc                 O  s"   ddl m} ||| ||f||S )a.  Apply a function to each partition, sharing rows with adjacent partitions.

        This can be useful for implementing windowing functions such as
        ``df.rolling(...).mean()`` or ``df.diff()``.

        Parameters
        ----------
        func : function
            Function applied to each partition.
        before : int, timedelta or string timedelta
            The rows to prepend to partition ``i`` from the end of
            partition ``i - 1``.
        after : int, timedelta or string timedelta
            The rows to append to partition ``i`` from the beginning
            of partition ``i + 1``.
        args, kwargs :
            Positional and keyword arguments to pass to the function.
            Positional arguments are computed on a per-partition basis, while
            keyword arguments are shared across all partitions. The partition
            itself will be the first positional argument, with all other
            arguments passed *after*. Arguments can be ``Scalar``, ``Delayed``,
            or regular Python objects. DataFrame-like args (both dask and
            pandas) will be repartitioned to align (if necessary) before
            applying the function; see ``align_dataframes`` to control this
            behavior.
        enforce_metadata : bool, default True
            Whether to enforce at runtime that the structure of the DataFrame
            produced by ``func`` actually matches the structure of ``meta``.
            This will rename and reorder columns for each partition,
            and will raise an error if this doesn't work,
            but it won't raise if dtypes don't match.
        transform_divisions : bool, default True
            Whether to apply the function onto the divisions and apply those
            transformed divisions to the output.
        align_dataframes : bool, default True
            Whether to repartition DataFrame- or Series-like args
            (both dask and pandas) so their divisions align before applying
            the function. This requires all inputs to have known divisions.
            Single-partition inputs will be split into multiple partitions.

            If False, all inputs must have either the same number of partitions
            or a single partition. Single-partition inputs will be broadcast to
            every partition of multi-partition inputs.
        $META

        Notes
        -----
        Given positive integers ``before`` and ``after``, and a function
        ``func``, ``map_overlap`` does the following:

        1. Prepend ``before`` rows to each partition ``i`` from the end of
           partition ``i - 1``. The first partition has no rows prepended.

        2. Append ``after`` rows to each partition ``i`` from the beginning of
           partition ``i + 1``. The last partition has no rows appended.

        3. Apply ``func`` to each partition, passing in any extra ``args`` and
           ``kwargs`` if provided.

        4. Trim ``before`` rows from the beginning of all but the first
           partition.

        5. Trim ``after`` rows from the end of all but the last partition.

        Examples
        --------
        Given a DataFrame, Series, or Index, such as:

        >>> import pandas as pd
        >>> import dask.dataframe as dd
        >>> df = pd.DataFrame({'x': [1, 2, 4, 7, 11],
        ...                    'y': [1., 2., 3., 4., 5.]})
        >>> ddf = dd.from_pandas(df, npartitions=2)

        A rolling sum with a trailing moving window of size 2 can be computed by
        overlapping 2 rows before each partition, and then mapping calls to
        ``df.rolling(2).sum()``:

        >>> ddf.compute()
            x    y
        0   1  1.0
        1   2  2.0
        2   4  3.0
        3   7  4.0
        4  11  5.0
        >>> ddf.map_overlap(lambda df: df.rolling(2).sum(), 2, 0).compute()
              x    y
        0   NaN  NaN
        1   3.0  3.0
        2   6.0  5.0
        3  11.0  7.0
        4  18.0  9.0

        The pandas ``diff`` method computes a discrete difference shifted by a
        number of periods (can be positive or negative). This can be
        implemented by mapping calls to ``df.diff`` to each partition after
        prepending/appending that many rows, depending on sign:

        >>> def diff(df, periods=1):
        ...     before, after = (periods, 0) if periods > 0 else (0, -periods)
        ...     return df.map_overlap(lambda df, periods=1: df.diff(periods),
        ...                           periods, 0, periods=periods)
        >>> diff(ddf, 1).compute()
             x    y
        0  NaN  NaN
        1  1.0  1.0
        2  2.0  1.0
        3  3.0  1.0
        4  4.0  1.0

        If you have a ``DatetimeIndex``, you can use a ``pd.Timedelta`` for time-
        based windows or any ``pd.Timedelta`` convertible string:

        >>> ts = pd.Series(range(10), index=pd.date_range('2017', periods=10))
        >>> dts = dd.from_pandas(ts, npartitions=2)
        >>> dts.map_overlap(lambda df: df.rolling('2D').sum(),
        ...                 pd.Timedelta('2D'), 0).compute()
        2017-01-01     0.0
        2017-01-02     1.0
        2017-01-03     3.0
        2017-01-04     5.0
        2017-01-05     7.0
        2017-01-06     9.0
        2017-01-07    11.0
        2017-01-08    13.0
        2017-01-09    15.0
        2017-01-10    17.0
        Freq: D, dtype: float64
        r   )map_overlap)dask.dataframe.rollingrg  )rk   ro   beforeafterrl   rm   rg  rp   rp   rq   rg  m  s     z_Frame.map_overlapTc                 C  s   | j t||d S )at  Return the memory usage of each partition

        Parameters
        ----------
        index : bool, default True
            Specifies whether to include the memory usage of the index in
            returned Series.
        deep : bool, default False
            If True, introspect the data deeply by interrogating
            ``object`` dtypes for system-level memory consumption, and include
            it in the returned values.

        Returns
        -------
        Series
            A Series whose index is the partition number and whose values
            are the memory usage of each partition in bytes.
        r   r  )r   total_mem_usagerH  rk   r   r  rp   rp   rq   memory_usage_per_partition  s
      z!_Frame.memory_usage_per_partitionc
                 K  s   |dkr|}|dkr(|	r t d|}|}	|r4| ni }||d< |	rL|	 ni }	||	d< |rd| ni }||d< t| fttt||||||	d	|
S )aF  Generic row-wise reductions.

        Parameters
        ----------
        chunk : callable
            Function to operate on each partition. Should return a
            ``pandas.DataFrame``, ``pandas.Series``, or a scalar.
        aggregate : callable, optional
            Function to operate on the concatenated result of ``chunk``. If not
            specified, defaults to ``chunk``. Used to do the final aggregation
            in a tree reduction.

            The input to ``aggregate`` depends on the output of ``chunk``.
            If the output of ``chunk`` is a:

            - scalar: Input is a Series, with one row per partition.
            - Series: Input is a DataFrame, with one row per partition. Columns
              are the rows in the output series.
            - DataFrame: Input is a DataFrame, with one row per partition.
              Columns are the columns in the output dataframes.

            Should return a ``pandas.DataFrame``, ``pandas.Series``, or a
            scalar.
        combine : callable, optional
            Function to operate on intermediate concatenated results of
            ``chunk`` in a tree-reduction. If not provided, defaults to
            ``aggregate``. The input/output requirements should match that of
            ``aggregate`` described above.
        $META
        token : str, optional
            The name to use for the output keys.
        split_every : int, optional
            Group partitions into groups of this size while performing a
            tree-reduction. If set to False, no tree-reduction will be used,
            and all intermediates will be concatenated and passed to
            ``aggregate``. Default is 8.
        chunk_kwargs : dict, optional
            Keyword arguments to pass on to ``chunk`` only.
        aggregate_kwargs : dict, optional
            Keyword arguments to pass on to ``aggregate`` only.
        combine_kwargs : dict, optional
            Keyword arguments to pass on to ``combine`` only.
        kwargs :
            All remaining keywords will be passed to ``chunk``, ``combine``,
            and ``aggregate``.

        Examples
        --------
        >>> import pandas as pd
        >>> import dask.dataframe as dd
        >>> df = pd.DataFrame({'x': range(50), 'y': range(50, 100)})
        >>> ddf = dd.from_pandas(df, npartitions=4)

        Count the number of rows in a DataFrame. To do this, count the number
        of rows in each partition, then sum the results:

        >>> res = ddf.reduction(lambda x: x.count(),
        ...                     aggregate=lambda x: x.sum())
        >>> res.compute()
        x    50
        y    50
        dtype: int64

        Count the number of rows in a Series with elements greater than or
        equal to a value (provided via a keyword).

        >>> def count_greater(x, value=0):
        ...     return (x >= value).sum()
        >>> res = ddf.x.reduction(count_greater, aggregate=lambda x: x.sum(),
        ...                       chunk_kwargs={'value': 25})
        >>> res.compute()
        25

        Aggregate both the sum and count of a Series at the same time:

        >>> def sum_and_count(x):
        ...     return pd.Series({'count': x.count(), 'sum': x.sum()},
        ...                      index=['count', 'sum'])
        >>> res = ddf.x.reduction(sum_and_count, aggregate=lambda x: x.sum())
        >>> res.compute()
        count      50
        sum      1225
        dtype: int64

        Doing the same, but for a DataFrame. Here ``chunk`` returns a
        DataFrame, meaning the input to ``aggregate`` is a DataFrame with an
        index with non-unique entries for both 'x' and 'y'. We groupby the
        index, and sum each group to get the final result.

        >>> def sum_and_count(x):
        ...     return pd.DataFrame({'count': x.count(), 'sum': x.sum()},
        ...                         columns=['count', 'sum'])
        >>> res = ddf.reduction(sum_and_count,
        ...                     aggregate=lambda x: x.groupby(level=0).sum())
        >>> res.compute()
           count   sum
        x     50  1225
        y     50  3725
        N+`combine_kwargs` provided with no `combine`	aca_chunkaca_combineaca_aggregate)	rS  rT  combiner   r  r  chunk_kwargsaggregate_kwargscombine_kwargs)r  r  rZ  _reduction_chunk_reduction_aggregate_reduction_combine)rk   rS  rT  rs  r   r  r  rt  ru  rv  rm   rp   rp   rq   r    s8    qz_Frame.reductionc                 O  sL   t |tr8|\}}||kr&td| | ||< |||S || f||S d S )Nz1%s is both the pipe target and a keyword argument)r|   r   r  )rk   ro   rl   rm   targetrp   rp   rq   pipe  s    

z_Frame.pipec                   s   t t dstdtj|}t |}d|  fddt|D }g }tt	 D ]dd|f fddtjD }t
jt||gd}	t|	jj}
||
 qd|S )	a8  Pseudorandomly split dataframe into different pieces row-wise

        Parameters
        ----------
        frac : list
            List of floats that should sum to one.
        random_state : int or np.random.RandomState
            If int create a new RandomState with this as the seed.
            Otherwise draw from the passed RandomState.
        shuffle : bool, default False
            If set to True, the dataframe is shuffled (within partition)
            before the split.

        Examples
        --------

        50/50 split

        >>> a, b = df.random_split([0.5, 0.5])  # doctest: +SKIP

        80/10/10 split, consistent random_state

        >>> a, b, c = df.random_split([0.8, 0.1, 0.1], random_state=123)  # doctest: +SKIP

        See Also
        --------
        dask.DataFrame.sample
        r  zfrac should sum to 1split-c                   s*   i | ]"\}}|ft j|f |fqS rp   )pd_splitr   rw   rx   r   )fracr   rk   shufflerp   rq   
<dictcomp>  s    z'_Frame.random_split.<locals>.<dictcomp>zsplit-%d-%sc                   s    i | ]}|ft |f fqS rp   r
   rw   j)rx   r   name2rp   rq   r    s     r   )r~   Zallcloser  r  ra   r   r$   	enumerater   ru   rN   r   r   r   r   r   r   )rk   r  random_stater  
state_datar  r   r   dsk2r   Zout_dfrp   )r  rx   r   r  rk   r  rq   random_split  s,      z_Frame.random_splitr1  c                 C  s*   |dkr| j }|| j k}| j||||dS )a0  First n rows of the dataset

        Parameters
        ----------
        n : int, optional
            The number of rows to return. Default is 5.
        npartitions : int, optional
            Elements are only taken from the first ``npartitions``, with a
            default of 1. If there are fewer than ``n`` rows in the first
            ``npartitions`` a warning will be raised and any found rows
            returned. Pass -1 to use all partitions.
        compute : bool, optional
            Whether to compute the result, default is True.
        r6  )r  r   r   safe)r   _head)rk   r  r   r   r  rp   rp   rq   head  s    
z_Frame.headc                   s&  |dkr| j }|| j kr.td| j  d| d| d| d| j }|rPt}ntj}|dkrd| d| j  i }t|D ]}tj| j|f|f| |f< q|t fdd	t|D f}	||	|f||d
f< n|d
f|| jd
f|fi}tj	||| gd}
t
|
|| j| jd
 | j| g}|r"| }|S )Nr6  zonly z partitions, head received zhead-r   r  zhead-partial-c                   s   g | ]} |fqS rp   rp   rv   Zname_prp   rq   ry   	  s     z _Frame._head.<locals>.<listcomp>r   r   )r   r  r   	safe_headrQ   r  r   r   rN   r   r   r   r   r   )rk   r  r   r   r  r   r  r   rx   r   r   rE  rp   r  rq   r    s8    
   z_Frame._headc                 C  sj   d|| j f }|dftj| j | jd f|fi}tj||| gd}t||| j| jdd }|rf|	 }|S )zkLast n rows of the dataset

        Caveat, the only checks the last n rows of the last partition.
        z
tail-%d-%sr   r  r   N)
r   rQ   tailr   rN   r   r   r   r   r   rk   r  r   r   r   r   rE  rp   rp   rq   r    s     z_Frame.tailc                 C  s   ddl m} || S )zPurely label-location based indexer for selection by label.

        >>> df.loc["b"]  # doctest: +SKIP
        >>> df.loc["b":"d"]  # doctest: +SKIP
        r   )_LocIndexer)dask.dataframe.indexingr  )rk   r  rp   rp   rq   loc&  s    z
_Frame.locc                   s   t |ts|f}ddlm} ||jf}tdd |D }dt|  tj t	d| 
 }fdd|D j|d	 d
 d
  g } fddt|D }tj |gd}t| j|S )Nr   )normalize_indexc                 s  s*   | ]"}t |trt||d  n|V  qdS )r  N)r|   r	   slicerw   krp   rp   rq   r  7  s     z%_Frame._partitions.<locals>.<genexpr>zblocks-r   c                   s   g | ]\}} j | qS rp   r   )rw   _rx   r   rp   rq   ry   ;  s     z&_Frame._partitions.<locals>.<listcomp>r6  r  c                   s   i | ]\}} |ft |qS rp   )r   rw   rx   r   r-  rp   rq   r  >  s      z&_Frame._partitions.<locals>.<dictcomp>r   )r|   r   Zdask.array.slicingr  r   r$   r~   r  r   objecttolistr   r  rN   r   r   r   )rk   r   r  Znew_keysr   r   r   rp   r   rk   rq   _partitions1  s    
z_Frame._partitionsc                 C  s
   t | jS )aF  Slice dataframe by partitions

        This allows partitionwise slicing of a Dask Dataframe.  You can perform normal
        Numpy-style slicing but now rather than slice elements of the array you
        slice along partitions so, for example, ``df.partitions[:5]`` produces a new
        Dask Dataframe of the first five partitions.

        Examples
        --------
        >>> df.partitions[0]  # doctest: +SKIP
        >>> df.partitions[:3]  # doctest: +SKIP
        >>> df.partitions[::10]  # doctest: +SKIP

        Returns
        -------
        A Dask DataFrame
        )rP   r  r   rp   rp   rq   
partitionsC  s    z_Frame.partitionsc                 C  s   t |tr|}d}t |tr$|}d}t|dk	|dk	|dk	|dk	gdkrPtd|dk	rbt| |S |dk	rtt| |S |dk	rt| ||dS |dk	rt| |dS dS )a  Repartition dataframe along new divisions

        Parameters
        ----------
        divisions : list, optional
            The "dividing lines" used to split the dataframe into partitions.
            For ``divisions=[0, 10, 50, 100]``, there would be three output partitions,
            where the new index contained [0, 10), [10, 50), and [50, 100), respectively.
            See https://docs.dask.org/en/latest/dataframe-design.html#partitions.
            Only used if npartitions and partition_size isn't specified.
            For convenience if given an integer this will defer to npartitions
            and if given a string it will defer to partition_size (see below)
        npartitions : int, optional
            Approximate number of partitions of output. Only used if partition_size
            isn't specified. The number of partitions used may be slightly
            lower than npartitions depending on data distribution, but will never be
            higher.
        partition_size: int or string, optional
            Max number of bytes of memory for each partition. Use numbers or
            strings like 5MB. If specified npartitions and divisions will be
            ignored. Note that the size reflects the number of bytes used as
            computed by ``pandas.DataFrame.memory_usage``, which will not
            necessarily match the size when storing to disk.

            .. warning::

               This keyword argument triggers computation to determine
               the memory size of each partition, which may be expensive.

        freq : str, pd.Timedelta
            A period on which to partition timeseries data like ``'7D'`` or
            ``'12h'`` or ``pd.Timedelta(hours=12)``.  Assumes a datetime index.
        force : bool, default False
            Allows the expansion of the existing divisions.
            If False then the new divisions' lower and upper bounds must be
            the same as the old divisions'.

        Notes
        -----
        Exactly one of `divisions`, `npartitions`, `partition_size`, or `freq`
        should be specified. A ``ValueError`` will be raised when that is
        not the case.

        Also note that ``len(divisons)`` is equal to ``npartitions + 1``. This is because ``divisions``
        represents the upper and lower bounds of each partition. The first item is the
        lower bound of the first partition, the second item is the lower bound of the
        second partition and the upper bound of the first partition, and so on.
        The second-to-last item is the lower bound of the last partition, and the last
        (extra) item is the upper bound of the last partition.

        Examples
        --------
        >>> df = df.repartition(npartitions=10)  # doctest: +SKIP
        >>> df = df.repartition(divisions=[0, 5, 10, 20])  # doctest: +SKIP
        >>> df = df.repartition(freq='7d')  # doctest: +SKIP

        See Also
        --------
        DataFrame.memory_usage_per_partition
        pandas.DataFrame.memory_usage
        Nr  zpPlease provide exactly one of ``npartitions=``, ``freq=``, ``divisions=``, ``partition_size=`` keyword argumentsforcefreq)	r|   r  rN  r  r  repartition_sizerepartition_npartitionsrepartitionrepartition_freq)rk   r   r   Zpartition_sizer  r  rp   rp   rq   r  Z  s6    E



z_Frame.repartitionc              	   C  s"   ddl m} || ||||||dS )a  Rearrange DataFrame into new partitions

        Uses hashing of `on` to map rows to output partitions. After this
        operation, rows with the same value of `on` will be in the same
        partition.

        Parameters
        ----------
        on : str, list of str, or Series, Index, or DataFrame
            Column(s) or index to be used to map rows to output partitions
        npartitions : int, optional
            Number of partitions of output. Partition count will not be
            changed by default.
        max_branch: int, optional
            The maximum number of splits per input partition. Used within
            the staged shuffling algorithm.
        shuffle: {'disk', 'tasks'}, optional
            Either ``'disk'`` for single-node operation or ``'tasks'`` for
            distributed operation.  Will be inferred by your current scheduler.
        ignore_index: bool, default False
            Ignore index during shuffle.  If ``True``, performance may improve,
            but index values will not be preserved.
        compute: bool
            Whether or not to trigger an immediate computation. Defaults to False.

        Notes
        -----
        This does not preserve a meaningful index/partitioning scheme. This
        is not deterministic if done in parallel.

        Examples
        --------
        >>> df = df.shuffle(df.columns[0])  # doctest: +SKIP
        r   )r  )r   
max_branchr  r{   r   )rK  r  )rk   onr   r  r  r{   r   Z
dd_shufflerp   rp   rq   r    s    +z_Frame.shufflec                   sn   |} d kr"|d k	r"tdt|ttfr8|j}n|}jj| ||d}|dks` d krt|r~t|s~d}d|i}n
|f}i }j	t
jf| |||dd|S  dkrd	 d
|d krdn|d
 }	}
n$d jd d
|d krdn| }	}
|d krRdt   fddtjD }tj|gd}t||j}n}|jt
j|	|
 ||dS )Nz%fillna with set limit and method=None)r  r'  limitaxisr  rp   r  F)r'  r  r  r   rB  )rf  ffillr  r   bfillzfillna-chunk-c                   s*   i | ]"}|ft jj|f |kfqS rp   )r*   Zfillna_checkr   rv   r'  r   rk   Z
skip_checkrp   rq   r  "  s   z!_Frame.fillna.<locals>.<dictcomp>r   )r'  r  r   )_validate_axisri   r|   r   r   r   fillnarE   r"   r   rQ   r   r$   r   rN   r   r   r   rg  )rk   r  r'  r  r  Z
test_valuer   rl   rm   ri  rj  r   r   partsrp   r  rq   r    sl    
   


	     z_Frame.fillnac                 C  s   | j d||dS )Nr  r'  r  r  r  rk   r  r  rp   rp   rq   r  4  s    z_Frame.ffillc                 C  s   | j d||dS )Nr  r  r  r  rp   rp   rq   r  8  s    z_Frame.bfillc           	        s   |dk	rFd}t |tr>d|  kr*dkr>n nt| | nt| dkrVtd|dkrhtj }dt | t	j
|} fddt|D }tj|gd	}t|jjS )
a  Random sample of items

        Parameters
        ----------
        n : int, optional
            Number of items to return is not supported by dask. Use frac
            instead.
        frac : float, optional
            Approximate fraction of items to return. This sampling fraction is
            applied to all partitions equally. Note that this is an
            **approximate fraction**. You should not expect exactly ``len(df) * frac``
            items to be returned, as the exact number of elements selected will
            depend on how your data is partitioned (but should be pretty close
            in practice).
        replace : boolean, optional
            Sample with or without replacement. Default = False.
        random_state : int or ``np.random.RandomState``
            If an int, we create a new RandomState with this as the seed;
            Otherwise we draw from the passed RandomState.

        See Also
        --------
        DataFrame.random_split
        pandas.DataFrame.sample
        Nzlsample does not support the number of sampled items parameter, 'n'. Please use the 'frac' parameter instead.r   r  zfrac must not be Nonezsample-c                   s,   i | ]$\}}|ft jj|f| fqS rp   )r*   sampler   r~  r  r   r?  rk   rp   rq   r  j  s    z!_Frame.sample.<locals>.<dictcomp>r   )r|   r	   warningswarnr  r~   randomRandomStater$   ra   r   r  rN   r   r   r   r   )	rk   r  r  r?  r  rO  r  r   r   rp   r  rq   r  <  s$    "

z_Frame.samplec                 C  s4   |d k	rd|ini }| j tjfd|i||ddS )Nr  
to_replaceF)regexrB  )r   rQ   r?  )rk   r  r  r  Zvalue_kwargrp   rp   rq   r?  r  s    z_Frame.replacec                 C  sH   |dkrt | jtdd }| j}| ||}||_|dk	rD||_|S )a  Convert a dask DataFrame to a dask array.

        Parameters
        ----------
        lengths : bool or Sequence of ints, optional
            How to determine the chunks sizes for the output array.
            By default, the output array will have unknown chunk lengths
            along the first axis, which can cause some later operations
            to fail.

            * True : immediately compute the length of each partition
            * Sequence : a sequence of integers to use for the chunk sizes
              on the first axis. These values are *not* validated for
              correctness, beyond ensuring that the number of items
              matches the number of partitions.
        meta : object, optional
            An optional `meta` parameter can be passed for dask to override the
            default metadata on the underlying dask array.

        Returns
        -------
        TFrD  N)r   r   ru   r   values_validate_chunks_chunksr   )rk   lengthsr   arrchunksrp   rp   rq   to_dask_array~  s    z_Frame.to_dask_arrayac                 K  s    ddl m} || ||||f|S )z,See dd.to_hdf docstring for more informationr   )to_hdf)dask.dataframe.ior  )rk   Zpath_or_bufr   moder   rm   r  rp   rp   rq   r    s    z_Frame.to_hdfc                 K  s   ddl m} || |f|S )z,See dd.to_csv docstring for more informationr   )to_csv)r  r  )rk   filenamerm   r  rp   rp   rq   r    s    z_Frame.to_csvfailrN  bool)r   uri	if_existsr   c                 C  s.   ddl m} || |||||||||	|
||dS )z,See dd.to_sql docstring for more informationr   )to_sql)r   r  schemar  r   index_label	chunksizer   r'  r   parallelengine_kwargs)r  r  )rk   r   r  r  r  r   r  r  r   r'  r   r  r  r  rp   rp   rq   r    s     z_Frame.to_sqlc                 O  s   ddl m} || |f||S )z-See dd.to_json docstring for more informationr   )to_json)r  r  )rk   r  rl   rm   r  rp   rp   rq   r    s    z_Frame.to_jsonc                   s^   |   }|   |  d |rJ|  |    d| j tj dd  fdd|D S )a  Convert into a list of ``dask.delayed`` objects, one per partition.

        Parameters
        ----------
        optimize_graph : bool, optional
            If True [default], the graph is optimized before converting into
            ``dask.delayed`` objects.

        Examples
        --------
        >>> partitions = df.to_delayed()  # doctest: +SKIP

        See Also
        --------
        dask.dataframe.from_delayed
        r   r   rp   r   c                   s   g | ]}t | d qS )r   )rK   r  r   r   rp   rq   ry     s     z%_Frame.to_delayed.<locals>.<listcomp>)r   r   r   r   r   rN   r   )rk   r   keysrp   r  rq   r     s    
z_Frame.to_delayedc                   s    fddS )Nc                   s
   t  | S r   r*  r   r   rp   rq   r     r   z,_Frame._get_unary_operator.<locals>.<lambda>rp   )r   r   rp   r   rq   r     s    z_Frame._get_unary_operatorc                   s    |r fddS  fddS d S )Nc                   s   t  || S r   r*  r   r   rp   rq   r     r   z-_Frame._get_binary_operator.<locals>.<lambda>c                   s   t  | |S r   r*  r   r   rp   rq   r     r   rp   r   rp   r   rq   r     s    z_Frame._get_binary_operatorr   c                 C  sd   ddl m} t|tr&|dk r&td|dk	rPt|ts@td|dk rPtd|| |||||dS )aM  Provides rolling transformations.

        Parameters
        ----------
        window : int, str, offset
           Size of the moving window. This is the number of observations used
           for calculating the statistic. When not using a ``DatetimeIndex``,
           the window size must not be so large as to span more than one
           adjacent partition. If using an offset or offset alias like '5D',
           the data must have a ``DatetimeIndex``

           .. versionchanged:: 0.15.0

              Now accepts offsets and string offset aliases

        min_periods : int, default None
            Minimum number of observations in window required to have a value
            (otherwise result is NA).
        center : boolean, default False
            Set the labels at the center of the window.
        win_type : string, default None
            Provide a window type. The recognized window types are identical
            to pandas.
        axis : int, default 0

        Returns
        -------
        a Rolling object on which to call a method to compute a statistic
        r   )Rollingzwindow must be >= 0Nzmin_periods must be an integerzmin_periods must be >= 0)windowmin_periodscenterwin_typer  )rh  r  r|   r   r  )rk   r  r  r  r  r  r  rp   rp   rq   rolling  s"    

z_Frame.rollingc                 C  sn   |  |}t|tstd|dkr:| jtjd|dddS |dkrJ|dfnd| f\}}| jtj||d|dS )a|  
        .. note::

           Pandas currently uses an ``object``-dtype column to represent
           boolean data with missing values. This can cause issues for
           boolean-specific operations, like ``|``. To enable boolean-
           specific operations, at the cost of metadata that doesn't match
           pandas, use ``.astype(bool)`` after the ``shift``.
        periods must be an integerr  diffF)r  periodsr  rB  r   r  r  )r  r|   r   r   r   rQ   r  rg  )rk   r  r  ri  rj  rp   rp   rq   r  .  s    

    z_Frame.diffc              	   C  s   |  |}t|tstd|dkr<| jtjd||dddS |d krx|dkrT|dfnd| f\}}| jtj||d|dS | jj||d}| jtjd|||ddd	}t	|||dS )
Nr  r  shiftF)r  r  r  r  rB  r   r  r  )r  r  r  r   rB  transform_divisions)
r  r|   r   r   r   rQ   r  rg  r   maybe_shift_divisions)rk   r  r  r  ri  rj  r   r   rp   rp   rq   r  E  s@    

	    	z_Frame.shiftc              	   C  s   |  |}tt| j|dr&d|i}ni }t " t| j|f ||d|}W 5 Q R X | j| }	|dkr| jtf||	|||d|}
t||
S | j	tf||	||||d|}
t
| tr| j | j f|
_t||
S d S )Nrg   r  skipnar  )r   r  r  r  _dask_method_name)r   r  r  r  r  r  )r  rW   r  r   r.   _token_prefixr   _getattr_numeric_only
handle_outr  r|   r#  r=  r   r   r   )rk   r   r  r  r  r   rg   Znumeric_only_kwargsr   r  rE  rp   rp   rq   _reduction_aggh  sP    	

 
	


z_Frame._reduction_aggc                   s:   |  tj }| jr6t| r6t fdd| jD |_|S )Nc                 3  s   | ]} t | V  qd S r   rN  rw   divisionprefixrp   rq   r    s     z$_Frame.add_prefix.<locals>.<genexpr>)r   rQ   
add_prefixr/  rE   r   r   )rk   r  resrp   r  rq   r    s    z_Frame.add_prefixc                   s:   |  tj }| jr6t| r6t fdd| jD |_|S )Nc                 3  s   | ]}t |  V  qd S r   r  r  suffixrp   rq   r    s     z$_Frame.add_suffix.<locals>.<genexpr>)r   rQ   
add_suffixr/  rE   r   r   )rk   r  r  rp   r  rq   r    s    z_Frame.add_suffixc                 C  s&   t | d | j }| jtj|ddS )NabsFr   rB  )_raise_if_object_seriesr   r  r   rQ   rk   r   rp   rp   rq   r    s    

z
_Frame.absc                 C  s   | j d||||dS )Nallr  r  r  r   r  rk   r  r  r  r   rp   rp   rq   r    s        z
_Frame.allc                 C  s   | j d||||dS )Nr  r  r  r  rp   rp   rq   r    s        z
_Frame.anyc           	        sb   | j d||||d |rZ|  j|d|k}t|rD j|tjdS t fdd|dS n S d S )Nr  r  r  r   c                   s   | |kr S t jS r   r~   NaNr  yrE  rp   rq   r     r   z_Frame.sum.<locals>.<lambda>Tr  notnullr  rE   wherer~   r  r   	rk   r  r  r  r   r   Z	min_countrg   condrp   r  rq   r    s"        
  z
_Frame.sumc           	        sb   | j d||||d |rZ|  j|d|k}t|rD j|tjdS t fdd|dS n S d S )Nprodr  r  r  c                   s   | |kr S t jS r   r   r  r  rp   rq   r     r   z_Frame.prod.<locals>.<lambda>Tr  r  rp   r  rq   r
    s"    
  z_Frame.prodc                 C  s   | j d||||dS )Nr   r  r  rk   r  r  r  r   rg   rp   rp   rq   r     s    z
_Frame.maxc                 C  s   | j d||||dS )Nr   r  r  r  rp   rp   rq   r     s    z
_Frame.minc                 C  s   d}|  |}| jj||d}|dkrDttj| || j| ||ddS t| }t| gtt	t
|d|i| j| |||d
}t| trt| jt| jf|_|S d S )Nidxmaxr  r  Fr   r  r  r  rB  scalar	rS  rT  rs  r   ru  r  r  r  fn)r  r   r  r   rQ   r  rE   rZ  idxmaxmin_chunkidxmaxmin_aggidxmaxmin_combiner|   r#  r   r=  r   r   rk   r  r  r  r  r   r  rE  rp   rp   rq   r    s:    



z_Frame.idxmaxc                 C  s   d}|  |}| jj|d}|dkrBttj| || j| ||ddS t| }t| gt	t
t|d|i| j| |||d
}t| trt| jt| jf|_|S d S )Nidxminr  r  Fr  r  r  )r  r   r  r   rQ   r  r  rE   rZ  r  r  r  r|   r#  r   r=  r   r   r  rp   rp   rq   r  -  s:    



z_Frame.idxminc                 C  s   |  |}| jd }|dkr@| jj|d}| jtj|||ddS | j }| jtjt|||d}t| t	r| j
 | j
 f|_|S d S )Ncountr  r  Fr   r  r  rB  )rT  r   r  r  )r  r  r   r  r   rQ   r  _count_aggregater|   r#  r=  r   r   r   )rk   r  r  rg   r  r   rE  rp   rp   rq   r  N  s,    

    

z_Frame.countc                 C  s.   | j tjtjt|d|id|id}| j|_|S )Ndropna)rS  rs  rT  r  rt  ru  )r  rQ   value_countsr  _mode_aggregater   )rk   r  r  mode_seriesrp   rp   rq   r  g  s    z_Frame.modec              
   C  s   |  |}t| d t  | jj|||d}W 5 Q R X |dkrjttj| || jd ||d|d}t||S | 	 }	|	j
||d}
|	j|d}| jdt| ||  }ttj|
|||d| jd	}t| tr| j | j f|_t||S d S )
Nmeanr  r  rg   r  F)r   r  r  r  rB  rg   )r  r  r  zmean-%s)r  r   rB  r   )r  r  r.   r   r  r   rQ   r  r  rj   r  r  r$   r*   Zmean_aggregater   r|   r#  r=  r   r   r   )rk   r  r  r  r   r   rg   r   rE  numsr  r   rp   rp   rq   r  t  sH    

  

	
z_Frame.meandefaultc                 C  s   | j d||ddS )a  Return the approximate median of the values over the requested axis.

        Parameters
        ----------
        axis : {0, 1, "index", "columns"} (default 0)
            0 or ``"index"`` for row-wise, 1 or ``"columns"`` for column-wise
        method : {'default', 'tdigest', 'dask'}, optional
            What method to use. By default will use Dask's internal custom
            algorithm (``"dask"``).  If set to ``"tdigest"`` will use tdigest
            for floats and ints and fallback to the ``"dask"`` otherwise.
              ?)qr  r'  N)quantiler   rk   r  r'  rp   rp   rq   median_approximate  s    z_Frame.median_approximatec                 C  s,   |dks| j dkr | j||dS tdd S )Nr  r=  r  )r  r'  Dask doesn't implement an exact median in all cases as this is hard to do in parallel. See the `median_approximate` method instead, which uses an approximate algorithm.r   r'  ri   r&  rp   rp   rq   median  s
    z_Frame.medianc           
      C  s   |  |}t| d t  | jj|||d}W 5 Q R X |dkrlttj| || jd |||d|d	}	t||	S | j	dkr| 
| |||}	t||	S | |||}	t| tr| j | j f|	_t||	S d S )Nvarr  r  F)r   r  r  r  ddofrB  rg   )r  r  r.   r   r,  r   rQ   r  r  ndim_var_1d_var_numericr|   r#  r=  r   r   r   )
rk   r  r  r-  r  r   r   rg   r   rE  rp   rp   rq   r,    s8    

  



z
_Frame.varc                 C  s   | j ddgtjgd}|jj}|j}t|tjs>|jd}|sJ|d krPtj	ntj
}||d||d}| jd t|| }	t|r|jjnd }
|jjj
ddj}|jfd	t|  }|	dftj||
fi}tj|	||gd
}t||	|j
 d d gdS )Nnumberr  includeexcludef8r   r  r-  r  var-numericr  r   r   r  )select_dtypesr~   timedelta64r  r   
issubdtyper1  astyper   nanvarr,  r  r$   rC   r   r=  r   r"  r   ru   r*   wrap_var_reductionrN   r   r   )rk   r  r-  r  r   values_dtypearray_valuesr,  	array_varr   rR  Z	var_shapeZarray_var_namer   r   rp   rp   rq   r0    s&       z_Frame._var_numericc           	        s   j tjgd fddjjD }dd |D }jd t }|dftj|jjfi}t	j
|||d}t||j d d gdS )	Nr3  c                   s    g | ]} |  qS rp   )r/  rw   col_idxr-  rk   r  r  Z
timedeltasrp   rq   ry   	  s   z*_Frame._var_timedeltas.<locals>.<listcomp>c                 S  s   g | ]}|j d fqS r8  r   r  rp   rp   rq   ry   	  s     zvar-timedeltas-r   r   r  )r9  r~   r:  r   r=  r  r$   r*   r>  rN   r   r   r   r,  )	rk   r  r-  r  Zvar_timedeltasZvar_timedelta_namesr   r   r   rp   rE  rq   _var_timedeltas	  s0         z_Frame._var_timedeltasc           
      C  s   | j ddtjgd}| |||}| |||}| jd t|| }|dftj|j	df|j	df|j
jfi}tj||||gd}	t|	|| j d d gdS )Nr1  r  rB  z
var-mixed-r   r   r  )r9  r~   r:  rF  r0  r  r$   r*   Zvar_mixed_concatr   r   r=  rN   r   r   r   r,  )
rk   r  r-  r  r:  Ztimedelta_varsZnumeric_varsr   r   r   rp   rp   rq   
_var_mixed 	  s,    	     z_Frame._var_mixedc                 C  s   t |j}|r>|s0| }|d}||}n| d}tj|j	rV|d}t
|jt
jsp|d}| jd t|| }|s|d krtjntj}||jd||d}	|dftj|	jfd fi}
tj||
|	gd}t|||j	 d d gdS )Ni8r5  zvar-1d-r   r6  r   r  )r   r   isnar<  maskr  r   
Int64Dtypeis_dtyper   r~   r;  r   r1  r  r$   r   r=  r,  r  r*   r>  r   rN   r   r   )rk   columnr  r-  r  Zis_timedeltais_nanr   r,  rA  r   r   rp   rp   rq   r/  8	  s,    



   z_Frame._var_1dc                 C  sz  |  |}t| d t| d t  | jj|||d}W 5 Q R X t| j}	d}
| }tr|	r| jj	ddj
}t|dkr| ||||\}}
n tr|	st| j}
|
rt| |}|dkrt|
stjnt||| jd |||d|| jd
}t||S |j|||d	}| jd }|
r*|	|	r|nd |d
}t}n
i }tj}t||f||d| jd|}|	rpt|drp||j}t||S )Nstdr  FdatetimerB  r   r  )r   r  r  r  r-  rB  rg   r   r  r-  r  )
is_df_like	time_colsr  r   r  rB  r   r   )r  r  !_raise_if_not_series_or_dataframer.   r   rO  rC   r   r:   r9  r=  ru   _convert_time_cols_to_numericr   _convert_to_numericr   rQ   _sqrt_and_convert_to_timedeltar  r  r,  r~   sqrtr   r<  r   )rk   r  r  r-  r  r   r   rg   r   rR  needs_time_conversion
numeric_ddrS  rE  r  r   Zsqrt_func_kwargsZ	sqrt_funcrp   rp   rq   rO  U	  s|    


  




z
_Frame.stdc           	      C  s   ddl m} d}|dkr(| |j  }n|  }|dkr|t|t| jkr|d}|t| dt| tj	gi| jd| j
d}n|D ]}t|| |||< q||fS )	Nr   from_pandasTr  Fr  r   r   )r  r]  r   r  ru   r=  rG   rH   r~   nanr   rW  )	rk   rS  r  r   r  r]  rZ  r[  rJ  rp   rp   rq   rV  	  s"    	z$_Frame._convert_time_cols_to_numeric	propagatec                 C  s   |  |}t| d | j }|dkrLttj| || jd |dd}t||S | jdkrp| j	| ||d}t||S | j
||d}t| tr| j | j f|_t||S dS )af  
        .. note::

           This implementation follows the dask.array.stats implementation
           of skewness and calculates skewness without taking into account
           a bias term for finite sample size, which corresponds to the
           default settings of the scipy.stats skewness calculation. However,
           Pandas corrects for this, so the values differ by a factor of
           (n * (n - 1)) ** 0.5 / (n - 2), where n is the number of samples.

           Further, this method currently does not support filtering out NaN
           values, which is again a difference to Pandas.
        skewr  Fr  )bias
nan_policyN)r  r  r   rb  r   rQ   r  r  r.  _skew_1d_skew_numericr|   r#  r=  r   r   r   )rk   r  rc  rd  r   rg   r   rE  rp   rp   rq   rb  	  s(    






z_Frame.skewc           	      C  s   ddl m} tj|jr$|d}t|j	tj
s>|d}| jd t| }|j|jd||d}|dftj|jfdfi}tj|||gd}t|||j ddgdS )	z1D version of the skew calculation.

        Uses the array version from da.stats in case we are passing in a single series
        r   statsr5  zskew-1d-r  rc  rd  Nr   r  )
dask.arrayrh  r   rK  rL  r   r<  r~   r;  r   r1  r  r$   rb  r  r*   wrap_skew_reductionr   rN   r   r   )	rk   rM  rc  rd  da_statsr   
array_skewr   r   rp   rp   rq   re  	  s(    

      z_Frame._skew_1dc                 C  s   ddl m} | jddgtjgd}|jj}|j}t|tjsJ|j	d}|j
|d||d}| jd t| }t|r~|jjnd	}	|jjjdd
j}
|jfdt|
  }|dftj||	fi}tj|||gd}t|||j
 d	d	gdS )Method for dataframes with numeric columns.

        Maps the array version from da.stats onto the numeric array of columns.
        r   rg  r1  r  r2  r5  ri  r7  Nr  r8  r   r  )rj  rh  r9  r~   r:  r  r   r;  r1  r<  rb  r  r$   rC   r   r=  r   r,  r"  r   ru   r*   rk  rN   r   r   )rk   rc  rd  rl  r   r?  r@  rm  r   rR  Z
skew_shapeZarray_skew_namer   r   rp   rp   rq   rf  
  s0          z_Frame._skew_numericc           	      C  s   |  |}t| d | j }|dkrLttj| || jd |dd}t||S | jdkrr| j	| |||d}t||S | j
|||d}t| tr| j | j f|_t||S dS )a  
        .. note::

           This implementation follows the dask.array.stats implementation
           of kurtosis and calculates kurtosis without taking into account
           a bias term for finite sample size, which corresponds to the
           default settings of the scipy.stats kurtosis calculation. This differs
           from pandas.

           Further, this method currently does not support filtering out NaN
           values, which is again a difference to Pandas.
        kurtosisr  Fr  )fisherrc  rd  N)r  r  r   ro  r   rQ   r  r  r.  _kurtosis_1d_kurtosis_numericr|   r#  r=  r   r   r   )	rk   r  rp  rc  rd  r   rg   r   rE  rp   rp   rq   ro  ,
  s:    




   
  
z_Frame.kurtosisc           
      C  s   ddl m} tjj|jr&|d}t	|j
tjs@|d}| jd t| }|j|jd|||d}|dftj|jfdfi}tj|||gd}	t|	||j ddgdS )	z1D version of the kurtosis calculation.

        Uses the array version from da.stats in case we are passing in a single series
        r   rg  r5  zkurtosis-1d-r  rp  rc  rd  Nr   r  )rj  rh  r   apitypesZis_integer_dtyper   r<  r~   r;  r   r1  r  r$   ro  r  r*   wrap_kurtosis_reductionr   rN   r   r   )
rk   rM  rp  rc  rd  rl  r   array_kurtosisr   r   rp   rp   rq   rq  `
  s6    

          z_Frame._kurtosis_1dc                 C  s   ddl m} | jddgtjgd}|jj}|j}t|tjsJ|j	d}|j
|d|||d}| jd t| }	t|r|jjnd	}
|jjjdd
j}|jfdt|  }|	dftj||
fi}tj|	||gd}t||	|j
 d	d	gdS )rn  r   rg  r1  r  r2  r5  rs  zkurtosis-numericNr  r8  r   r  )rj  rh  r9  r~   r:  r  r   r;  r1  r<  ro  r  r$   rC   r   r=  r   r,  r"  r   ru   r*   rv  rN   r   r   )rk   rp  rc  rd  rl  r   r?  r@  rw  r   rR  Zkurtosis_shapeZarray_kurtosis_namer   r   rp   rp   rq   rr  
  s>         
     z_Frame._kurtosis_numericc                 C  s   |  |}t| d t  | jj||||d}W 5 Q R X |dkrfttj| || jd |||| j|d	S | 	 }|j
|||d}|j|d}	| jd }
ttj||	 ||
d| jd}t| tr| j | j f|_|S d S )	Nsem)r  r  r-  rg   r  )r   r  r  r  r-  r   rg   rQ  r  FrT  )r  r  r.   r   rx  r   rQ   r  r   rj   r,  r  r~   rY  r|   r#  r=  r   r   r   )rk   r  r  r-  r  rg   r   r   r  r  r   rE  rp   rp   rq   rx  
  sH    

   
	
z
_Frame.semr#  c                   s@   |}dt| }jj||d}|dkrhttrFtdttj||d|dfjd	S t	d 
 }t fd	d
|jD }dd |D }	t|d tr|dft||	|jd|jfi}
tj||
|d}t|jt|jf}t||||S |dftj|	dfi}
tj||
|d}t||||d jS dS )a  Approximate row-wise and precise column-wise quantiles of DataFrame

        Parameters
        ----------
        q : list/array of floats, default 0.5 (50%)
            Iterable of numbers ranging from 0 to 1 for the desired quantiles
        axis : {0, 1, 'index', 'columns'} (default 0)
            0 or 'index' for row-wise, 1 or 'columns' for column-wise
        method : {'default', 'tdigest', 'dask'}, optional
            What method to use. By default will use dask's internal custom
            algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest
            for floats and ints and fallback to the ``'dask'`` otherwise.
        zquantiles-concat--)r  rg   r  z+'q' must be scalar when axis=1 is specifiedFr5  )r  rB  rg   r   r   r%  c                 3  s   | ]}t |  V  qd S r   r%  rw   cr'  r$  rk   rp   rq   r  
  s     z"_Frame.quantile.<locals>.<genexpr>c                 S  s   g | ]}|j d fqS r8  r   )rw   Z_qrp   rp   rq   ry   
  s     z#_Frame.quantile.<locals>.<listcomp>r   Nr   )r  r$   r   r%  r|   r   r  r   rQ   r  rj   r   r=  r   r   r   rN   r   r   r   r   r*   r   r   r   )rk   r$  r  rg   r'  keynamer   r   Z	quantilesqnamesr   r   r   rp   r|  rq   r%  
  sN    


     z_Frame.quantilec                   s  t rd i}n rtdni }jjdkrbjjf ||d|} }	||	_|	S |d kr|d krtjtj	g}
 r|

tj jj|
d}t|jdkrjj}nNtj	tg}
 r|

tj jj|
d}t|jdkr S |j}n:|dkr2|d k	r(d}t|jj}njj||d	} fd
d|D }dd |D }dt }|dftj|fi}tj|||d}jjf ||d	|}t|||d d gdS )Ndatetime_is_numeric>datetime_is_numeric=True is only supported for pandas >= 1.1.0r  )percentilesr3  r4  rB  r   r  z*exclude must be None when include is 'all'r2  c              	     s"   g | ]} |  qS rp   )_describe_1drC  r  r  percentiles_methodrk   r  rp   rq   ry   F  s   z#_Frame.describe.<locals>.<listcomp>c                 S  s   g | ]}|j d fqS r8  r   rw   r!  rp   rp   rq   ry   P  s     z
describe--r   r  )r9   ri   r   r.  r   describer  r~   r1  r:  r   Z
datetime64r9  ru   r=  r  _describe_numericr  r$   r*   Zdescribe_aggregaterN   r   r   )rk   r  r  r  r3  r4  r  datetime_is_numeric_kwargr   outputZ_includer:  Zchosen_columnsZbools_and_timesrO  rh  stats_namesr   r   r   rp   r  rq   r    s~    
    





 z_Frame.describec                 C  s   t |jr| j|||dS t|jr6| j||||dS t|jrX| j| |||ddS t|jr~|r~| j| |||ddS | j|||dS d S )N)r  r  )r  r  r  T)r  r  r  is_timedelta_column)r  r  r  is_datetime_column)r   r   _describe_nonnumeric_1dr   r  r   r  r   )rk   r:  r  r  r  r  rp   rp   rq   r  Z  sD    
  

  z_Frame._describe_1dc                 C  s\  ddl m} |s|r||}n| }|jdkrHt|jdkrHtdn|jdkrd|jdkrdtd|d krxdd	d
g}n(t	|}t
|d	}t|}t|}|j|d|j|d|j|d|j|d|j||d|j|dg}	dd |	D }
t|jr|jjnd }dt|| }|dftj|
|||fi}tj|||	d}|j }t|||d d gdS )Nr   
to_numericrM  z)DataFrame contains only non-numeric data.r  r  z,Cannot compute ``describe`` on object dtype.g      ?r#  g      ?r  r'  c                 S  s   g | ]}|j d fqS r8  r   r  rp   rp   rq   ry     s     z,_Frame._describe_numeric.<locals>.<listcomp>zdescribe-numeric--r   r  )dask.dataframe.numericr  rj   r.  ru   r=  r  r   r~   r  r   r   r   r  r  rO  r   r%  r   rE   r   r   r$   r*   Zdescribe_numeric_aggregaterN   r   r   r  r   )rk   r:  r  r  r  r  r  r  r   rh  r  colnamer   r   r   r   rp   rp   rq   r    sF    	








	
z_Frame._describe_numericc                 C  s  ddl m} |j|d}||dk }|j}||j|d|jdddddg}t|jr|s|| j	|d}	|| j
|d}
||	|
g dd |D }|jj}d	t|| }|dftj||fi}tj|||d
}trd|i}n|rtdni }|jjf |}t|||d d gdS )Nr   r  r  r  F)r   r   r  c                 S  s   g | ]}|j d fqS r8  r   r  rp   rp   rq   ry     s     z2_Frame._describe_nonnumeric_1d.<locals>.<listcomp>zdescribe-nonnumeric-1d--r   r  r  r  )r  r  r  r  r  r  r   r   r  r   r   extendr   r$   r*   Zdescribe_nonnumeric_aggregaterN   r   r9   ri   r   r  r   )rk   r:  r  r  r  ZvcountsZcount_nonzeroZcount_uniquerh  Zmin_tsZmax_tsr  r  r   r   r   r  r   rp   rp   rq   r    s8    
	 

z_Frame._describe_nonnumeric_1dc                 C  s  |  |}|dkrB| j | d}| j|fd|i|}	t||	S | j | d}
t|| f|
| d|}| j | d}tt||t| g dd|d	}t| }| j | d
| }| j | d| }i }|jdf||df< td| j	D ]j}|dkr|j|d f|||f< n(t
j|||d f|j|d ff|||f< ||j|f||ff|||f< qtj||||gd}t|||| j| j}	t||	S dS )z Wrapper for cumulative operationr  z(axis=1)r  z-mapr  r   z
-take-lastr_  r   r   r  r   z
-cum-last-r   r   N)r  r  r   r  
_take_lastrH   r$   r   r   r   r*   Z_cum_aggregate_applyrN   r   r   r   r   )rk   Zop_namerS  rT  r  r  rt  r   r   rE  name1Zcumpartr  Zcumlastr  cnamer   rx   r   rp   rp   rq   _cum_agg  sX    

  

  z_Frame._cum_aggc              
   C  s$   | j dtjtj||t||d|dS )Ncumsumr  rS  rT  r  r  rt  r   )r  rQ   r  r*   Zcumsum_aggregater  rk   r  r  r   r   rp   rp   rq   r    s    
z_Frame.cumsumc              
   C  s$   | j dtjtj||t||d|dS )Ncumprodr  r  )r  rQ   r  r*   Zcumprod_aggregater  r  rp   rp   rq   r  '  s    
z_Frame.cumprodc              
   C  s$   | j dtjtj||t||d|dS )Ncummaxr  r  )r  rQ   r  r*   Zcummax_aggregater  rk   r  r  r   rp   rp   rq   r  3  s    
z_Frame.cummaxc              
   C  s$   | j dtjtj||t||d|dS )Ncumminr  r  )r  rQ   r  r*   Zcummin_aggregater  r  rp   rp   rq   r  ?  s    
z_Frame.cumminc                 C  s   t tj| ||ddS rC  )r   rQ   r  rk   r	  r   rp   rp   rq   r  K  s    z_Frame.wherec                 C  s   t tj| ||ddS rC  )r   rQ   rJ  r  rp   rp   rq   rJ  Q  s    z_Frame.maskc                 C  s   | j tjddS rC  )r   rQ   r  r   rp   rp   rq   r  U  s    z_Frame.notnullc                 C  s   | j tjddS rC  )r   rQ   isnullr   rp   rp   rq   r  Y  s    z_Frame.isnullc                 C  s&   t tdr| jtjddS tdd S )NrI  FrD  zNNeed more recent version of Pandas to support isna. Please use isnull instead.)r   r   r   rQ   rI  ri   r   rp   rp   rq   rI  ]  s
    
z_Frame.isnac                 C  s   t | jrttjtjf}ntf}t||r>tdtt	| | j
|}t|trztt|}W n tk
rx   Y nX tdd |D sztj|td}W n tk
r   Y nX | jtjt||ddS )NzPassing a %r to `isin`c                 s  s   | ]}t |V  qd S r   )r"   r  rp   rp   rq   r  ~  s     z_Frame.isin.<locals>.<genexpr>r   Fr  )rC   r   r   r   r   r#  r|   ri   rb   r   r   isinr   r   r   r  r~   Zfromiterr  r  r   rQ   rL   )rk   r  Z	bad_typesr   rp   rp   rq   r  h  s,    


   z_Frame.isinc                 C  s   t | jr t|r | j|}n| j|}t|drVdd | D }t||d}n t|rvt|dd d krvt|}| j	t
j||ddS )Nitemsc                 S  s,   g | ]$\}}t |rt|d ddkr|qS )
categoriesN)rB   r  rw   r  r  rp   rp   rq   ry     s    z!_Frame.astype.<locals>.<listcomp>)rR  r  F)r   r   rB  )rC   r   rB   r   r<  r   r  r=   r  r   rQ   )rk   r   r   Zset_unknownrp   rp   rq   r<    s     
   z_Frame.astypec                 C  sH   t rtdt ddlm} t|ttfr6d}t	||| |gd|dS )NzzThe frame.append method is deprecated and will be removed fromdask in a future version. Use dask.dataframe.concat instead.r   r   z)append doesn't support list or dict inputouter)joininterleave_partitions)
r+   r  r  FutureWarningdask.dataframe.multir   r|   r   r  ri   )rk   r   r  r   rO  rp   rp   rq   r     s      z_Frame.appendc                   st   t |tstdt |trN jtj|d|d}|j|jdj	dd |j
dS  fdd	} j||d|djd
dS )Nz9The second operand must be a dask array or dask dataframedotr  )byc                 S  s   | j ddS )NFr  )r  r  rp   rp   rq   r     r   z_Frame.dot.<locals>.<lambda>r   c                    s   t  tj| |S r   )rH   rQ   r  )rl   rm   r   rp   rq   _dot_series  s    z_Frame.dot.<locals>._dot_seriesFr  )r|   r   r   r#  r   rQ   r  groupbyr   rT   r   r  )rk   r   r   r!  r  rp   r   rq   r    s    

 z
_Frame.dotr  c                   s   t tj| ||||d\}}| jtj||||dd}t| ||||}d|   fddt| D }	|	|j t	|	 ||j
}
d| fddt| D }||j t	|||j
}|
|fS )	Nr  
fill_valueF)r  r  r  rB  zalign1-c                   s    i | ]\}} |ft |d fqS r8  r
   r  )r  rp   rq   r    s    z _Frame.align.<locals>.<dictcomp>zalign2-c                   s    i | ]\}} |ft |d fqS r  r
   r  )r  rp   rq   r    s    )_emulaterQ   alignr   r$   r  r   r   r   r   r   )rk   r   r  r  r  Zmeta1Zmeta2Zalignedr  Zdsk1Zresult1r  Zresult2rp   )r  r  rq   r    s<         
	



z_Frame.alignc                 C  s   | j tj||||dS )N)r  	overwriter   rQ   rs  )rk   r   ro   r  r  rp   rp   rq   rs    s        z_Frame.combinec                 C  s   |  tj|S r   r   rQ   combine_firstr   rp   rp   rq   r    s    z_Frame.combine_firstc                 C  s   t dS )5bind operator method like DataFrame.add to this classNr  )r   r   r   originalrp   rp   rq   _bind_operator_method  s    z_Frame._bind_operator_methodc                 C  s   ddl m} || |||dS )Nr   )	Resampler)closedlabel)Zdask.dataframe.tseries.resampler  )rk   Zruler  r  r  rp   rp   rq   resample  s    z_Frame.resamplec           	        s   j | jstdtjj|}jd | }j	
|}| }|pXt|d }|jd krpj}njd |d  |f }dt|   fddt|D }tjj|fd ||df| |f< tj |gd	}t| |S )
Nz0`first` is not implemented for unknown divisionsr   deltar  zfirst-c                   s   i | ]} |fj |fqS rp   r   rv   r  rp   rq   r    s      z _Frame.first.<locals>.<dictcomp>Tr   )r   r   r/  r  r   tseriesfrequencies	to_offsetr   r  _get_partitionsis_anchoredr   r   r$   r   r*   boundary_slicer   rN   r   r   )	rk   offsetdateendr  Zinclude_rightdivsr   r   rp   r  rq   r     s,    z_Frame.firstc                   s   j | jstdtjj|}jd | }j	
|}|dkrRj}n|fj|d d   }dt|   fddtt|jD }tjj|f|d dd	f| df< tj |gd
}t| |S )Nz/`last` is not implemented for unknown divisionsr6  r   r  zlast-c                   s(   i | ] \}} |d  fj |d  fqS r  r   )rw   rx   r  r  rp   rq   r  5  s   
 z_Frame.last.<locals>.<dictcomp>TFr   )r   r   r/  r  r   r  r  r  r   r  r  r$   r  r   r   r*   r  r   rN   r   r   )rk   r  r  startr  r   r   rp   r  rq   last#  s,    z_Frame.lastc              	   C  s*   ddl m} t| g|j|j|j|dtdS )a9  Approximate number of unique rows.

        This method uses the HyperLogLog algorithm for cardinality
        estimation to compute the approximate number of unique rows.
        The approximate error is 0.406%.

        Parameters
        ----------
        split_every : int, optional
            Group partitions into groups of this size while performing a
            tree-reduction. If set to False, no tree-reduction will be used.
            Default is 8.

        Returns
        -------
        a float representing the approximate number of elements
        r   )hyperloglog   )rS  rs  rT  r  br   )dask.dataframer  rZ  Zcompute_hll_arrayZreduce_stateZestimate_countr_  )rk   r  r  rp   rp   rq   nunique_approxD  s    z_Frame.nunique_approxc                 C  s   |  tjS )zReturn a dask.array of the values of this dataframe

        Warning: This creates a dask.array without precise shape information.
        Operations that depend on shape information, like slicing or reshaping,
        will not work.
        )r   r*   r  r   rp   rp   rq   r  b  s    z_Frame.valuesc                 C  s   ddl m} t|trtt|}t|| jkrFtdt| d| j | jdkr\||f}n||t| j	ff}|S |d k	rtd| d|j
S )Nr   )normalize_chunkszJThe number of items in 'lengths' does not match the number of partitions.  != r  z!Unexpected value for 'lengths': '')dask.array.corer  r|   r   r   ru   r   r  r.  r=  r  )rk   r  r  r  r  rp   rp   rq   r  l  s    

z_Frame._validate_chunksc                 C  sF   | j jdk	oDt| oDt|s*t|toD|| j jkoD|t| ddkS )z
        Test whether a key is an index level reference

        To be considered an index level reference, `key` must match the index name
        and must NOT match the name of any column (if a dataframe).
        Nr=  rp   )r   r   r"   r~   isscalarr|   r   r  rk   r   rp   rp   rq   _is_index_level_reference  s    
z _Frame._is_index_level_referencec                   s.   t |tr t fdd|D S  |S dS )zb
        Test whether the input contains a reference to the index of the DataFrame/Series
        c                 3  s   | ]}  |V  qd S r   )r  rw   r  r   rp   rq   r    s     z._Frame._contains_index_name.<locals>.<genexpr>N)r|   r   r  r  )rk   columns_or_indexrp   r   rq   _contains_index_name  s    
z_Frame._contains_index_name)F)N)N)F)N)NNr  F)TF)NF)r1  r  T)r1  T)NNNNF)NNNFN)NNNN)NN)NN)NNFN)NNF)NN)r  F)
Nr  TNNNNTFN)T)F)NFNr   )r  r   )r  Nr   )NTFNN)NTFN)NTFN)NTFNNNN)NTFNNNN)NTFNN)NTFNN)NTF)NTF)NFN)TF)NTFNNN)Nr"  )Nr"  )NTr  FNNN)Tr  F)Tr  F)Tr  F)Tr  F)NTr  FNNN)NTra  NN)Tra  )Tra  )NTTra  NN)TTra  )TTra  )NTr  FN)r#  r   Tr"  )FNr"  NNF)FNr"  F)FNr"  FF)FF)TNN)NTNN)NTNN)NTN)NTN)F)r  NN)NT)NN)N)r   r   r   r   r   r   r   r   r   r)   r8   r!   r   r   r   r   r   r   r   r   r   r   setterr   rU   r   r#  r  r  r   r   r   r   r  r   r  r)  r+  r,  r0  r   r   rG  r/  rH  rL  rP  rX  r[  r   __nonzero__r^  r`  ra  Z__long__rc  rA   r   rg  rn  
no_defaultr  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r?  r  r  r  r  r  r   r   r   r   r  r  r  r  r  r  r  r  r  rs   r  r
  productr   r   r  r  r  r  r  r'  r+  r,  r0  rF  rG  r/  rO  rV  rb  re  rf  ro  rq  rr  rx  r%  r  r  r  r  r  r  r  r  r  r~   r`  r  rJ  r  r  rI  r  r<  r-   r   r   r  r  rs  r  r  r  r   r  r  r  r  r  r  rp   rp   rp   rq   r   ?  s    








	


	
8      
 
  
4#


    
g    
7>6$
         "
3%    
0


                                   0 
	      )      P        )!     2%%9     W   
+    
8  
-    
6



 !! 	r   c                 C  s.   t | tr*t| dr*| jtkr*td| dS )zv
    Utility function to raise an error if an object column does not support
    a certain operation like `mean`.
    r   z%`%s` not supported with object seriesN)r|   r   r   r   r  r  r  rV   rp   rp   rq   r    s    r  c                      s`  e Zd ZU dZejZeeZ	dZ
e Zded< dddZedd	 Zed
d Zejdd Zedd Zedd Zedd ZedeZedeZedeZdd Zedd Zdd Zdd Z ddd Z!e"ejdd"d#Z#e"ej$dd%d&Z%dd)d*Z&dd+d,Z'e"ejdd-d.Z(dd0d1Z)d2d3 Z*e"ej$dd5d6Z+e,s\e"ejd7d8 Z-e"ejd9d: Z.e/d;d<d=d> Z0e1dd?d@Z2e"ejde3dddfdAdBZ4e"ejd fdCdD	Z5e"ejd fdFdG	Z6e"ejdHdI Z7ddKdLZ8e"ejddMdNZ9e"ejddOdPZ:e"ejddRdSZ;e"ejddTdUZ<e"ej fdVdWZ=e>dXdYe"ejde?fdZd[Z@e"ejd\d] ZAe"ejdd_d`ZBe"ejddadbZCe"ejdcdd ZDe"ejdedf ZEe"ejd fdhdi	ZFe"ejddjdkZGe"ejdldm ZHe"ejdndo ZIddqdrZJe"ejddsdtZKe"ejddudvZLe1ejfdwdxZMe1ejfdydzZNe>dXdYdEe?d{fd|d}ZOe"ejdd~dZPe"ejdddZQe"ejdddZRe"ejdddZSdd ZTdd ZUe,see"ejdd ZVee"ejdd ZWee"ejdd ZXe"ejdd ZY  ZZS )r   a  Parallel Pandas Series

    Do not use this class directly.  Instead use functions like
    ``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.

    Parameters
    ----------

    dsk: dict
        The dask graph to compute this Series
    _name: str
        The key prefix that specifies which keys in the dask comprise this
        particular Series
    meta: pandas.Series
        An empty ``pandas.Series`` with names, dtypes, and index matching the
        expected output.
    divisions: tuple of index values
        Values along which we partition our blocks on the index

    See Also
    --------
    dask.dataframe.DataFrame
    series-ClassVar[set[str]]
_accessorsNc                 C  s   t |trRt|dkrRt |d d tjrB|d d jdkrBd }q|d d j}nLz$dd l}d| d d  d}W n t	k
r   d}Y nX t
| dt| ||| jdS )	Nr   r  rp   `   This methodz0 is not implemented for `dask.dataframe.Series`.r   r   )r|   r   ru   r~   r   r"  r   inspectstack
IndexErrorri   rH   r   rk   r  r  r   r  method_namerp   rp   rq   r    s    &
zSeries.__array_wrap__c                 C  s   | j gS r   r^  r   rp   rp   rq   axes  s    zSeries.axesc                 C  s   | j jS r   )r   r   r   rp   rp   rq   r     s    zSeries.namec                 C  s&   || j _t| |}|j| _|j| _d S r   )r   r   _rename_daskr   r   )rk   r   renamedrp   rp   rq   r     s    
c                 C  s   dS )Return dimensionalityr  rp   r   rp   rp   rq   r.    s    zSeries.ndimc                 C  s   | j fS )a  
        Return a tuple representing the dimensionality of a Series.

        The single element of the tuple is a Delayed result.

        Examples
        --------
        >>> series.shape  # doctest: +SKIP
        (dd.Scalar<size-ag..., dtype=int64>,)
        )r  r   rp   rp   rq   r"    s    zSeries.shapec                 C  s   | j jS )zReturn data typer   r   rp   rp   rq   r     s    zSeries.dtypedtcatrN  c                 C  sD   t tt| }|| j dD ]}t| j|s || q t|S )N)r  rN  r   )rk   r   accessorrp   rp   rq   r     s    zSeries.__dir__c                 C  s   | j tjtjdtddS )zNumber of bytesnbytesFr  )r  r*   r  r~   r  r  r   rp   rp   rq   r    s        zSeries.nbytesc                 C  s   t | j| jS r   )_repr_data_seriesr   r0  r   rp   rp   rq   r,    s    zSeries._repr_datac                 C  s\   | j dk	r d| j  d| j }nd| j }dj| jj|  |t| jtt	| j
jddS )zhave to overwrite footerNzName: z	, dtype: zdtype: zCDask {klass} Structure:
{data}
{footer}
Dask Name: {name}, {layers}r8  )r9  r:  footerr   r;  )r   r   r@  rA  r   r<  rZ   r   r[   ru   r   r;  )rk   r  rp   rp   rq   r     s    
zSeries.__repr__Fc                 C  s  ddl m}m}m} ddlm} ||sD||rl||slt||jsl|rTt	dt
 |r\| n|  }||_n| jtj|dd}| jr|rt|s||rtjt| jd | jd}	|	|j}
|
jsd	}t|tt|
|_n| }|r|j| _|j| _|j| _|j | _ | }|S )
a  Alter Series index labels or name

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        Alternatively, change ``Series.name`` with a scalar value.

        Parameters
        ----------
        index : scalar, hashable sequence, dict-like or callable, optional
            If dict-like or callable, the transformation is applied to the
            index. Scalar or hashable sequence-like will alter the
            ``Series.name`` attribute.
        inplace : boolean, default False
            Whether to return a new Series or modify this one inplace.
        sorted_index : bool, default False
            If true, the output ``Series`` will have known divisions inferred
            from the input series and the transformation. Ignored for
            non-callable/dict-like ``index`` or when the input series has
            unknown divisions. Note that this may only be set to ``True`` if
            you know that the transformed index is monotonically increasing. Dask
            will check that transformed divisions are monotonic, but cannot
            check all the values between divisions, so incorrectly setting this
            can result in bugs.

        Returns
        -------
        renamed : Series

        See Also
        --------
        pandas.Series.rename
        r   )is_dict_likeis_list_like	is_scalarNzE'inplace' argument for dask series will be removed in future versionsFrD  r  r^  zGsorted_index=True, but the transformed index isn't monotonic_increasing)!pandas.api.typesr  r  r  r  Z	dataframer|   r   r  r  PendingDeprecationWarningr  r   r   rQ   r   r/  callabler   r   r   r   r   is_monotonic_increasingr  r   r*   r  r  rH  r   r   r   )rk   r   inplaceZsorted_indexr  r  r  ddr  oldnewrO  rp   rp   rq   r   +  sD    #
zSeries.renamer   c                 C  s   t tj| |S r   r&  rQ   roundrk   Zdecimalsrp   rp   rq   r  v  s    zSeries.roundr  c                 C  s,   t tj| |||}tt| j |_|S r   r&  rQ   to_timestampr   r   r$  r   rk   r  howr  dfrp   rp   rq   r
  z  s    zSeries.to_timestampr#  r"  c                 C  s   t | ||dS )a  Approximate quantiles of Series

        Parameters
        ----------
        q : list/array of floats, default 0.5 (50%)
            Iterable of numbers ranging from 0 to 1 for the desired quantiles
        method : {'default', 'tdigest', 'dask'}, optional
            What method to use. By default will use dask's internal custom
            algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest
            for floats and ints and fallback to the ``'dask'`` otherwise.
        r  ry  )rk   r$  r'  rp   rp   rq   r%    s    zSeries.quantilec                 C  s   | j d|dS )a  Return the approximate median of the values over the requested axis.

        Parameters
        ----------
        method : {'default', 'tdigest', 'dask'}, optional
            What method to use. By default will use Dask's internal custom
            algorithm (``"dask"``).  If set to ``"tdigest"`` will use tdigest
            for floats and ints and fallback to the ``"dask"`` otherwise.
        r#  )r$  r'  ry  rk   r'  rp   rp   rq   r'    s    
zSeries.median_approximatec                 C  s"   | j dkr| j|dS tdd S )Nr  r  r)  r*  r  rp   rp   rq   r+    s
    
zSeries.median      ?c                 C  s   ddl m} || ||dS )z7Approximate quantiles of Series used for repartitioningr   )partition_quantiles)upsample)Z!dask.dataframe.partitionquantilesr  )rk   r   r  r  rp   rp   rq   _repartition_quantiles  s    zSeries._repartition_quantilesc                 C  sd   t |trZ| j|jkrZdt| | }ttj|| |}tj||| |gd}t||| j	| jS | j
| S )Nzindex-%sr   )r|   r   r   r$   partitionwise_graphoperatorr   rN   r   r   r  )rk   r   r   r   r   rp   rp   rq   __getitem__  s    zSeries.__getitem__r  c                 C  s   | S r   rp   )rk   r  rQ  rp   rp   rq   rj     s    zSeries._get_numeric_datac                 C  s    t rtdt dd }|| S )NzTiteritems is deprecated and will be removed in a future version. Use .items instead.c                 s  s0   t | jD ] }| | }| E d H  q
d S r   )r   r   rP  r   r  rk   rx   r!  rp   rp   rq   r    s    zSeries.iteritems.<locals>._)r,   r  r  r  )rk   r  rp   rp   rq   	iteritems  s    	zSeries.iteritemsc                 c  s,   t | jD ]}| | }|E d H  q
d S r   )r   r   rP  r   r  rp   rp   rq   __iter__  s    zSeries.__iter__zUsing the ``in`` operator to test for membership in Series is deprecated. To test for membership in the index use ``(s.index == key).any()``. Similarly to test for membership in the values use ``(s == key).any()``)messagec                 C  s   | |k   S r   )r  r   r  rp   rp   rq   __contains__  s    	zSeries.__contains__c                 C  s(   |dkrt d| ddd||S )N)r   r   NNo axis named r   )Nr   r  rh   r   r  rp   rp   rq   r    s    zSeries._validate_axisc                 K  s(   ddl m} || f|||||d|S )Nr   )SeriesGroupByr  
group_keyssortobservedr  )dask.dataframe.groupbyr  )rk   r  r   r!  r"  r  rm   r  rp   rp   rq   r    s    
zSeries.groupbyc                   s   t  j|dS Nr  )superr  rk   r  rA  rp   rq   r    s    zSeries.countTc                   s   t  j||dS )Nr  r  )r%  r  )rk   r  r  r'  rp   rq   r    s    zSeries.modec                 C  s   | j  }| jtj|ddS NFr  r   exploder   rQ   r  rp   rp   rq   r+    s    
zSeries.exploder  c              
   C  s    t | tjtj| jd|| j|dS )z
        Return Series of unique values in the object. Includes NA values.

        Returns
        -------
        uniques : Series
        r   )rS  rT  r   r  r  Zseries_namerU  )rZ  r*   r   r   r   )rk   r  rU  rp   rp   rq   r     s    zSeries.uniquec                 C  s"   | j |d}|r| S |jS d S r$  )rX  r  r  )rk   r  r  Zuniqsrp   rp   rq   nunique  s    zSeries.nuniquec           	      C  s   ||d}|dk	r.t s&tdtj ||d< d|i}|dkr^|dkrNt| n
t|  |d< t| ftjt	j
t	j| jj|d	d
||t|d	|S )zo
        Note: dropna is only supported in pandas >= 1.1.0, in which case it defaults to
        True.
        )r!  	ascendingNzddropna is not a valid argument for dask.dataframe.value_counts if pandas < 1.1.0. Pandas version is r  	normalizer  Ftotal_length)r.  zvalue-counts)	rS  rT  rs  r   r  r  rU  rV  ru  )r9   ri   r   __version__ru   r  rZ  rQ   r  r*   Zvalue_counts_aggregateZvalue_counts_combiner   split_out_on_index)	rk   r!  r-  r  r.  r  rU  rm   ru  rp   rp   rq   r  "  s4    

zSeries.value_countsr1  c              	   C  s   t | tjtj| jd||dS )Nzseries-nlargestrS  rT  r   r  r  r  rZ  rQ   nlargestr   rk   r  r  rp   rp   rq   r4  N  s    zSeries.nlargestc              	   C  s   t | tjtj| jd||dS )Nzseries-nsmallestr2  rZ  rQ   	nsmallestr   r5  rp   rp   rq   r7  Z  s    zSeries.nsmallestc                   s   t  |S r   )r%  r  )rk   r  r'  rp   rq   r  f  s    zSeries.isinrd  re  c                   s   t  rt rt|  S t tsNt sNt  r<t rNtdt  dt|    fddt	| 
 D }tj|| gd}|tkrttj|  dd}nt|tt| dd | jd	}t| ||| jS )
Nz1arg must be pandas.Series, dict or callable. Got zmap-c                   s$   i | ]\}}|ft j| fqS rp   )rQ   map)rw   rx   r  arg	na_actionr   rp   rq   r  z  s    zSeries.map.<locals>.<dictcomp>r   T)r;  udfr   r   r   )rE   r"   
series_mapr|   r  r   r   r   r$   r  r   rN   r   r  r  rQ   r8  rF   r  r   r   )rk   r:  r;  r   r   r   rp   r9  rq   r8  k  s4    

z
Series.mapc                 C  s   | j tjddS rC  )r   rQ   r  r   rp   rp   rq   r    s    zSeries.dropnabothc                 C  s   | j tj|||dS )N)leftright	inclusive)r   rQ   between)rk   r@  rA  rB  rp   rp   rq   rC    s       zSeries.betweenc                 C  s$   |d k	rt d| jtj||ddS Nz'out' must be NoneF)lowerupperrB  r  r   rQ   cliprk   rE  rF  r   rp   rp   rq   rH    s       zSeries.clipc                 C  s   | j tj|ddS NF)	thresholdrB  r   rQ   
clip_lowerrk   rK  rp   rp   rq   rM    s
      zSeries.clip_lowerc                 C  s   | j tj|ddS rJ  r   rQ   
clip_upperrN  rp   rp   rq   rP    s
      zSeries.clip_upperr  c                   s   t  j||||dS )N)r  r  r  )r%  r  )rk   r   r  r  r  r'  rp   rq   r    s    zSeries.alignc                 C  s   | j tj|||dS )Nr  r  )rk   r   ro   r  rp   rp   rq   rs    s    zSeries.combinec                 C  s   | S r   rp   r   rp   rp   rq   squeeze  s    zSeries.squeezec                 C  s   |  tj|S r   r  r   rp   rp   rq   r    s    zSeries.combine_firstr   c                 C  s   ddl m} || ||dS )zCreate a Dask Bag from a Seriesr   to_bagr@  r  rT  rk   r   r@  rT  rp   rp   rq   rT    s    zSeries.to_bagc                 C  s2   |d krg n|g}| j tjf|d| jj| iS )Nr   )r   rQ   to_framer   )rk   r   rl   rp   rp   rq   rX    s    zSeries.to_framec                 C  s   |   j|dS )N)r3  r,  r<  rk   r3  rp   rp   rq   r<    s    zSeries.to_stringc                   s,   d fdd	}||_ t| |t|| dS )z2bind operator method like Series.add to this classNr   c                   s@   |d k	rt d| |}t | |||d}t | ||||dS )Nlevel must be Noner  )r   r  r  )ri   r  r  r   )rk   r   levelr  r  r   r   rp   rq   meth  s    
     z*Series._bind_operator_method.<locals>.meth)NNr   r   setattrrU   r   r   r   r  r]  rp   r   rq   r    s    	zSeries._bind_operator_methodc                   s,   d fdd	}||_ t| |t|| dS )z3bind comparison method like Series.eq to this classNr   c                   sR   |d k	rt d| |}|d kr2t | ||dS t |d}t|| ||dS d S )Nr[  r  rQ  )ri   r  r&  r   )rk   r   r\  r  r  r   
comparisonrp   rq   r]    s    
z,Series._bind_comparison_method.<locals>.meth)NNr   r^  r   r   rb  r  r]  rp   ra  rq   _bind_comparison_method  s    
zSeries._bind_comparison_methodrp   c                 K  sT   |t kr6ttj| j|f||dd|}tt| ttj| |||fd|i|S )a7  Parallel version of pandas.Series.apply

        Parameters
        ----------
        func : function
            Function to apply
        convert_dtype : boolean, default True
            Try to find better dtype for elementwise function results.
            If False, leave as dtype=object.
        $META
        args : tuple
            Positional arguments to pass to function in addition to the value.

        Additional keyword arguments will be passed as keywords to the function.

        Returns
        -------
        applied : Series or DataFrame if func returns a Series.

        Examples
        --------
        >>> import dask.dataframe as dd
        >>> s = pd.Series(range(5), name='x')
        >>> ds = dd.from_pandas(s, npartitions=2)

        Apply a function elementwise across the Series, passing in extra
        arguments in ``args`` and ``kwargs``:

        >>> def myadd(x, a, b=1):
        ...     return x + a + b
        >>> res = ds.apply(myadd, args=(2,), b=1.5)  # doctest: +SKIP

        By default, dask tries to infer the output metadata by running your
        provided function on some fake data. This works well in many cases, but
        can sometimes be expensive, or even fail. To avoid this, you can
        manually specify the output metadata with the ``meta`` keyword. This
        can be specified in many forms, for more information see
        ``dask.dataframe.utils.make_meta``.

        Here we specify the output is a Series with name ``'x'``, and dtype
        ``float64``:

        >>> res = ds.apply(myadd, args=(2,), b=1.5, meta=('x', 'f8'))

        In the case where the metadata doesn't change, you can also pass in
        the object itself directly:

        >>> res = ds.apply(lambda x: x + 1, meta=ds)

        See Also
        --------
        dask.Series.map_partitions
        T)convert_dtyperl   r<  r   )	r  r  rQ   rT   r   r  r  meta_warningr   )rk   ro   re  r   rl   kwdsrp   rp   rq   rT     s0    7	    zSeries.applyc                 C  s>   ddl m} t|tstd|| |gdd}t||d|dS )Nr   r  %other must be a dask.dataframe.Seriesr  r  T)r  r  )r  r   r|   r   r   cov_corr)rk   r   r  r  r   r  rp   rp   rq   cov1  s
    
z
Series.covpearsonc                 C  sP   ddl m} t|tstd|dkr.td|| |gdd}t||dd|d	S )
Nr   r  rh  rk  -Only Pearson correlation has been implementedr  r  T)corrr  r  )r  r   r|   r   r   ri   ri  )rk   r   r'  r  r  r   r  rp   rp   rq   rm  :  s    
    zSeries.corrc                 C  s2   t |tstd| j|dkr"| n| ||dS )Nzlag must be an integerr   r  )r|   r   r   rm  r  )rk   Zlagr  rp   rp   rq   autocorrG  s    
zSeries.autocorrc                 C  s$   | j tjtj||dd| jd dS )Nrk  Fmemory-usagert  r  r  r  rQ   memory_usager  r  rm  rp   rp   rq   rr  M  s    zSeries.memory_usagec                 C  s   | | }| | }||fS r   rp   rk   r   Zres1Zres2rp   rp   rq   
__divmod__W  s    zSeries.__divmod__c                 C  s   ||  }||  }||fS r   rp   rs  rp   rp   rq   __rdivmod__\  s    zSeries.__rdivmod__c                 C  s   t rtdt | jS Nzhis_monotonic is deprecated and will be removed in a future version. Use is_monotonic_increasing instead.)r,   r  r  r  r  r   rp   rp   rq   is_monotonicc  s    zSeries.is_monotonicc                 C  s   t | tjtjtjtddS )NZmonotonic_increasingrS  rs  rT  r   r  )rZ  r*   Zmonotonic_increasing_chunkZmonotonic_increasing_combineZmonotonic_increasing_aggregater  r   rp   rp   rq   r  n  s    zSeries.is_monotonic_increasingc                 C  s   t | tjtjtjtddS )NZmonotonic_decreasingrx  )rZ  r*   Zmonotonic_decreasing_chunkZmonotonic_decreasing_combineZmonotonic_decreasing_aggregater  r   rp   rp   rq   is_monotonic_decreasingz  s    zSeries.is_monotonic_decreasingc                 C  s   | j |}| jtj||dS Nr  )r   viewr   rQ   )rk   r   r   rp   rp   rq   r{    s    zSeries.view)N)NFF)r   )Nr  r   )r#  r"  )r"  )r"  )r  )r  N)r   )F)TF)Nr  )NT)NFNFNr  )r1  N)r1  N)r?  )NNN)r  NN)N)Fr   )N)r1  )NF)rk  NF)r  F)TF)[r   r   r   r   r   r   _partition_typer   rE   r   r  r   r  __annotations__r  r   r  r   r  r.  r"  r   r/   r0   r  r2   r  r1   rN  r   r  r,  r   r   rU   r  r#  r
  r%  r'  r+  r  r  rj   r-   r  r  rS   r  r   r  GROUP_KEYS_DEFAULTr  r  r  r+  r   r,  r  r4  r7  r  rA   r  r8  r  rC  rH  rM  rP  r  rs  rR  r  rT  rX  r<  r  rd  rT   rj  rm  rn  rr  rt  ru  rw  r  ry  r{  __classcell__rp   rp   r'  rq   r     s  












K

	





      +





F		

r   c                      s  e Zd ZU ejZeeZdZ	e
 Zded< dddddd	d
ddddddddhZdddddddddddddd hZd!d"d#hZd$d% Z fd&d'Zed(d) ZdMd+d,ZdNd/d0ZeejdOd2d3ZeejdPd4d5ZdQd6d7ZeejdRd9d:Zeejd;d< Zeejd=gd>dSd?d@ZedAdBeejd*ed1f fdCdD	Ze sdeeej fdEdFZ!eeej fdGdHZ"eeej fdIdJZ#eejdTdKdLZ$  Z%S )Ur$  zindex-r  r  Z
nanosecondmicrosecondZmillisecondZ	dayofyearminutehourdayZ	dayofweeksecondweekweekdayZ
weekofyearmonthZquarteryearZknownZas_knownZ
as_unknownZadd_categoriesr  Zremove_categoriesZreorder_categoriesZ
as_orderedcodesZremove_unused_categoriesZset_categoriesZas_unorderedr	  Zrename_categoriesrw  r  ry  c                 C  s\   t | jjr"|| jkr"t| j|S || jkr8t| j|S || jkrLt| |S t	d| d S )Nz"'Index' object has no attribute %r)
rB   r   r   _cat_attributesr  r  _dt_attributesr  _monotonic_attributesAttributeErrorr  rp   rp   rq   __getattr__  s    


zIndex.__getattr__c                   s0   t   }|| j t| jr,|| j |S r   )r%  r   r  r  rB   r   r  )rk   r   r'  rp   rq   r     s
    

zIndex.__dir__c                 C  s   t | jjdd S )Nz  object has no attribute 'index')r  rA  r   r   rp   rp   rq   r     s    zIndex.indexNc                 C  s   t j|| jdS )Nr-  )r   r$  r   r  rp   rp   rq   r    s    zIndex.__array_wrap__r1  Tc                 C  sj   d|| j f }|dftj| j dftd|fi}tj||| gd}t||| j| jdd }|rf|	 }|S )z[First n items of the Index.

        Caveat, this only checks the first partition.
        z
head-%d-%sr   r   NrM  )
r   r  r   r  rN   r   r   r   r   r   r  rp   rp   rq   r    s     z
Index.headFc                 C  s    | j tj| j | jd |dS )Nr   r   r  r  )r  rQ   r   r   r  r&  rp   rp   rq   r     s    z	Index.maxc                 C  s    | j tj| j | jd |dS )Nr   r  )r  rQ   r   r   r  r&  rp   rp   rq   r     s    z	Index.minc                 C  s   | j tjtjdt|dS )Nzindex-countr  )r  r*   Zindex_countr~   r  r  r&  rp   rp   rq   r    s    zIndex.countr  c                 C  s   t | jtjrB|d k	rtd| j|}| jtj||ddd}n(| jj||d}| jtj|d||dd}|d krx|j	}t
|||dS )Nz*PeriodIndex doesn't accept `freq` argumentr  F)r   r  r  r  )r  r   r  r  )r|   r   r   PeriodIndexr  r   r  r   rQ   r  r  )rk   r  r  r   r   rp   rp   rq   r    s.        zIndex.shiftc                 C  s   | j tj| j ddS NF)r   r  )r   rQ   	to_seriesr   r   rp   rp   rq   r    s
    zIndex.to_seriesr   Zua_argsc                 C  sB   |s
t  |d kr|gn||g}| jtjf|| jj| ddS r  )ri   r   rQ   rX  r   )rk   r   r   rl   rp   rp   rq   rX    s    
zIndex.to_framerd  re  c                   sF   t  j|||d}|r:| jr:tt| jj||d|_n| }|S )a  
        Note that this method clears any known divisions.

        If your mapping function is monotonically increasing then use `is_monotonic`
        to apply the maping function to the old divisions and assign the new
        divisions to the output.

        )r;  r   )r;  )r%  r8  r/  r   r   r   r   rH  )rk   r:  r;  r   rw  Zappliedr'  rp   rq   r8  ,  s    
z	Index.mapc                   s   t rtdt t jS rv  )r,   r  r  r  r%  r  r   r'  rp   rq   rw  B  s    zIndex.is_monotonicc                   s   t  jS r   )r%  r  r   r'  rp   rq   r  M  s    zIndex.is_monotonic_increasingc                   s   t  jS r   )r%  ry  r   r'  rp   rq   ry  R  s    zIndex.is_monotonic_decreasingc                 C  s"   | j tjtjd|id| jd dS )Nr  Fro  rp  rq  r  rp   rp   rq   rr  W  s    zIndex.memory_usage)N)r1  T)F)F)F)r  N)TN)F)&r   r   r   r   r$  r|  r   rD   r   r  r   r  r}  r  r  r  r  r   r   r   r  r  rU   r   r   r  r  r  rX  rA   r  r8  r-   rw  r  ry  rr  r  rp   rp   r'  rq   r$    s   
	



	
	r$  c                      s  e Zd ZU dZejZeeZ	dZ
e Zded<  fddZddd	Zed
d Zedd Zejdd Zedd Z fddZdd Zedd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zed'd( Z ed)d* Z!ed+d, Z"e#ejd-d. Z$e#ejd/d0 Z%e#ejdd1d2Z&dd5d6d7d8d9d:d d;d<d=Z'dd?d7d7d6d@d7dAdBdCZ(e#ejdDdE Z)e#ejddGdHZ*e#ejddIdJZ+e#ejde,dddfdKdLZ-e.e/ddMdNZ/e#ejdOdP Z0e#ejdQgdRddSdTZ1dUdV Z2e#ejddWdXZ3e#eje4de4fdYdZZ5e#ejdd[d\Z6e#ejd]d^ Z7e#ejd_d` Z8e#ejddadbZ9e#ejddedfZ:e#ejdgdh Z;ddjdkZ<dldm Z=dndo Z>e#ejddpdqZ?ddsdtZ@eAddudvZBe#ejddxdyZCdd|d}ZDe#ejdddZEeFs<e#ejdχ fdd	ZGe#ejdd ZHe#ejdddZIe#ejdd ZJeAejfddZKeAejfddZLeMdddddd>dde4dfddZNe#ejdddZOe#ejdddZPe#ejdddZQe#ejdddZRe#ejdddZSe#ejdddZTdddZUe#ejdddZVdddZWdddZXdddZYe#ejdddZZdd Z[dd Z\dd Z]dd Z^eAddddddZ_  Z`S )r#  ac  
    Parallel Pandas DataFrame

    Do not use this class directly.  Instead use functions like
    ``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.

    Parameters
    ----------
    dsk: dict
        The dask graph to compute this DataFrame
    name: str
        The key prefix that specifies which keys in the dask comprise this
        particular DataFrame
    meta: pandas.DataFrame
        An empty ``pandas.DataFrame`` with names, dtypes, and index matching
        the expected output.
    divisions: tuple of index values
        Values along which we partition our blocks on the index
    z
dataframe-r  r  c              	     s   t  ||||  jj| jd krr jdd  jD tt tt j	 fdd j	jD d jj| _nR jj| j
 jdd  jD tt tt j	 fdd j	jD d d S )Nc                 S  s   g | ]}|qS rp   rp   rw   rJ  rp   rp   rq   ry     s     z&DataFrame.__init__.<locals>.<listcomp>c                   s.   i | ]&}|t  j| d r& j| jndqS r   Nr   r   r   r  r   rp   rq   r    s
   z&DataFrame.__init__.<locals>.<dictcomp>)r   r=  r   Zdataframe_typeZseries_dtypesc                 S  s   g | ]}|qS rp   rp   r  rp   rp   rq   ry     s     c                   s.   i | ]&}|t  j| d r& j| jndqS r  r  r  r   rp   rq   r    s
   )r%  r   r   r;  Zcollection_annotationsr   r=  rb   r   r   r   r   r'  r   rq   r   |  s(    



zDataFrame.__init__Nc                 C  s   t |trRt|dkrRt |d d tjrB|d d jdkrBd }q|d d j}nLz$dd l}d| d d  d}W n t	k
r   d}Y nX t
| dt| ||| jdS )	Nr   r  rp   r  r  r  z3 is not implemented for `dask.dataframe.DataFrame`.r   r=  )r|   r   ru   r~   r   r"  r   r  r  r  ri   rG   r=  r  rp   rp   rq   r    s    &
zDataFrame.__array_wrap__c                 C  s   | j | jgS r   r  r   rp   rp   rq   r    s    zDataFrame.axesc                 C  s   | j jS r   )r   r=  r   rp   rp   rq   r=    s    zDataFrame.columnsc                 C  s&   t | |}|j| _|j| _|j| _d S r   )r  r   r   r   )rk   r=  r  rp   rp   rq   r=    s    
c                 C  s   ddl m} || S )aL  Purely integer-location based indexing for selection by position.

        Only indexing the column positions is supported. Trying to select
        row positions will raise a ValueError.

        See :ref:`dataframe.indexing` for more.

        Examples
        --------
        >>> df.iloc[:, [2, 0, 1]]  # doctest: +SKIP
        r   )_iLocIndexer)r  r  )rk   r  rp   rp   rq   iloc  s    zDataFrame.ilocc                   sB   z| j d d df }W n tk
r4   t   Y S X t|S d S r   )r  r  r%  r[  ru   )rk   r!  r'  rp   rq   r[    s
    zDataFrame.__len__c                 C  s
   || j kS r   r   r  rp   rp   rq   r    s    zDataFrame.__contains__c                 C  s   t dd S )NzChecking whether a Dask DataFrame has any rows may be expensive. However, checking the number of columns is fast. Depending on which of these results you need, use either `len(df.index) == 0` or `len(df.columns) == 0`)r;   r   rp   rp   rq   empty  s    zDataFrame.emptyc           	      C  s  dt | | }t|s&t|ttfrt| jjtj	tj
frb|| jjkrbtrXtdt | j| S | jt| }ttj|| |}tj||| gd}t|||| jS t|trddlm} tdd |j|j|jfD }|r|| jj s| j!| S | j| S t|tj"t#fs*t$|sjt%|s*t&|rj| jt| }ttj|| |}tj||| gd}t|||| jS t|t'r| j|jkrddl(m)} || |g\} }ttj|| |}tj||| |gd}t||| | jS t|t*r| +|tj,S t-|d S )	Nz
getitem-%szIndexing a DataFrame with a datetimelike index using a single string to slice the rows, like `frame[string]`, is deprecated and will be removed in a future version. Use `frame.loc[string]` instead.r   r   )is_float_dtypec                 s  s   | ]}t |tV  qd S r   )r|   r   rv   rp   rp   rq   r    s    z(DataFrame.__getitem__.<locals>.<genexpr>_maybe_align_partitions).r$   r~   r  r|   r   rN  r   r   r   DatetimeIndexr  r=  r:   r  r  r  r  _extract_metar  r  r   rN   r   r   r   r  r  r  r  r  stepstopr   r  r   r   r"   rE   rD   r   r  r  r#  r  r`  ri   )	rk   r   r   r   r   r   r  Zis_integer_slicer  rp   rp   rq   r    sV    



zDataFrame.__getitem__c                   s   t |ttfr<t  tr<| jf  fddt| jD }nt |tjrvt  tsvt|}| jf  fdd|D }n^t	|st
|st |ttfr| |  }n0t |tstdt| dn| jf | i}|j| _|j| _|j| _|j| _d S )Nc                   s   i | ]\}}| | qS rp   rp   )rw   r  r{  r  rp   rq   r  #  s      z)DataFrame.__setitem__.<locals>.<dictcomp>c                   s   i | ]
}| qS rp   rp   r  r  rp   rq   r  '  s      zItem assignment with z not supported)r|   r   r   r#  assignzipr=  r   r$  rC   rE   r   r  rN  ri   r   r   r   r   r   r  )rk   r   r  r  rp   r  rq   __setitem__!  s&    $
zDataFrame.__setitem__c                 C  s,   | j |gdd}|j| _|j| _|j| _d S )Nr  r  )rF  r   r   r   )rk   r   rE  rp   rp   rq   __delitem__8  s    zDataFrame.__delitem__c                 C  sX   zt | dj}W n tk
r*   d}Y nX ||krF|dkrF|| |< nt | || d S )Nr   rp   )r   r   r   r   )r  __getattribute__r=  r  __setattr__)rk   r   r  r=  rp   rp   rq   r  >  s    

zDataFrame.__setattr__c                 C  s8   || j kr| | S |dkr(t| | ntd| d S )Nr  z&'DataFrame' object has no attribute %r)r=  r  r  r  r  rp   rp   rq   r  J  s
    
zDataFrame.__getattr__c                 C  s:   t tt| }|| j |dd | jD  t|S )Nc                 s  s$   | ]}t |tr| r|V  qd S r   )r|   rN  isidentifierrz  rp   rp   rq   r  Z  s     
  z$DataFrame.__dir__.<locals>.<genexpr>)r   r   r   r   r   r=  r   r   rp   rp   rq   r   W  s    zDataFrame.__dir__c                 C  s
   t | jS r   )iterr   r   rp   rp   rq   r  ]  s    zDataFrame.__iter__c                 C  s   t | jS r   )r*   r  r=  r   rp   rp   rq   _ipython_key_completions_`  s    z#DataFrame._ipython_key_completions_c                 C  s   dS )r  rM  rp   r   rp   rp   rq   r.  c  s    zDataFrame.ndimc                 C  s<   t | j}|dkr"| jjd dfS tt| j| }||fS )aB  
        Return a tuple representing the dimensionality of the DataFrame.

        The number of rows is a Delayed result. The number of columns
        is a concrete integer.

        Examples
        --------
        >>> df.size  # doctest: +SKIP
        (Delayed('int-07f06075-5ecc-4d77-817e-63c69a9188a8'), 2)
        r   )ru   r=  r   r"  rL   r  r  )rk   Zcol_sizeZrow_sizerp   rp   rq   r"  h  s
    
zDataFrame.shapec                 C  s   | j jS )zReturn data types)r   dtypesr   rp   rp   rq   r  {  s    zDataFrame.dtypesc                 C  s
   | j  S r   )r   get_dtype_countsr   rp   rp   rq   r    s    zDataFrame.get_dtype_countsc                 C  s
   | j  S r   )r   get_ftype_countsr   rp   rp   rq   r    s    zDataFrame.get_ftype_countsc                 C  s   | j j||dj}| t| S )Nr2  )r   r9  r=  r   )rk   r3  r4  csrp   rp   rq   r9    s    zDataFrame.select_dtypesTr  rN  zint | Literal['auto'] | Noner  z"Literal['first'] | Literal['last']z-Callable[[pd.DataFrame], pd.DataFrame] | NonezMapping[str, Any] | None)r  r   r-  na_positionsort_functionsort_function_kwargsr   c           	      K  s*   ddl m} || |f|||||d|S )a  Sort the dataset by a single column.

        Sorting a parallel dataset requires expensive shuffles and is generally
        not recommended. See ``set_index`` for implementation details.

        Parameters
        ----------
        by: string
        npartitions: int, None, or 'auto'
            The ideal number of output partitions. If None, use the same as
            the input. If 'auto' then decide by memory use.
        ascending: bool, optional
            Sort ascending vs. descending.
            Defaults to True.
        na_position: {'last', 'first'}, optional
            Puts NaNs at the beginning if 'first', puts NaN at the end if 'last'.
            Defaults to 'last'.
        sort_function: function, optional
            Sorting function to use when sorting underlying partitions.
            If None, defaults to ``M.sort_values`` (the partition library's
            implementation of ``sort_values``).
        sort_function_kwargs: dict, optional
            Additional keyword arguments to pass to the partition sorting function.
            By default, ``by``, ``ascending``, and ``na_position`` are provided.

        Examples
        --------
        >>> df2 = df.sort_values('x')  # doctest: +SKIP
        r   )sort_values)r-  r   r  r  r  )rK  r  )	rk   r  r   r-  r  r  r  rm   r  rp   rp   rq   r    s    'zDataFrame.sort_valuesFzstr | SerieszSequence | None)r   rF  r
  r   r   r  c                 K  s  |rt d|}~t|tst|trjt|dkrXt|d tsNt|d tsX|d }qt d| dn t|trt dt|j dt|tr|j	| j
j	krtdt | S n@|| j
jkrtdt | S || jkrtd| d	t| j |d
k	rt| n,t|tr4|jr4|j| jkr4d}|j}|r^ddlm}	 |	| |f||d|S ddlm}
 |
| |f|||d|S d
S )a!  Set the DataFrame index (row labels) using an existing column.

        This realigns the dataset to be sorted by a new column. This can have a
        significant impact on performance, because joins, groupbys, lookups, etc.
        are all much faster on that column. However, this performance increase
        comes with a cost, sorting a parallel dataset requires expensive shuffles.
        Often we ``set_index`` once directly after data ingest and filtering and
        then perform many cheap computations off of the sorted dataset.

        This function operates exactly like ``pandas.set_index`` except with
        different performance costs (dask dataframe ``set_index`` is much more expensive).
        Under normal operation this function does an initial pass over the index column
        to compute approximate quantiles to serve as future divisions. It then passes
        over the data a second time, splitting up each input partition into several
        pieces and sharing those pieces to all of the output partitions now in
        sorted order.

        In some cases we can alleviate those costs, for example if your dataset is
        sorted already then we can avoid making many small pieces or if you know
        good values to split the new index column then we can avoid the initial
        pass over the data. For example if your new index is a datetime index and
        your data is already sorted by day then this entire operation can be done
        for free. You can control these options with the following parameters.

        Parameters
        ----------
        other: string or Dask Series
            Column to use as index.
        drop: boolean, default True
            Delete column to be used as the new index.
        sorted: bool, optional
            If the index column is already sorted in increasing order.
            Defaults to False
        npartitions: int, None, or 'auto'
            The ideal number of output partitions. If None, use the same as
            the input. If 'auto' then decide by memory use.
            Only used when ``divisions`` is not given. If ``divisions`` is given,
            the number of output partitions will be ``len(divisions) - 1``.
        divisions: list, optional
            The "dividing lines" used to split the new index into partitions.
            For ``divisions=[0, 10, 50, 100]``, there would be three output partitions,
            where the new index contained [0, 10), [10, 50), and [50, 100), respectively.
            See https://docs.dask.org/en/latest/dataframe-design.html#partitions.
            If not given (default), good divisions are calculated by immediately computing
            the data and looking at the distribution of its values. For large datasets,
            this can be expensive.
            Note that if ``sorted=True``, specified divisions are assumed to match
            the existing partitions in the data; if this is untrue you should
            leave divisions empty and call ``repartition`` after ``set_index``.
        inplace: bool, optional
            Modifying the DataFrame in place is not supported by Dask.
            Defaults to False.
        shuffle: string, 'disk' or 'tasks', optional
            Either ``'disk'`` for single-node operation or ``'tasks'`` for
            distributed operation.  Will be inferred by your current scheduler.
        compute: bool, default False
            Whether or not to trigger an immediate computation. Defaults to False.
            Note, that even if you set ``compute=False``, an immediate computation
            will still be triggered if ``divisions`` is ``None``.
        partition_size: int, optional
            Desired size of each partitions in bytes.
            Only used when ``npartitions='auto'``

        Examples
        --------
        >>> import dask
        >>> ddf = dask.datasets.timeseries(start="2021-01-01", end="2021-01-07", freq="1H").reset_index()
        >>> ddf2 = ddf.set_index("x")
        >>> ddf2 = ddf.set_index(ddf.x)
        >>> ddf2 = ddf.set_index(ddf.timestamp, sorted=True)

        A common case is when we have a datetime column that we know to be
        sorted and is cleanly divided by day.  We can set this index for free
        by specifying both that the column is pre-sorted and the particular
        divisions along which is is separated

        >>> import pandas as pd
        >>> divisions = pd.date_range(start="2021-01-01", end="2021-01-07", freq='1D')
        >>> divisions
        DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
                       '2021-01-05', '2021-01-06', '2021-01-07'],
                      dtype='datetime64[ns]', freq='D')

        Note that ``len(divisons)`` is equal to ``npartitions + 1``. This is because ``divisions``
        represents the upper and lower bounds of each partition. The first item is the
        lower bound of the first partition, the second item is the lower bound of the
        second partition and the upper bound of the first partition, and so on.
        The second-to-last item is the lower bound of the last partition, and the last
        (extra) item is the upper bound of the last partition.

        >>> ddf2 = ddf.set_index("timestamp", sorted=True, divisions=divisions.tolist())

        If you'll be running `set_index` on the same (or similar) datasets repeatedly,
        you could save time by letting Dask calculate good divisions once, then copy-pasting
        them to reuse. This is especially helpful running in a Jupyter notebook:

        >>> ddf2 = ddf.set_index("name")  # slow, calculates data distribution
        >>> ddf2.divisions  # doctest: +SKIP
        ["Alice", "Laura", "Ursula", "Zelda"]
        >>> # ^ Now copy-paste this and edit the line above to:
        >>> # ddf2 = ddf.set_index("name", divisions=["Alice", "Laura", "Ursula", "Zelda"])
        z%The inplace= keyword is not supportedr  r   zWDask dataframe does not yet support multi-indexes.
You tried to index with this index: z%
Indexes must be single columns only.zgDask dataframe does not yet support multi-indexes.
You tried to index with a frame with these columns: z5New index has same name as existing, this is a no-op.zData has no column 'z': use any column of NT)set_sorted_index)rF  r   )	set_index)rF  r   r   )ri   r|   rN  r   ru   r#  r   r=  r   r   r   r  r  UserWarningr   KeyErrorcheck_divisionsr$  r/  r   r   rK  r  r  )rk   r   rF  r
  r   r   r  rm   Z
pre_sortedr  r  rp   rp   rq   r    s    q





  



  zDataFrame.set_indexc                 C  s   | | }| |= |S r   rp   )rk   itemr   rp   rp   rq   pop  s    zDataFrame.popr1  c              
   C  s"   d}t | tjtj| j||||dS )Nzdataframe-nlargestrS  rT  r   r  r  r  r=  r3  rk   r  r=  r  r  rp   rp   rq   r4    s    zDataFrame.nlargestc              
   C  s"   d}t | tjtj| j||||dS )Nzdataframe-nsmallestr  r6  r  rp   rp   rq   r7    s    zDataFrame.nsmallestc                 K  s(   ddl m} || f|||||d|S )Nr   )DataFrameGroupByr  )r#  r  )rk   r  r   r!  r"  r  rm   r  rp   rp   rq   r    s    
zDataFrame.groupbyc                 K  s   t | f|||d|S )N)r=  r   r  )r3   )rk   r=  r   r  rm   rp   rp   rq   r3     s      zDataFrame.categorizec                 K  s.  |   }| D ]\}}t|tsjt|sjt|sjtjj	|sjt
|sjt|tsjtdtt| t|r~||||< t|trddlm} t|jdkrtd|j|jkrtd|j d|j d|||j|jd	||< ||| g}|jjf t||| id
d}ttj|f|d|i}q|S )Nz'Column assignment doesn't support type r   )from_dask_arrayr  z)Array assignment only supports 1-D arraysz#Number of partitions do not match (r  ))r   r   Tnonemptyr   )r  r  r|   r   rE   r   r   rt  ru  r  rD   r   r   rb   r   r  r  ru   r"  r  r   r   r   r   r  r  r&  r*   )rk   rm   r:  r  r  r  pairsZdf2rp   rp   rq   r    sD    
zDataFrame.assignr   r  c                 C  s"   |d k	rt d| jtjd |dS )NzCannot rename index.)r=  )r  r   rQ   r   )rk   r   r=  rp   rp   rq   r     s    zDataFrame.renamec                 K  s   | j tj|f|S )a  Filter dataframe with complex expression

        Blocked version of pd.DataFrame.query

        Parameters
        ----------
        expr: str
            The query string to evaluate.
            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks.
            Dask does not fully support referring to variables using the '@' character,
            use f-strings or the ``local_dict`` keyword argument instead.

        Notes
        -----
        This is like the sequential version except that this will also happen
        in many threads.  This may conflict with ``numexpr`` which will use
        multiple threads itself.  We recommend that you set ``numexpr`` to use a
        single thread:

        .. code-block:: python

            import numexpr
            numexpr.set_num_threads(1)

        See also
        --------
        pandas.DataFrame.query
        pandas.eval

        Examples
        --------
        >>> import pandas as pd
        >>> import dask.dataframe as dd
        >>> df = pd.DataFrame({'x': [1, 2, 1, 2],
        ...                    'y': [1, 2, 3, 4],
        ...                    'z z': [4, 3, 2, 1]})
        >>> ddf = dd.from_pandas(df, npartitions=2)

        Refer to column names directly:

        >>> ddf.query('y > x').compute()
           x  y  z z
        2  1  3    2
        3  2  4    1

        Refer to column name using backticks:

        >>> ddf.query('`z z` > x').compute()
           x  y  z z
        0  1  1    4
        1  2  2    3
        2  1  3    2

        Refer to variable name using f-strings:

        >>> value = 1
        >>> ddf.query(f'x == {value}').compute()
           x  y  z z
        0  1  1    4
        2  1  3    2

        Refer to variable name using ``local_dict``:

        >>> ddf.query('x == @value', local_dict={"value": value}).compute()
           x  y  z z
        0  1  1    4
        2  1  3    2
        )r   rQ   query)rk   exprrm   rp   rp   rq   r    s    FzDataFrame.queryc                 K  sX   |d krd}d|kr$|dkr$t d| jj|fd|i|}| jtj|f||d|S )NF=)TNz4Inplace eval not supported. Please use inplace=Falser  )r   r  )ri   r   evalr   rQ   )rk   r  r  rm   r   rp   rp   rq   r  >  s    zDataFrame.evalc                 C  sZ   |t k	r|t k	rtdd|i}|t k	r2||d< n|t k	rB||d< | jtjf|ddiS )NzBYou cannot set both the how and thresh arguments at the same time.rQ  r  threshrB  F)r  r   r   rQ   r  )rk   r  rQ  r  rm   rp   rp   rq   r  I  s    
zDataFrame.dropnac                 C  s$   |d k	rt d| jtj||ddS rD  rG  rI  rp   rp   rq   rH  \  s       zDataFrame.clipc                 C  s   | j tj|ddS rJ  rL  rN  rp   rp   rq   rM  d  s
      zDataFrame.clip_lowerc                 C  s   | j tj|ddS rJ  rO  rN  rp   rp   rq   rP  j  s
      zDataFrame.clip_upperc                 C  sj   |dkr*t | jdkr$| | jd  S | S n<|dkrFtt|  dn |dkrftd| dt|  d S )N)Nr  r  r   z& does not support squeeze along axis 0)r   r  NzNo axis z for object type )ru   r=  ri   r   r  )rk   r  rp   rp   rq   rR  p  s    zDataFrame.squeezer  r   c                 C  s,   t tj| |||}tt| j |_|S r   r	  r  rp   rp   rq   r
    s    zDataFrame.to_timestampc                 C  s    | j |}| jtj||ddS r)  r*  )rk   rM  r   rp   rp   rq   r+    s    zDataFrame.exploder   c                 C  s   ddl m} || ||S )a   Convert to a dask Bag of tuples of each row.

        Parameters
        ----------
        index : bool, optional
            If True, the index is included as the first element of each tuple.
            Default is False.
        format : {"tuple", "dict", "frame"}, optional
            Whether to return a bag of tuples, dictionaries, or
            dataframe-like objects. Default is "tuple". If "frame",
            the original partitions of ``df`` will not be transformed
            in any way.
        r   rS  rV  rW  rp   rp   rq   rT    s    zDataFrame.to_bagc                 O  s   ddl m} || |f||S )z0See dd.to_parquet docstring for more informationr   )
to_parquet)r  r  )rk   pathrl   rm   r  rp   rp   rq   r    s    zDataFrame.to_parquetc                 O  s   ddl m} || |f||S )z,See dd.to_orc docstring for more informationr   )to_orc)r  r  )rk   r  rl   rm   r  rp   rp   rq   r    s    zDataFrame.to_orcc                 C  s   |   j|ddS )NFr2  rY  rZ  rp   rp   rq   r<    s    zDataFrame.to_stringr  c                 C  sB   | j  }t|jt| jk r:| jd }| jtj||dS | S d S )Nz-get_numeric_datar  )r   rj   ru   r=  r  r   rQ   )rk   r  rQ  Znumericsr   rp   rp   rq   rj     s
    

zDataFrame._get_numeric_datac                 C  s*   |dkrt d| dddd||S )N)r   r  r   r=  Nr  r   r  )Nr   r=  r  r  rp   rp   rq   r    s    zDataFrame._validate_axisraisec                 C  sN   |  |}|dkr*|d k	r*| jt||dS |dkrB| jt||dS tdd S )Nr   )errorsr  z@Drop currently only works for axis=1 or when columns is not None)r  r   r>   ri   )rk   labelsr  r=  r  rp   rp   rq   rF    s    
zDataFrame.dropinner_xZ_yc                 C  s>   t |stdddlm} || |||||||||
|	||dS )u3  Merge the DataFrame with another DataFrame

        This will merge the two datasets, either on the indices, a certain column
        in each dataset or the index in one dataset and the column in another.

        Parameters
        ----------
        right: dask.dataframe.DataFrame
        how : {'left', 'right', 'outer', 'inner'}, default: 'inner'
            How to handle the operation of the two objects:

            - left: use calling frame's index (or column if on is specified)
            - right: use other frame's index
            - outer: form union of calling frame's index (or column if on is
              specified) with other frame's index, and sort it
              lexicographically
            - inner: form intersection of calling frame's index (or column if
              on is specified) with other frame's index, preserving the order
              of the calling's one

        on : label or list
            Column or index level names to join on. These must be found in both
            DataFrames. If on is None and not merging on indexes then this
            defaults to the intersection of the columns in both DataFrames.
        left_on : label or list, or array-like
            Column to join on in the left DataFrame. Other than in pandas
            arrays and lists are only support if their length is 1.
        right_on : label or list, or array-like
            Column to join on in the right DataFrame. Other than in pandas
            arrays and lists are only support if their length is 1.
        left_index : boolean, default False
            Use the index from the left DataFrame as the join key.
        right_index : boolean, default False
            Use the index from the right DataFrame as the join key.
        suffixes : 2-length sequence (tuple, list, ...)
            Suffix to apply to overlapping column names in the left and
            right side, respectively
        indicator : boolean or string, default False
            If True, adds a column to output DataFrame called "_merge" with
            information on the source of each row. If string, column with
            information on source of each row will be added to output DataFrame,
            and column will be named value of string. Information column is
            Categorical-type and takes on a value of "left_only" for observations
            whose merge key only appears in `left` DataFrame, "right_only" for
            observations whose merge key only appears in `right` DataFrame,
            and "both" if the observation’s merge key is found in both.
        npartitions: int or None, optional
            The ideal number of output partitions. This is only utilised when
            performing a hash_join (merging on columns only). If ``None`` then
            ``npartitions = max(lhs.npartitions, rhs.npartitions)``.
            Default is ``None``.
        shuffle: {'disk', 'tasks'}, optional
            Either ``'disk'`` for single-node operation or ``'tasks'`` for
            distributed operation.  Will be inferred by your current scheduler.
        broadcast: boolean or float, optional
            Whether to use a broadcast-based join in lieu of a shuffle-based
            join for supported cases.  By default, a simple heuristic will be
            used to select the underlying algorithm. If a floating-point value
            is specified, that number will be used as the ``broadcast_bias``
            within the simple heuristic (a large number makes Dask more likely
            to choose the ``broacast_join`` code path). See ``broadcast_join``
            for more information.

        Notes
        -----

        There are three ways to join dataframes:

        1. Joining on indices. In this case the divisions are
           aligned using the function ``dask.dataframe.multi.align_partitions``.
           Afterwards, each partition is merged with the pandas merge function.

        2. Joining one on index and one on column. In this case the divisions of
           dataframe merged by index (:math:`d_i`) are used to divide the column
           merged dataframe (:math:`d_c`) one using
           ``dask.dataframe.multi.rearrange_by_divisions``. In this case the
           merged dataframe (:math:`d_m`) has the exact same divisions
           as (:math:`d_i`). This can lead to issues if you merge multiple rows from
           (:math:`d_c`) to one row in (:math:`d_i`).

        3. Joining both on columns. In this case a hash join is performed using
           ``dask.dataframe.multi.hash_join``.

        In some cases, you may see a ``MemoryError`` if the ``merge`` operation requires
        an internal ``shuffle``, because shuffling places all rows that have the same
        index in the same partition. To avoid this error, make sure all rows with the
        same ``on``-column value can fit on a single partition.
        zright must be DataFramer   r   )r  r  left_onright_on
left_indexright_indexsuffixesr   	indicatorr  	broadcast)rC   r  r  r   )rk   rA  r  r  r  r  r  r  r  r  r   r  r  r   rp   rp   rq   r     s$    hzDataFrame.merger@  r.  c                 C  s   t |rt|dr| }t|st|tr>tdd |D sFtd|dkrVtdddlm	} |d	kr| g| }	||	|||||d
S |||||||d
}ddlm
}
 |
| |||d kd|||f||d	S )Nr   c                 S  s   g | ]}t |qS rp   )rC   )rw   r   rp   rp   rq   ry   Y  s     z"DataFrame.join.<locals>.<listcomp>z-other must be DataFrame or list of DataFrames)r  r@  z-merge_multi only supports left or outer joinsr   )_recursive_pairwise_outer_joinr  )r  lsuffixrsuffixr   r  r  T)r  r  r  r  r  r   r  )rE   r   rX  rC   r|   r   r  r  r  r  r   )rk   r   r  r  r  r  r   r  r  fullr   rp   rp   rq   r  I  sP    
	zDataFrame.joinc                   s:   t |trd}t|nt|r*| j}t j||dS )NzMUnable to appending dd.Series to dd.DataFrame.Use pd.Series to append as row.)r  )r|   r   r  rE   rX  Tr%  r   )rk   r   r  rO  r'  rp   rq   r     s    


zDataFrame.appendc                 c  s0   t | jD ] }| | }| E d H  q
d S r   )r   r   rP  r   iterrows)rk   rx   r  rp   rp   rq   r    s    zDataFrame.iterrowsPandasc                 c  s6   t | jD ]&}| | }|j||dE d H  q
d S )Nr  )r   r   rP  r   
itertuples)rk   r   r   rx   r  rp   rp   rq   r    s    zDataFrame.itertuplesc                 c  s0   t | jD ] \}}|| jd d |f fV  q
d S r   )r  r=  r  )rk   rD  r  rp   rp   rq   r    s    zDataFrame.itemsc                   s.   d fdd	} |_ t|  t|| dS )r  r=  Nc              	     s   |d k	rt d| |}|dkrrt|trBd  d}t|n0t|rrt| |||d}t| ||||ddS t| |||d}t| ||||dd	S )
Nr[  r(  z
Unable to z dd.Series with axis=1)r   r  r  F)r   r   r  r  rB  r  )r   r  r  rB  )ri   r  r|   r   r  rE   r  r   )rk   r   r  r\  r  rO  r   r   r   rp   rq   r]    sD    


    
z-DataFrame._bind_operator_method.<locals>.meth)r=  NNr^  r`  rp   r  rq   r    s    (zDataFrame._bind_operator_methodc                   s,   d fdd	}||_ t| |t|| dS )z6bind comparison method like DataFrame.eq to this classr=  Nc                   s*   |d k	rt d| |}t | ||dS )Nr[  r  )ri   r  r&  )rk   r   r  r\  ra  rp   rq   r]    s    
z/DataFrame._bind_comparison_method.<locals>.meth)r=  Nr^  rc  rp   ra  rq   rd    s    z!DataFrame._bind_comparison_methodrd  re  rp   c	                 K  s   |dk	rt jdtd | |}|||d}
|	|
 |dkrJd}t||tkr~ttj	| j
|f|dd|	}t t| |	d	| ji ttj	| |f||d
|	S )a  Parallel version of pandas.DataFrame.apply

        This mimics the pandas version except for the following:

        1.  Only ``axis=1`` is supported (and must be specified explicitly).
        2.  The user should provide output metadata via the `meta` keyword.

        Parameters
        ----------
        func : function
            Function to apply to each column/row
        axis : {0 or 'index', 1 or 'columns'}, default 0
            - 0 or 'index': apply function to each column (NOT SUPPORTED)
            - 1 or 'columns': apply function to each row
        $META
        args : tuple
            Positional arguments to pass to function in addition to the array/series

        Additional keyword arguments will be passed as keywords to the function

        Returns
        -------
        applied : Series or DataFrame

        Examples
        --------
        >>> import pandas as pd
        >>> import dask.dataframe as dd
        >>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
        ...                    'y': [1., 2., 3., 4., 5.]})
        >>> ddf = dd.from_pandas(df, npartitions=2)

        Apply a function to row-wise passing in extra arguments in ``args`` and
        ``kwargs``:

        >>> def myadd(row, a, b=1):
        ...     return row.sum() + a + b
        >>> res = ddf.apply(myadd, axis=1, args=(2,), b=1.5)  # doctest: +SKIP

        By default, dask tries to infer the output metadata by running your
        provided function on some fake data. This works well in many cases, but
        can sometimes be expensive, or even fail. To avoid this, you can
        manually specify the output metadata with the ``meta`` keyword. This
        can be specified in many forms, for more information see
        ``dask.dataframe.utils.make_meta``.

        Here we specify the output is a Series with name ``'x'``, and dtype
        ``float64``:

        >>> res = ddf.apply(myadd, axis=1, args=(2,), b=1.5, meta=('x', 'f8'))

        In the case where the metadata doesn't change, you can also pass in
        the object itself directly:

        >>> res = ddf.apply(lambda row: row + 1, axis=1, meta=ddf)

        See Also
        --------
        dask.DataFrame.map_partitions
        Nz]The `broadcast` argument is no longer used/supported. It will be dropped in a future release.)category)r  rawresult_typer   zEdd.DataFrame.apply only supports axis=1
  Try: df.apply(func, axis=1)T)rl   r<  r   )rl   r   )r  r  r  r  r   ri   r  r  rQ   rT   r   rf  r   r   )rk   ro   r  r  r  reducerl   r   r  rg  Zpandas_kwargsrO  rp   rp   rq   rT     s4    J

   zDataFrame.applyrf   c                 C  s   t tj| ||dS rz  )r&  rQ   applymap)rk   ro   r   rp   rp   rq   r  I  s    zDataFrame.applymapc                 C  s   t tj| |S r   r  r  rp   rp   rq   r  M  s    zDataFrame.roundc           	        s   |dkr.j j|d}jtj|d| ddS  fddjD }dt|  }|d	fttjd
d |D gdjifi}t	j
|||d}t||j dS d S )Nr  r  zseries-nuniqueF)r   r  r  r  rB  c                   s   g | ]}| j  d qS ))r  r  )r,  r  r  rk   r  rp   rq   ry   _  s   z%DataFrame.nunique.<locals>.<listcomp>r  r   c                 S  s   g | ]}|j d fqS r8  r   r  rp   rp   rq   ry   h  s     r   r   r   )r   r,  r   rQ   r=  r$   rT   r   r   rN   r   r   )	rk   r  r  r  r   Znunique_listr   r   r   rp   r  rq   r,  Q  s6    	  zDataFrame.nuniquec                 C  s   g }t t| jD ]0}| jd d |f }tj|||d}|| qdt|  }|dftt	j
dd |D gddifi}t	j
dd |D dd	}	tj|||d
}
t|
||	dd}|S )Nr(  zconcat-r   c                 S  s   g | ]}|j d fqS r8  r   rw   r  rp   rp   rq   ry     s     z"DataFrame.mode.<locals>.<listcomp>r  r  c                 S  s   g | ]
}|j qS rp   r   r  rp   rp   rq   ry     s     r  r   r   r  )r   ru   r=  r  r   r  r   r$   rT   r*   r   rN   r   r   )rk   r  r  Zmode_series_listZ	col_indexZ
col_seriesr  r   r   r   r   ddfrp   rp   rq   r  q  s2      	  zDataFrame.modec                 C  s   t | ||dS r$  )ri  )rk   r  r  rp   rp   rq   rj    s    zDataFrame.covrk  c                 C  s    |dkrt dt| |d|dS )Nrk  rl  Tr  )ri   ri  )rk   r'  r  r  rp   rp   rq   rm    s    zDataFrame.corrc                   s*  |dkrddl }|j}tt| g}t| jdkrn|d |dt| j  tr`|d t	|| dS i }|rd}|
| j|  d |r|
d| jtjdd	i tt| tj|  }|rddl}|d
 }|d }	|t| |dt| j d ddlm tfdd| jD d }
t|
d}|dj|djddd}|dj|d  fddtt| j|	| jD }|| d nt| jddg}|| dd t!| j" # tdD }|d d!$| |r|d % }|d"t&| d t	|| dS )#z6
        Concise summary of a Dask DataFrame.
        Nr   zIndex: 0 entriesr7  r.  T)r   r  rr  r^  r   r  zData columns (total z
 columns):pprint_thingc                 3  s   | ]}t  |V  qd S r   rt   r  r  rp   rq   r    s     z!DataFrame.info.<locals>.<genexpr>r  r   z             #   {{column:<{column_width}}} Non-Null Count  Dtype
            ---  {{underl:<{column_width}}} --------------  -----)column_widthZColumnz------)rM  ZunderlzP            {{i:^3}}  {{name:<{column_width}}} {{count}} non-null      {{dtype}}c                   s8   g | ]0\}\}}} j ||||d qS ))rx   r   r  r   rU  )rw   rx   r   r  r   Zcolumn_templater  rp   rq   ry     s   z"DataFrame.info.<locals>.<listcomp>r5  ZColumnsr-  c                 S  s   g | ]}d | qS )z%s(%d)rp   r  rp   rp   rq   ry     s    r   z
dtypes: {}z, zmemory usage: )'sysstdoutrN  r   ru   r=  r   r   r,   r`   r   r   r  r   rQ   rr  r  r  r  r   r   r  textwrapr@   Zpandas.io.formats.printingr  r   dedentr@  r  r  r  splitr
  r  r  r  r  r\   )rk   bufverboserr  r  linesZcomputationsr  r   countsspacer  headerZcolumn_infoZdtype_countsZ
memory_intrp   r  rq   info  sz    



 	
zDataFrame.infoc                 C  s&   | j tj||d}||j }|S Nrk  )r   rQ   rr  r  r   r  )rk   r   r  rE  rp   rp   rq   rr    s    zDataFrame.memory_usager  c                 C  s   ddl m} || ||||dS )a/  
        Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
        must have category dtype to infer result's ``columns``.
        ``index``, ``columns``, ``values`` and ``aggfunc`` must be all scalar.

        Parameters
        ----------
        values : scalar
            column to aggregate
        index : scalar
            column to be index
        columns : scalar
            column to be columns
        aggfunc : {'mean', 'sum', 'count'}, default 'mean'

        Returns
        -------
        table : DataFrame
        r   )pivot_table)r   r=  r  aggfunc)dask.dataframe.reshaper  )rk   r   r=  r  r  r  rp   rp   rq   r    s        zDataFrame.pivot_tabler  c                 C  s    ddl m} || |||||dS )a  
        Unpivots a DataFrame from wide format to long format,
        optionally leaving identifier variables set.

        This function is useful to massage a DataFrame into a format where
        one or more columns are identifier variables (``id_vars``), while
        all other columns, considered measured variables (``value_vars``),
        are "unpivoted" to the row axis, leaving just two non-identifier
        columns, 'variable' and 'value'.

        Parameters
        ----------
        frame : DataFrame
        id_vars : tuple, list, or ndarray, optional
            Column(s) to use as identifier variables.
        value_vars : tuple, list, or ndarray, optional
            Column(s) to unpivot. If not specified, uses all columns that
            are not set as `id_vars`.
        var_name : scalar
            Name to use for the 'variable' column. If None it uses
            ``frame.columns.name`` or 'variable'.
        value_name : scalar, default 'value'
            Name to use for the 'value' column.
        col_level : int or string, optional
            If columns are a MultiIndex then use this level to melt.

        Returns
        -------
        DataFrame
            Unpivoted DataFrame.

        See Also
        --------
        pandas.DataFrame.melt
        r   )melt)id_vars
value_varsvar_name
value_name	col_level)r  r  )rk   r  r   r  r  r  r  rp   rp   rq   r    s    +zDataFrame.meltc                 C  sJ   ddl m} |dkr&t| t }|| }| ||}|d f|_|S )Nr   )
to_recordsT)r  r  r   r   ru   r   r  r  )rk   r   r  r  recordsr  rp   rp   rq   r  G  s    zDataFrame.to_recordsc                 C  s6   |   j|dd}tdj|| jtt| jjddS )NFr2  dataframe.html.j2r8  r:  r   r;  	r,  to_htmlrc   renderr   r[   ru   r   r;  )rk   r3  r:  rp   rp   rq   r	  T  s    zDataFrame.to_htmlc                   s^   | j }| j |j}t|dkr:tjg gt  | d}n tj fdd| D dd}|S )Nr   r=  r   c                   s   g | ]\}}t | d qS )r^  )r  )rw   r  r!  r^  rp   rq   ry   f  s     z(DataFrame._repr_data.<locals>.<listcomp>r  r  )r   r0  r=  ru   r   r#  r   r  )rk   r   rR  Z	series_dfrp   r^  rq   r,  ^  s     zDataFrame._repr_datac                 C  s8   |   jdddd}tdj|| jtt| jjddS )Nr1  FT)r3  r4  Znotebookr  r8  r  r  )rk   r:  rp   rp   rq   _repr_html_j  s      zDataFrame._repr_html_c                   sJ   t |tr|n|g} fdd|D } | } |rF|j jd}|S )a~  
        Parameters
        ----------
        columns_or_index
            Column or index name, or a list of these

        Returns
        -------
        dd.DataFrame
            Dask DataFrame with columns corresponding to each column or
            index level in columns_or_index.  If included, the column
            corresponding to the index level is named _index
        c                   s   g | ]}  |r|qS rp   )_is_column_label_referencer  r   rp   rq   ry     s    
 z6DataFrame._select_columns_or_index.<locals>.<listcomp>)_index)r|   r   r  r  r   )rk   r  Zcolumn_namesZselected_dfrp   r   rq   _select_columns_or_indext  s    

z"DataFrame._select_columns_or_indexc                 C  s(   t | o&t|st|to&|| jkS )z
        Test whether a key is a column label reference

        To be considered a column label reference, `key` must match the name of at
        least one column.
        )r"   r~   r  r|   r   r=  r  rp   rp   rq   r    s
    
z$DataFrame._is_column_label_referencer=  )orientr   r=  c                C  s"   ddl m} ||||||| jdS )z
        Construct a Dask DataFrame from a Python Dictionary

        See Also
        --------
        dask.dataframe.from_dict
        r   )	from_dict)r  r   r=  constructor)r  r  r|  )r   r:  r   r  r   r=  r  rp   rp   rq   r    s    zDataFrame.from_dict)N)NN)NTr  NN)TFNNF)r1  NN)r1  NN)NNN)NN)N)NNN)N)Nr  r   )Fr   )r1  )r  N)r   )Nr   Nr  )r  NNNFFr  FNNN)Nr@  r.  r.  NN)F)Tr  )rf   )r   )FTr   )TF)NF)rk  NF)NFF)TF)NNNr  )NNNr  N)FN)r1  )ar   r   r   r   r   r#  r|  r   rC   r   r  r   r  r}  r   r  r   r  r=  r  r  r[  r  r  r  r  r  r  r  r   r  r  r.  r"  r  rU   r  r  r9  r  r  r  r4  r7  r~  r  r   r3   r  r   r  r  r  r  rH  rM  rP  rR  r
  r+  rT  r  r  r<  rj   r   r  rF  r   r  r-   r   r  r  r  r  rd  rA   rT   r  r  r,  r  rj  rm  r  rr  r  r  r  r	  r,  r  r  r  r  r  rp   rp   r'  rq   r#  b  s\  






	<




     7      I

&H





           
}      =

0d
Y
     
6
	
!  r#  )addsubmuldivdividetruedivfloordivmodpowZraddZrsubZrmulZrdivZrtruedivZ	rfloordivZrmodZrpow)ltgtlegeneeqc                   s:   dd  t to8jdko8jo8t fdd| D S )zP
    This Series is broadcastable against another dataframe in the sequence
    c                 S  s8   z| j |j |j fkW S  tk
r2   Y dS X d S )NF)r   r=  r   r   r   )r!  r  rp   rp   rq   compare  s    z!is_broadcastable.<locals>.comparer  c                 3  s"   | ]}t |tr |V  qd S r   )r|   r#  r  r"  r!  rp   rq   r    s     
 z#is_broadcastable.<locals>.<genexpr>)r|   r   r   r/  r  )dfsr!  rp   r#  rq   is_broadcastable  s    
r%  )r   r   r  c             	     sF  t | d t| f|| }t|}ddlm} ||}dd |D }dd |D | }	t|D ]r\}
ttsxqdt	fddD sd	t |  }t
|jd
krddd tjd
d D ||
< qdd j}|rptd trptd
krpz4| fdd|D |}t|tjr8t|}W n tk
rP   Y n X t|spdgd jd
  }tt tt dd t|D }t| |f||}tj|||	d}|tkr.tdkrt	dd |D sd}t| fdd|D }t t |  t!|| |d}W 5 Q R X t"||||}t#||S )a  Elementwise operation for Dask dataframes

    Parameters
    ----------
    op: callable
        Function to apply across input dataframes
    *args: DataFrames, Series, Scalars, Arrays,
        The arguments of the operation
    meta: pd.DataFrame, pd.Series (optional)
        Valid metadata for the operation.  Will evaluate on a small piece of
        data if not provided.
    transform_divisions: boolean
        If the input is a ``dask.dataframe.Index`` we normally will also apply
        the function onto the divisions and apply those transformed divisions
        to the output.  You can pass ``transform_divisions=False`` to override
        this behavior
    out : ``dask.array`` or ``None``
        If out is a dask.DataFrame, dask.Series or dask.Scalar then
        this overwrites the contents of it with the result
    **kwargs: scalars

    Examples
    --------
    >>> elemwise(operator.add, df.x, df.y)  # doctest: +SKIP
    r   r   r  c                 S  s    g | ]}t |tttfr|qS rp   r|   r   r   r   rw   r:  rp   rp   rq   ry   (  s      zelemwise.<locals>.<listcomp>c                 S  s   g | ]}t |tr|qS rp   r|   r   r  rp   rp   rq   ry   )  s     
 c                 3  s*   | ]"} j  p t j d  |jkV  qdS )r   N)r  ru   r   r  r  rp   rq   r  1  s     zelemwise.<locals>.<genexpr>z[When combining dask arrays with dataframes they must match chunking exactly.  Operation: %sr  c                 S  s   i | ]\}}|d  |qS r  rp   )rw   rx   drp   rp   rq   r  9  s      zelemwise.<locals>.<dictcomp>Nc                   s(   g | ] }| d  kr t |jn|qS r8  r   r$  r   r'  r$  rp   rq   ry   @  s     c                 S  s(   g | ] \}}t |tttfs||fqS rp   r&  )rw   rx   r:  rp   rp   rq   ry   N  s   r   rM  c                 s  s   | ]}t |d V  qdS )r   N)r   rw   r*  rp   rp   rq   r  Z  s     z>elemwise with 2 or more DataFrames and Scalar is not supportedc                   s:   g | ]2} |r|j nt|tr0tjd |jdn|jqS )rp   r   )r   r|   r   r~   r  r   r   r-  )_is_broadcastablerp   rq   ry   _  s   )functionr   )$rV   r$   _maybe_from_pandasr  r  r  r  r|   r   r  r  r.  Zrechunkr"  r   r$  ru   r   r*   r  r   rJ   r   r   r%  r   r   r  rN   r   r  ri   rI   r^   r   r  )r   r   r   r  rl   rm   r   r  Zdasksdepsrx   rO  r   r   r   r   r  rE  rp   )r.  r  r$  rq   r&    sd    

"

$


"
r&  c                 C  s$  t | tr:t| dkr | d } nt| dkr6tdnd} | dk	rn| j|jkrntdtt| tt|f t | trt| j	t|j	krt
dtt| j	tt|j	f t | tttfr|j| _|j| _|j| _t | ts|j| _n4| dk	rdtt| tt|f }t|n|S dS )zHandle out parameters

    If out is a dask.DataFrame, dask.Series or dask.Scalar then
    this overwrites the contents of it with the result
    r  r   z(The out parameter is not fully supportedNzDMismatched types between result and out parameter. out=%s, result=%szLMismatched columns count between result and out parameter. out=%s, result=%szHThe out parameter is not fully supported. Received type %s, expected %s )r|   r   ru   ri   rA  r   rN  r   r#  r=  r  r   r   r   r   r   r   r  rb   )r   rE  rO  rp   rp   rq   r  n  sD    









r  c                   s"   ddl m   fdd| D } | S )Nr   r\  c                   s2   g | ]*}t |st|r*t|s* |d n|qS r  )rE   rC   r"   r  r\  rp   rq   ry     s   z&_maybe_from_pandas.<locals>.<listcomp>)r  r]  r,  rp   r\  rq   r0    s
    
r0  c                 C  s:   |r|| f|pi }n| }t |dd}t| || ||dS )NFr^  r{   )r6   r5   )r  ZnpartsrV  rW  r{   hrp   rp   rq   
hash_shard  s
    r4  c                   s4   t dt |d t fddt|D S )z*Split dataframe into k roughly equal partsr   r  c                   s(   i | ] }| j | |d    qS r  r  rv   r  r   rp   rq   r    s      z split_evenly.<locals>.<dictcomp>)r~   linspaceru   r<  r  r   )r  r  rp   r6  rq   split_evenly  s    r8  c                 C  s*   | j }t|tjr&t| g |d }|S )Nr^  )r   r|   r   Z
MultiIndexrG   rG  )r  r3  rp   rp   rq   r1    s    r1  c                 C  s   | | S r   rp   )r  rR  rp   rp   rq   rY    s    rY  c                 K  s  |
dkrd}
|dkrt  }|dkr(t  }|| || |dkrZ|rPtd|}|}n|dkrht  }|| t| ttfs| g} dd | D }dd |D }t|dkrtd| }|	dkrd	}	n(|	d
kr|}	n|	dk st|	tstdt	|p||f|| ||||	|
||
}|p$t
| d| }t|fdd | D d|i|}|
r||
dkr||jt|
|||d| d}|dk	r|r|
dkrtd|pi }||d< |pt
| d| }t||j|tt|d|rt|f|n||rt|f|n||	|
r|
dkr|
nd|p(t
| d| d	}|tkrxt|f| ddi|}t|t|g|fddi|}t||rtt|d ddnd|d jd}tj|||fd}dg|
pdd  }t|||||d jdS )a  Apply a function to blocks, then concat, then apply again

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function concatenated-block -> block
        Function to operate on the concatenated result of chunk
    combine : function concatenated-block -> block, optional
        Function to operate on intermediate concatenated results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    split_out : int, optional
        Number of output partitions. Split occurs after first chunk reduction.
    split_out_setup : callable, optional
        If provided, this function is called on each chunk before performing
        the hash-split. It should return a pandas object, where each row
        (excluding the index) is hashed. If not provided, the chunk is hashed
        as is.
    split_out_setup_kwargs : dict, optional
        Keywords for the `split_out_setup` function only.
    sort : bool, default None
        If allowed, sort the keys of the output aggregation.
    ignore_index : bool, default False
        If True, do not preserve index values throughout ACA operations.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.

    Examples
    --------
    >>> def chunk(a_block, b_block):
    ...     pass

    >>> def agg(df):
    ...     pass

    >>> apply_concat_apply([a, b], chunk=chunk, aggregate=agg)  # doctest: +SKIP
    Nr  ro  c                 S  s   g | ]}t |tr|qS rp   r(  r'  rp   rp   rq   ry   )  s     
 z&apply_concat_apply.<locals>.<listcomp>c                 S  s   h | ]
}|j qS rp   r_  r'  rp   rp   rq   	<setcomp>+  s     z%apply_concat_apply.<locals>.<setcomp>z1All arguments must have same number of partitions   FrM  #split_every must be an integer >= 2-chunk-c                 S  s&   g | ]}t |tr|jd dn|qS )framerU  )r|   r   rT  r'  rp   rp   rq   ry   I  s   r  zsplit-%s)r  zcCannot guarantee sorted keys for `split_out>1`. Try using split_out=1, or grouping with sort=False.r!  z-agg-r2  	-combine-)Zfinalize_funcr  rU  Ztree_node_namer<  Tr   r   r=  r   r   )r  r   r  r|   r   r   ru   r  r   r$   rV   map_bag_partitionsr   r4  ri   rO   r   r   r   r  r  rF   r  r   rN   r   r   )rl   rS  rT  rs  r   r  rt  ru  rv  r  rU  rV  rW  r!  r{   rm   r$  r   	token_keyZ
chunk_namechunked
final_namer   Z
meta_chunkr   r   rp   rp   rq   apply_concat_apply  s    J







 
rC  c                   s   t | ttfr r| jS | jS t | tr: fdd| D S t | trZt fdd| D S t | tri }| D ]}t| |  ||< ql|S t | t	rt
dn| S dS )zO
    Extract internal cache data (``_meta``) from dd.DataFrame / dd.Series
    c                   s   g | ]}t | qS rp   r  rw   r  r  rp   rq   ry     s     z!_extract_meta.<locals>.<listcomp>c                 3  s   | ]}t | V  qd S r   rD  rE  r  rp   rq   r    s     z _extract_meta.<locals>.<genexpr>z>Cannot infer dataframe metadata with a `dask.delayed` argumentN)r|   r   r   r   r   r   r   r  r  rK   r  )r  r  r  r  rp   r  rq   r    s     



r  r<  c                O  s\   t t| |dB t 0 | t|dt|dW  5 Q R  W  5 Q R  S Q R X W 5 Q R X dS )z
    Apply a function using args / kwargs. If arguments contain dd.DataFrame /
    dd.Series, using internal cache (``_meta``) for calculation
    rF  TN)rI   rV   r.   r  )ro   r<  rl   rm   rp   rp   rq   r    s    r  )r   rB  r  align_dataframesc             
     s  | dd}| dd}t| s$t|dk	r>t|f||}	nt| }t| |f||}	| d|	 }ddlm}
 |rt|}z|
|}W n2 tk
r } zt| d|W 5 d}~X Y nX dd	 |D }t	||| |||}t
d
d |D r0|dft| tdd	 |D f|fi}tj|||d}t|||S g }g }|D ]d}t|trd|| || q<t|}t|\}}|r|| || n
|| q<i }d}| D ]:\}}t|}t|\}}|| |||< |rd}qt|||| ||}t| drFdd t|dd D }|dt| |   fdd} |rltt|f||| |d|}n(|rv|n|}t| |f||d|i}tj|||d}t||||S )aI  Apply Python function on each DataFrame partition.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    args, kwargs :
        Arguments and keywords to pass to the function.  At least one of the
        args should be a Dask.dataframe. Arguments and keywords may contain
        ``Scalar``, ``Delayed`` or regular python objects. DataFrame-like args
        (both dask and pandas) will be repartitioned to align (if necessary)
        before applying the function (see ``align_dataframes`` to control).
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work,
        but it won't raise if dtypes don't match.
    transform_divisions : bool, default True
        Whether to apply the function onto the divisions and apply those
        transformed divisions to the output.
    align_dataframes : bool, default True
        Whether to repartition DataFrame- or Series-like args
        (both dask and pandas) so their divisions align before applying
        the function. This requires all inputs to have known divisions.
        Single-partition inputs will be split into multiple partitions.

        If False, all inputs must have either the same number of partitions
        or a single partition. Single-partition inputs will be broadcast to
        every partition of multi-partition inputs.
    $META
    r  Nr   r   r   r  z{. If you don't want the partitions to be aligned, and are calling `map_partitions` directly, pass `align_dataframes=False`.c                 S  s   g | ]}t |tr|qS rp   r(  r  rp   rp   rq   ry     s     
 z"map_partitions.<locals>.<listcomp>c                 s  s   | ]}t |tV  qd S r   r|   r   r'  rp   rp   rq   r    s     z!map_partitions.<locals>.<genexpr>c                 S  s   g | ]}|j d fqS r8  r   r'  rp   rp   rq   ry     s     r   TFpartition_infoc                 S  s   i | ]\}}|f||d qS ))r1  r  rp   )rw   rx   r  rp   rp   rq   r    s    z"map_partitions.<locals>.<dictcomp>r6  c                   s    ||d| iS )NrI  rp   )rI  rl   rm   Z	orig_funcrp   rq   ro   $  s    zmap_partitions.<locals>.func)r   _funcr   r   )r  r   AssertionErrorr$   rV   r  r  r0  r  _get_meta_map_partitionsr  rT   r   rN   r   r   r|   r   r   r   rM   r  r  _get_divisions_map_partitionsrW   r  insertr'   r  apply_and_enforcer   )ro   r   rB  r  rG  rl   rm   r   r   r  r  er$  r   r   r   r   r:  Zarg2collectionsZkwargs3simpler  r  r   rI  r   Zkwargs4rp   rJ  rq   r     s    *



     
 r   c                   s   | r d j }ntdd  D td}|rt d trt dkrz2| fdd|D |}t|tjrtt|}W n tk
r   Y nX t	|sdg d j
d  }|S )	zL
    Helper to get divisions for map_partitions and map_overlap output.
    r   c                 s  s   | ]}|j V  qd S r   r  r-  rp   rp   rq   r  F  s     z0_get_divisions_map_partitions.<locals>.<genexpr>r   r  c                   s(   g | ] }| d  kr t |jn|qS r8  r+  )rw   r  r,  rp   rq   ry   J  s     z1_get_divisions_map_partitions.<locals>.<listcomp>N)r   r   ru   r|   r$  r   r*   r  r   rJ   r   )rG  r  r$  ro   rl   rm   r   rp   r,  rq   rN  ;  s      rN  c                 C  s   |rt t|d ddnd}|dkr2|r2|d j}|tkrXt|f| ddi|}d}nt|||d}d}t|st|r|jstdd	 | D s|st	
d
t tt|g|d}t||d}|S )zP
    Helper to generate metadata for map_partitions and map_overlap output.
    r   r   Nr<  Tr=  Fc                 s  s   | ]}t |tV  qd S r   rH  r'  rp   rp   rq   r  f  s    z+_get_meta_map_partitions.<locals>.<genexpr>zMeta is not valid, `map_partitions` and `map_overlap` expects output to be a pandas object. Try passing a pandas object as meta or a dict or tuple representing the (name, dtype) of the columns. In the future the meta you passed will not work.r^  r   )r  rF   r   r  r  r   rX   r"  r  r  r  r  r   )rl   r$  ro   rm   r   r   Z
meta_indexZmeta_is_emulatedrp   rp   rq   rM  V  s&    
rM  c                  O  sp   | d}| d}|| |}t|s6t|s6t|rlt|sB|S t|r\t|| |j}n|j}t||S |S )zsApply a function, and enforce the output to match meta

    Ensures the output has the same columns, even if empty.rK  r   )	r  rC   rE   rD   ru   r<   r=  r   _rename)rl   rm   ro   r   r  r{  rp   rp   rq   rP  z  s    




rP  c                 C  s   t |trt| tkr|S t | tr,t| } t|rt| rB| j} t | tj	sXt	| } t
| t
|jkrt| t|jkr| |jr|S |jdd}| |_|S t|st|rt| st| r| j} |j| kr|S || S |S )au  
    Rename columns of pd.DataFrame or name of pd.Series.
    Not for dd.DataFrame or dd.Series.

    Parameters
    ----------
    columns : tuple, string, pd.DataFrame or pd.Series
        Column names, Series name or pandas instance which has the
        target column names / name.
    df : pd.DataFrame or pd.Series
        target DataFrame / Series to be renamed
    Fr  )r|   r   rL  r  r   r   rC   r=  r   r$  ru   r   equalsr  rE   rD   r   r   )r=  r  rp   rp   rq   rT    s6    




rT  c                 C  sZ   t | tstt|| j}dt| | }tt||| }tj||| gd}t	|||| j
S )a  
    Destructively rename columns of dd.DataFrame or name of dd.Series.
    Not for pd.DataFrame or pd.Series.

    Internally used to overwrite dd.DataFrame.columns and dd.Series.name
    We can't use map_partition because it applies function then rename

    Parameters
    ----------
    df : dd.DataFrame or dd.Series
        target DataFrame / Series to be renamed
    names : tuple, string
        Column names/Series name
    zrename-r   )r|   r   rL  rT  r   r$   r  rN   r   r   r   )r  namesmetadatar   r   r   rp   rp   rq   r    s    r  r"  c              	     s  t }|jdkr$|jdd |t| ts2tdddg}||krLtd|dkrZd}n|}t| tr| j	
 j| j
 }n(t| j	r| j	jn| j	j| j}t|r| jfdd	}t}nd
d	 }t}gt d }	t| |	}
t|	dkrJd|
 tjg td}tdfg | j|ddi| j	ddgS t t g}|  } |dkrt | jt jst | jt jrddlm} |dd ddl m!}m" d|
 fddt#| $ D }d|
 }|df|||	t%|fi}nddl&m'  ddl m(} t j)|	ddddd< d|
  fd dt#| $ D }d!|
 }|df|||	g| j* t%|d"dd#fi}t+||}t,j-||| gd$}|||||S )%a  Approximate quantiles of Series.

    Parameters
    ----------
    q : list/array of floats
        Iterable of numbers ranging from 0 to 100 for the desired quantiles
    method : {'default', 'tdigest', 'dask'}, optional
        What method to use. By default will use dask's internal custom
        algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest for
        floats and ints and fallback to the ``'dask'`` otherwise.
    r   Z	mergesort)kindr"  r   Ztdigestz1method can only be 'default', 'dask' or 'tdigest'c                   s   | d  fS r   rp   Ztsk)df_namer$  
series_typrp   rq   r     r   zquantile.<locals>.<lambda>c                 S  s
   t | dfS r   r
   rZ  rp   rp   rq   r     r   d   z
quantiles-r   r_  )r   r   r   N)import_requiredZcrickz=crick is a required dependency for using the t-digest method.)_percentiles_from_tdigest_tdigest_chunkzquantiles_tdigest-1-c                   s$   i | ]\}}|f t |d ffqS )r  )r  r  )r`  r   rp   rq   r  )  s    zquantile.<locals>.<dictcomp>zquantiles_tdigest-2-)percentile_lookup)merge_percentilesr  Zconstant)r  r6  zquantiles-1-c                   s    i | ]\}}|f |fqS rp   rp   r  )_percentilecalc_qsr   rp   rq   r  ;  s    zquantiles-2-rE  Fr   ).r~   r  r.  r!  r|   r   rL  r  r$  r   r  r   r   r%  rE   Z_constructor_slicedr   r   r   r$   ru   r   r_  r   r   r  r;  r   Zfloatinginteger
dask.utilsr^  Zdask.array.percentiler_  r`  r  r   r
  Zdask.array.dispatchra  rb  rf  r   r   rN   r   )r  r$  r'  Z	q_ndarrayZallowed_methodsZinternal_methodr   Zfinalize_tskr   qsr  Zempty_indexZnew_divisionsr^  r_  Zval_dskr  Z	merge_dskrb  r   r   rp   )rc  r`  rd  r[  r   r$  r\  rq   r%    s    







 
 
 

r%  c                   s  |dkrd}n|dk rt d|dkr.| j}n|dk s@t|tsHt d|  } |rjt| jdkrjt dt| |||}rdnd}| d	| j   fd
dt	| 
 D }| d| j d}| j}	 }
d}|	|kr8|t| }
t	t|t|	D ](\}}t fdd|D f||
|f< q|d }	|
 |d7 }q| d| }t fddt|	D | j||| jf||df< tj||| gd}|rt||dS tdd | jD t| | j| jd}t|||| j | j fS )ag  DataFrame covariance and pearson correlation.

    Computes pairwise covariance or correlation of columns, excluding NA/null
    values.

    Parameters
    ----------
    df : DataFrame
    min_periods : int, optional
        Minimum number of observations required per pair of columns
        to have a valid result.
    corr : bool, optional
        If True, compute the Pearson correlation. If False [default], compute
        the covariance.
    scalar : bool, optional
        If True, compute covariance between two variables as a scalar. Only
        valid if `df` has 2 columns.  If False [default], compute the entire
        covariance/correlation matrix.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is False.
    NrM  zmin_periods must be >= 2Fr;  z(scalar only valid for 2 column dataframerm  rj  r<  c                   s    i | ]\}} |ft |fqS rp   )cov_corr_chunk)rw   rx   r   r  rm  rp   rq   r  ~  s     zcov_corr.<locals>.<dictcomp>r>  r   r   c                   s   g | ]} |fqS rp   rp   rv   r)  rp   rq   ry     s     zcov_corr.<locals>.<listcomp>r  c                   s   g | ]} |fqS rp   rp   rv   r)  rp   rq   ry     s     r   r5  c                 S  s   g | ]}|d fqS )r5  rp   rz  rp   rp   rq   ry     s     r=  )r  r   r|   r   rj   ru   r=  r$   r   r  r   rN  r   r   cov_corr_combinecov_corr_aggr   rN   r   r   rF   rH   r   r   r   )r  r  rm  r  r  r  rV   r   r  r  r  depthZpart_iZindsr   r   r   rp   ri  rq   ri  S  s\    

"
	ri  c              	   C  s  | j d | j d f}| jddd} tj| j|d}tj| j|d}tt| jD ]>}| jdd|f 	 }| | 
 j||< | |  j||< qP|  j}d|jfd|jfd	|jfg}|rtjd
d td || j}	W 5 Q R X tj| j|d}
|  j}tt| jD ]^}t| jdd|f jdddf |	| dddf d }tj||< tj|dd|
|< q|
j}
|d|
jf ||||d  d}|r|
|d< |S )z5Chunk part of a covariance or correlation computationr  r   F)r  )r"  Nr  r  rj  T)recordalwaysrM  r   r  mr  r  rj  )r"  r<  r~   Z
zeros_liker  r   ru   r=  r  r  r  r  rj  r   r  catch_warningssimplefilterr  r  subtractr`  nansumr   )r  rm  r"  sumsr  idxrJ  rj  r   muro  Zmu_discrepancyr   rp   rp   rq   rh    s6    


:
rh  c              	     s  d d d d}|rd |d< |  D ]J  fdd| D | < t|  t|  f|  d j | < q t|d }|d }t|d}t|d}|d d }|d	d  }|d d }	|d	d  }
tjd
dL ||
 ||	  }t	|	|
 |	|
  ||
d  dt	|d d }W 5 Q R X |d |d |d}|rt|d |d tj}|d | }t||tj}tj	|d ||| | d   dd}||d< |S )Nrp  ro  c                   s   g | ]}|  qS rp   rp   r-  r  rp   rq   ry     s     z$cov_corr_combine.<locals>.<listcomp>r   r  r  r6  r  ignore)invalid)r   rM  r  rj  rM  r  )r  r~   concatenateZreshaperu   r"  Z
nan_to_numr  errstatert  Z	transposer  r`  )Zdata_inrm  r:  ru  r  Zcum_sumsZ
cum_countss1s2Zn1Zn2r*  Cr   Znobsrw  Z	counts_naro  rp   rx  rq   rj    s<    2 &rj  rM  c              	   C  s   t | |}|d }|d }tj|||k < |rF|d }	t|	|	j }
nt||tjd }
tjddd ||
 }W 5 Q R X |rt|d S |d krtj	nt
||||dS )	Nr  rj  ro  r  ry  )rz  r  r   r  r  )rj  r~   r`  rY  r  r  r|  r_  r   r#  rG   )r:  rR  r  rm  r  Zlike_dfr   r  r  m2Zdenmatrp   rp   rq   rk    s"    
  rk  c                   s`   t |}|r4t|tjjs&tj|} jd|d tt || fddtt|D S )a  Split DataFrame into multiple pieces pseudorandomly

    >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
    ...                    'b': [2, 3, 4, 5, 6, 7]})

    >>> a, b = pd_split(
    ...     df, [0.5, 0.5], random_state=123, shuffle=True
    ... )  # roughly 50/50 split
    >>> a
       a  b
    3  4  5
    0  1  2
    5  6  7
    >>> b
       a  b
    1  2  3
    4  5  6
    2  3  4
    r  )r  r  c                   s   g | ]} j |k qS rp   r5  rv   r  r   rp   rq   ry     s     zpd_split.<locals>.<listcomp>)	r   r|   r~   r  r  r  r_   ru   r   )r  pr  r  rp   r  rq   r}    s    r}  c                   s~   dd  |dkrj d S trrtj dddf }jrJ|g ddS | fd	d
ttjD jdS  S dS )z
    take last row (Series) of DataFrame / last value of Series
    considering NaN.

    Parameters
    ----------
    a : pd.DataFrame or pd.Series
    skipna : bool, default True
        Whether to exclude NaN

    c                 S  s\   t dtdt| d D ]"}| j|  }t|s|  S q| |   }|jsX|jd S d S )Nr  r   r6  )r   r   ru   r  r   r  Znotnar  )r!  rx   valZnonnullrp   rp   rq   _last_valid&  s    


z_take_last.<locals>._last_validFr6  r   r  r_  r   c                   s"   g | ]} j d d |f qS r   r5  rv   r  r  rp   rq   ry   <  s     z_take_last.<locals>.<listcomp>r^  N)r  rC   r   r  r   ru   r=  )r  r  r\  rp   r  rq   r    s    
r  c                 C  s~   t | ttfstdt| } t| dkr2td| t| krFtdt| d d ttt| d d krzd}t|d S )Nz"New division must be list or tupler   zNew division must not be emptyzNew division must be sortedr6  z8New division must be unique, except for the last element)r|   r   r   r  ru   r
  r   )r   rO  rp   rp   rq   r  C  s    (r  c                 C  s:  t | t|dk rtd|rZ| d |d k r<d}t|| d |d krd}t|n8| d |d krvd}t|| d |d krd}t|d	d
 }| d g}t }	| d }
d\}}d}|| }|t| k r|t|k r| | || k r*tj||d f|
| | df|	||f< | | }
|d7 }n| | || krrtj||d f|
|| df|	||f< || }
|d7 }ndtj||d f|
|| df|	||f< || }
t| |d ks| | | |d  k r|d7 }|d7 }||
 |d7 }q| d |d k s|d |d krlt|t|D ]L}t| d }tj||f|
|| df|	||f< || }
||
 |d7 }qnR|r|t| k rtj||d f| | | | df|	||f< |d7 }|| d  |	||d f dd d |	||d f< d\}}||}|t|k r6g }|| || k r4|||f |d7 }q|r|| |d kr|d |d ksp|t|d kr||k r|||f |d7 }q4t|dkrtj|df| d | d df|	||d f< n^t|dkr|d |	||d f< n:|stdt| t|t|f tj	|f|	||d f< |d7 }q|	S )a  dask graph to repartition dataframe by new divisions

    Parameters
    ----------
    a : tuple
        old divisions
    b : tuple, listmypy
    out2 : str
        name of new dataframe
    force : bool, default False
        Allows the expansion of the existing divisions.
        If False then the new divisions lower and upper bounds must be
        the same as the old divisions.

    Examples
    --------
    >>> from pprint import pprint
    >>> pprint(repartition_divisions([1, 3, 7], [1, 4, 6, 7], 'a', 'b', 'c'))  # doctest: +ELLIPSIS
    {('b', 0): (<function boundary_slice at ...>, ('a', 0), 1, 3, False),
     ('b', 1): (<function boundary_slice at ...>, ('a', 1), 3, 4, False),
     ('b', 2): (<function boundary_slice at ...>, ('a', 1), 4, 6, False),
     ('b', 3): (<function boundary_slice at ...>, ('a', 1), 6, 7, True),
     ('c', 0): (<function concat at ...>, [('b', 0), ('b', 1)]),
     ('c', 1): ('b', 2),
     ('c', 2): ('b', 3)}
    rM  z+New division must be longer than 2 elementsr   zHleft side of the new division must be equal or smaller than old divisionr6  zHright side of the new division must be equal or larger than old divisionz0left side of old and new divisions are differentz1right side of old and new divisions are differentc                 S  s   t | dko| d | d kS )z0Whether last division only contains single labelrM  r6  r  rt   r  rp   rp   rq   _is_single_last_div  s    z2repartition_divisions.<locals>._is_single_last_div)r  r  r  Fr  N)Tr  z=check for duplicate partitions
old:
%s

new:
%s

combined:
%s)
r  ru   r  r  r*   r  r   r   r   r   )r  r  r   Zout1Zout2r  rO  r  r{  r*  lowrx   r  r  Z	last_elemZ_jro  tmprp   rp   rq   repartition_divisionsP  s    

$
$
$(

$ 
((*r  c                 C  s   t | jd tjstdt|}z| jd |}W n tk
rT   | jd }Y nX t	tj
|| jd |d}t|s| jd | jd g}n2|| jd  |d | jd kr| jd g| }| j|dS )z5Repartition a timeseries dataframe by a new frequencyr   z0Can only repartition on frequency for timeseriesr6  )r  r  r  r  )r|   r   r   	Timestampr   _map_freq_to_period_startceilr  r*   r  Z
date_rangeru   r   r  )r  r  r  r   rp   rp   rq   r    s     r  c                 C  s   t | ts| S tjj| }t|j}|ds4| S |dt	d  d }z^t
tjj|}d| krz| d\}}d| }nd}|jdkrt|jnd}| |j | W S  tk
r   |  Y S X dS )a  Ensure that the frequency pertains to the **start** of a period.

    If e.g. `freq='M'`, then the divisions are:
        - 2021-31-1 00:00:00 (start of February partition)
        - 2021-2-28 00:00:00 (start of March partition)
        - ...

    but this **should** be:
        - 2021-2-1 00:00:00 (start of February partition)
        - 2021-3-1 00:00:00 (start of March partition)
        - ...

    Therefore, we map `freq='M'` to `freq='MS'` (same for quarter and year).
    ZEndNZBeginr   r.  r  )r|   rN  r   r  r  r  r   r   endswithru   r  offsetsr  r  _prefixr  )r  r  Zoffset_type_nameZ
new_offsetZnew_offset_typer  anchorr  rp   rp   rq   r    s"    



r  c                 C  s   t |trt|}t|}| jtdd }d||  }t|dkrd| dt	|  }t
| ||} g }t||D ]\}}||| g|  qtt|}t||ksttttt||}t|}	d| dt	|  }
t| |	|
S )zb
    Repartition dataframe so that new partitions have approximately `size` memory usage each
    TrU  r  repartition-split-r   zrepartition-)r|   rN  r]   r  r   rl  r   r~   r  r$   _split_partitionsr  r  r   r   r  rL  r   r8  ru   rY   r  _repartition_from_boundaries)r  r  Z
mem_usagesnsplits
split_nameZsplit_mem_usagesr  usageZnew_npartitionsnew_partitions_boundariesnew_namerp   rp   rq   r     s"    


r  c                 C  s"   | j ||d}t|r| }|S r  )rr  rE   r  )r  r   r  Z	mem_usagerp   rp   rq   rl  =  s    rl  c           
        s  d|t | f }| j|kr| S | j|krX| j|   fddt|d D }t| ||S t| j  }}| jrt	|j
st|j
rt	|j
r|jd}t|r|j}t|}tjtd||d td|||d}t	|j
rtt||j
}nt|j
tjr"||j
}t|tjr8| }t|}| jd |d< | jd |d< tt|d	d |d g }| j|d
S t|| j\}}|g| j }	|	d  |7  < t| |	|S d	S )z7Repartition dataframe to a smaller number of partitionszrepartition-%d-%sc                   s   g | ]}t |  qS rp   )r  )rw   Znew_partition_indexZnpartitions_ratiorp   rq   ry   K  s   z+repartition_npartitions.<locals>.<listcomp>r  r   r   )r  Zxpfpr6  Nr  )r$   r   r   r  r   r   r   rX  r/  r   r   r   r  r<  rE   ru   r~   Zinterpr7  r*   r  r;  re  r|   r   r   r   r  divmodr  )
r  r   r  r  Zoriginal_divisionsr   r  r  r  r  rp   r  rq   r  D  sT    





r  c           	        s   t |tst|}|d dkr*|dd |d  jk rD| j i }tt||dd  D ]2\}\}}tj fddt	||D f|||f< q^ fdd|D }t
j|| gd}t|| j|S )Nr   r6  r  c                   s   g | ]} j |fqS rp   r   r  r  rp   rq   ry     s     z0_repartition_from_boundaries.<locals>.<listcomp>c                   s   g | ]} j | qS rp   r  rv   r  rp   rq   ry     s     r   )r|   r   rO  r   r   r  r  r*   r   r   rN   r   r   r   )	r  r  r  r   rx   r  r  r   r   rp   r  rq   r  |  s    
(r  c                 C  s   t || jkrtd| j i }dt| | }d}t|D ]p\}}|dkrj| j|f|||f< |d7 }q>t| j|f|f|||f< t|D ]"}t||f|f|||f< |d7 }qq>dgdt	|  }	t
j||| gd}
t|
|| j|	S )aY  Split a Dask dataframe into new partitions

    Parameters
    ----------
    df: DataFrame or Series
    nsplits: List[int]
        Number of target dataframes for each partition
        The length of nsplits should be the same as df.npartitions
    new_name: str

    See Also
    --------
    repartition_npartitions
    repartition_size
    znsplits should have len=r|  r   r  Nr   )ru   r   r  r$   r  r   r8  r   r   r  rN   r   r   r   )r  r  r  r   r  r  rx   r  Zjjr   r   rp   rp   rq   r    s     
r  c           
        s   t | |}t| tr^d| }d| }t| j|| j|||d}tj||| gd}t||| j	|S t
| snt| rd|  ddlm} || |dd	 }	 fd
dt|	D }t| | |S tddS )a  Repartition dataframe along new divisions

    Dask.DataFrame objects are partitioned along their index.  Often when
    multiple dataframes interact we need to align these partitionings.  The
    ``repartition`` function constructs a new DataFrame object holding the same
    data but partitioned on different values.  It does this by performing a
    sequence of ``loc`` and ``concat`` calls to split and merge the previous
    generation of partitions.

    Parameters
    ----------

    divisions : list
        List of partitions to be used
    force : bool, default False
        Allows the expansion of the existing divisions.
        If False then the new divisions lower and upper bounds must be
        the same as the old divisions.

    Examples
    --------

    >>> df = df.repartition([0, 5, 10, 20])  # doctest: +SKIP

    Also works on Pandas objects

    >>> ddf = dd.repartition(df, [0, 5, 10, 20])  # doctest: +SKIP
    r  zrepartition-merge-r  r   zrepartition-dataframe-r   )shard_df_on_indexr  r6  c                   s   i | ]\}} |f|qS rp   rp   )rw   rx   r  r-  rp   rq   r    s      zrepartition.<locals>.<dictcomp>z Data must be DataFrame or SeriesN)r$   r|   r   r  r   r   rN   r   r   r   rC   rE   dask.dataframe.utilsr  r  r  )
r  r   r  r  r  r   r   r   r  r$  rp   r-  rq   r    s*    

     r  c                 K  s"   || f|}t |r| jS |S r   )rE   rX  r  )r  rp  rm   r   rp   rp   rq   rw    s    rw  c                 K  s6   t | trt| } || f|}t|r2| jS |S r   )r|   r   r   r   rE   rX  r  )r  rq  rm   r   rp   rp   rq   ry    s    

ry  c                 K  s    t | trt| } || f|S r   )r|   r   r   r   )r  rr  rm   rp   rp   rq   rx    s    

rx  c                 C  s   |dkrdnd}t | dkr>t| ||d}t| ||d}nt| g dd }}t|rlt| ||dS t| |g|gdS )	Nr  r   r   r   r  rH  r   rv  r  )ru   r  rH   rE   rG   r  r  r  Zminmaxrv  r  rp   rp   rq   r    s    r  c                 C  sv   |dkrdnd}t | dkrP| d} t| j||dg}t| j||dg}nt| g dd }}t| ||d	S )
Nr  r   r   r   rv  r  rH  r   r  )ru   r  r  r  rH   rG   r  rp   rp   rq   idxmaxmin_row  s    
r  c                 C  s2   t | dkr| S | jddjt||djdddS )Nr  r   )r\  )r  r  T)r\  rF  )ru   r  rT   r  rG  )r  r  r  rp   rp   rq   r    s       r  c                 C  s<   t | ||dd }t|dkr&td|r2|d S d |_|S )Nr  rv  r   z*attempt to get argmax of an empty sequence)r  ru   r  r   )r  r  r  r  r  rp   rp   rq   r    s    r  c                 C  s6   |   }|j|d}|||k j  jdd}|S )Nr  T)rF  )r  r   r   r  r  rG  )r  r  Zvalue_count_seriesZmax_valr  rp   rp   rq   r  "  s    r  c                 C  s   |   dS )NZint64)r  r<  r  rp   rp   rq   r  .  s    r  c                 C  s8   t | |}t||kr4td| dt| d |S )Nz"Insufficient elements for `head`. z elements requested, only z@ elements available. Try passing larger `npartitions` to `head`.)rQ   r  ru   r  r  )r  r  rrp   rp   rq   r  2  s    r  c                 C  s   t |trtjj|}t |tj}|rB| s:t|dsB| 	 S | j
rtjtt| j| jd}|j||dj}| | j| j| j|S | S )a  Maybe shift divisions by periods of size freq

    Used to shift the divisions for the `shift` method. If freq isn't a fixed
    size (not anchored or relative), then the divisions are shifted
    appropriately. Otherwise the divisions are cleared.

    Parameters
    ----------
    df : dd.DataFrame, dd.Series, or dd.Index
    periods : int
        The number of periods to shift.
    freq : DateOffset, timedelta, or time rule string
        The frequency to shift by.
    r  r^  r  )r|   rN  r   r  r  r  Z
DateOffsetr  r   rH  r/  r   r   ru   r   r  r   rA  r   r   r   )r  r  r  Z	is_offsetr  r   rp   rp   rq   r  <  s    
r  c                 K  s   | drddini }|d krt| tr@tjg f|}| j|_nNt| sZt| sZtdn4t	| tj
d|g}|j| jj|_| jj|j_ttj| fd|i|S )NutctzzSdask.dataframe.to_datetime does not support non-index-able arguments (like scalars)2000r   )r  )rh   r|   r$  r   r  r   rC   rE   ri   rH   r  r   r<  r   r   to_datetime)r:  r   rm   Ztz_kwargrp   rp   rq   r  \  s    

r  r  c                 C  s<   t s|d krd}t| tjd|dg}ttj| |||dS )Nnsr  )unit)r  r  r   )r9   rH   r   Z	Timedeltar   to_timedelta)r:  r  r  r   rp   rp   rq   r  o  s    r  rI  c                 C  s   t tj| S r   )r   r   rI  )r:  rp   rp   rq   rI  y  s    c                 C  sP   t |d }t| r(t| r"d}q2d}n
t| j}tj|gdg|  || jdS )z1A helper for creating the ``_repr_data`` propertyr  zcategory[known]zcategory[unknown]r   r  )ru   rB   r?   rN  r   r   r   r   )r!  r   r   r   rp   rp   rq   r  ~  s    
r  c                 C  s   t | tk	S )z2Does this object have a dask dataframe equivalent?)r4   r   r  rp   rp   rq   r     s    r   c           
      C  s  t |rt|| |||S t|r|jrddlm} tjft|d  ft	dd |jdd D  }t|dkr| j
| }t|tr|d d |jd< |jd |_n@dt|d  }tt|d D ]}	|||	f|||	f| < q|j| |||jd	S t|| |||S dS )
zGeneric constructor for dask.dataframe objects.

    Decides the appropriate output class based on the type of `meta` provided.
    r   Nr  c                 s  s   | ]}|fV  qd S r   rp   r-  rp   rp   rq   r    s    z new_dd_object.<locals>.<genexpr>r  )r  r8  )r   r  r   )r   r4   rX   r"  rj  r  r~   r`  ru   r   r;  r|   r%   Znew_axesZoutput_indicesr   r  r   r   )
r   r   r   r   r   r   r  r   r  rx   rp   rp   rq   r     s"    


r   c                 O  sn  g }i }|D ]@}t |tr<||jdg |jf||j< qt |trb||jdg d||j< qt |tr|jdkr||jdg n@|jdkr||jdg n$|jdkr||jdg nt	d|j
||j< qt |tr@t|j
dkr||dg n<t|j
dkr"||dg nt	d	|d
t|j
 dq||dg qt| |df||dd|S )a  
    Apply a function partition-wise across arguments to create layer of a graph

    This applies a function, ``func``, in an embarrassingly parallel fashion
    across partitions/chunks in the provided arguments.  It handles Dataframes,
    Arrays, and scalars smoothly, and relies on the ``blockwise`` machinery
    to provide a nicely symbolic graph.

    It is most commonly used in other graph-building functions to create the
    appropriate layer of the resulting dataframe.

    Parameters
    ----------
    func: callable
    layer_name: str
        Descriptive name for the operation. Used as the output name
        in the resulting ``Blockwise`` graph layer.
    *args:
    **kwargs:

    Returns
    -------
    out: Blockwise graph

    Examples
    --------
    >>> subgraph = partitionwise_graph(function, x, y, z=123)  # doctest: +SKIP
    >>> layer = partitionwise_graph(function, df, x, z=123)  # doctest: +SKIP
    >>> graph = HighLevelGraph.from_collections(name, layer, dependencies=[df, x])  # doctest: +SKIP
    >>> result = new_dd_object(graph, name, metadata, df.divisions)  # doctest: +SKIP

    See Also
    --------
    map_partitions
    rx   r  r  r   r.  rM  Zijz/Can't add multi-dimensional array to dataframeszBlockwiseDep arg z has z' dimensions; only 1 or 2 are supported.NT)	numblocksr{  )r|   r   r  r   r   r   r   r.  r   r  r  r&   ru   r(   )ro   Z
layer_namerl   rm   r  r  r:  rp   rp   rq   r    sN    $






   r  c                 C  s\   t | r"dd | j  D }nt| r<| jt| jf}nd}d}|rX|dt| 7 }|S )zS
    Provide an informative message when the user is asked to provide metadata
    c                 S  s   i | ]\}}|t |qS rp   r  r  rp   rp   rq   r    s      z meta_warning.<locals>.<dictcomp>Na<  
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.z8
  Before: .apply(func)
  After:  .apply(func, meta=%s)
)rC   r  Zto_dictr  rE   r   rN  r   )r  Zmeta_strrO  rp   rp   rq   rf    s    rf  c                 K  s2  t  }dt| ||f| }|j}t|jd }dg|d  }d}	|	|k rR|	d9 }	q@t|D ]&}
t| |j|
f|g|f|||
ddf< qZt||	D ]}
||||
ddf< qd}||	k r td|	d| D ]T}
t| ||
| d |df||
d|  d |dfg|f|||
d|  d d| df< q|d9 }q||||	d |	df< |dkr|d }td|	d| D ]}
||
d|  d d| df|||
| d |df< t| ||
d|  d d| df||
| d |dfg|f|||
d|  d |df< qVq4t|D ]&}
t| ||
ddf|g|f|||
f< qtj	|||gd}t
||||S )a  Computes the prefix sums of f on df

    If df has partitions [P1, P2, ..., Pn], then returns the DataFrame with
    partitions [f(identity, P1),
                f(f(identity, P1), P2),
                f(f(f(identity, P1), P2), P3),
                ...]

    Parameters
    ----------
    f : callable
        an associative function f
    ddf : dd.DataFrame
    identity : pd.DataFrame
        an identity element of f, that is f(identity, df) = f(df, identity) = df
    zprefix_reduction-r  NrM  r   r   r  r$   r   ru   r   r   rT   r   rN   r   r   r   r  identityrm   r   r   r   r  r   Nrx   r*  r   rp   rp   rq   prefix_reduction  sF    
$
*"

0.$$r  c                 K  sB  t  }dt| ||f| }|j}t|jd }dg|d  }d}	|	|k rR|	d9 }	q@t|D ].}
t| |j|d |
 f|g|f|||
ddf< qZt||	D ]}
||||
ddf< qd}||	k r(td|	d| D ]T}
t| ||
d|  d |df||
| d |dfg|f|||
d|  d d| df< q|d9 }q||||	d |	df< |dkr|d }td|	d| D ]}
||
d|  d d| df|||
| d |df< t| ||
| d |df||
d|  d d| dfg|f|||
d|  d |df< q^q<t|D ].}
t| ||d |
 ddf|g|f|||
f< qtj	|||gd}t
||||S )a  Computes the suffix sums of f on df

    If df has partitions [P1, P2, ..., Pn], then returns the DataFrame with
    partitions [f(P1, f(P2, ...f(Pn, identity)...)),
                f(P2, ...f(Pn, identity)...),
                ...f(Pn, identity)...,
                ...]

    Parameters
    ----------
    f : callable
        an associative function f
    ddf : dd.DataFrame
    identity : pd.DataFrame
        an identity element of f, that is f(identity, df) = f(df, identity) = df
    kwargs : ??
        keyword arguments of f ??
    zsuffix_reduction-r  NrM  r   r   r  r  rp   rp   rq   suffix_reductionK  sF    
,
*"

0.$,r  c                 C  s
   |  |S r   )r8  )Z
base_chunkZ
concat_maprp   rp   rq   	mapseries  s    r  c                 C  s   |  }|  |}|S r   )Z
sort_indexr  r8  )r   Zconcat_resultZfinal_seriesrp   rp   rq   mapseries_combine  s    r  c                   s  | j }|j }i }t| |}d| }d| }t|  D ]B\ }t||f|| f< t|D ]t| ff||d f< q\q:t|}	d|	 }
d|	 t| D ]F\ }t||td f||
 f< t|D ]t|
 ff|d f< qqt| |}d| t|D ]J t|D ]:t|d ft	fddt|D ff| f< qq
d	| }t| j
 D ]4\ }t|t	 fd
dt|D ff|| f< qn|j }| jj
|_
t|}| || j
g}tj|||d}t| j}t||||S )Nzbase-split-zbase-shard-r   z
map-split-z
map-shard-zmap-series-c                   s   g | ]}d | fqS r8  rp   r  )r  map_shard_prefixrp   rq   ry     s     zseries_map.<locals>.<listcomp>zmap-series-combine-c                   s   g | ]} |fqS rp   rp   r  )rx   
map_prefixrp   rq   ry     s     r   )r   r$   r  r   r4  r   r   r1  r  r   r   r  r   r  rF   rN   r   r   r   r   )Zbase_seriesZ
map_seriesr   rU  r   Zbase_token_keyZbase_split_prefixZbase_shard_prefixr   Zmap_token_keyZmap_split_prefixr@  Zfinal_prefixr   r   r   r   rp   )rx   r  r  r  rq   r>    s^    










  
r>  c                 C  s*   |r|   dS | d|  tjS )NrH  )r  r{  rJ  r  r~   r`  )Zseriesr  rp   rp   rq   rW    s    rW  c              
   O  s   |dkrTt  > t jdtdd ttj| f|d|i|W  5 Q R  S Q R X |d |d  }}t	| }|s~t|S |j
|}|| }t||D ]\}	}
t|
||	< q|S )Nr  ry  z!invalid value encountered in cast)r  r  r  rR  rS  )r  rq  filterwarningsRuntimeWarningr   r  rQ   rO  r~   rY  r   r  r  )r>  r  rl   rm   rR  rS  rY  Ztime_col_maskZmatching_valsZtime_colZmatching_valrp   rp   rq   rX    s"    
2

rX  c                 C  s    t | st| std| dS )zV
    Utility function to raise an error if an object is not a Series or DataFrame
    zA`%s` is only supported with objects that are Dataframes or SeriesN)rE   rC   ri   r  rp   rp   rq   rU    s    rU  c              
   O  s.   t   tt| ||W  5 Q R  S Q R X d S r   )r.   r  rQ   )r  rl   rm   rp   rp   rq   r    s    r  )F)F)NNF)N)F)r"  )NFFF)F)F)rM  FFN)NF)T)F)N)TF)NF)N)N)N)NT)NT)NT)NTF)N)Nr  )N)
__future__r   r  r  collections.abcr   r   r   	functoolsr   r   Znumbersr   r	   r   pprintr   typingr   r   r   r   r   Znumpyr~   Zpandasr   r  r   r   r   r   Ztlzr   r   r   r   r   rj  r  r   r   r   r  r   r   Zdask.bagr   r?  Z	dask.baser    r!   r"   r#   r$   Zdask.blockwiser%   r&   r'   r(   Zdask.contextr)   r  r*   Zdask.dataframe._compatr+   r,   r-   r.   Zdask.dataframe.accessorr/   r0   r1   Zdask.dataframe.categoricalr2   r3   Zdask.dataframe.dispatchr4   r5   r6   r7   Zdask.dataframe.optimizer8   r  r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   Zdask.delayedrK   rL   rM   Zdask.highlevelgraphrN   Zdask.layersrO   rf  rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   Zdask.widgetsrc   rh   r   r  r~  Z
set_optionrs   r   r   r   r   r   r  r   r$  r#  r  r  and_r!  r  r  r   r  r  r  r  r   negor_r  r  r  r  xorr   Z_bind_operatorr   r  r]  r  rd  r%  r&  r  r0  r4  r8  r1  rY  rC  rZ  r  r  rN  rM  rP  rT  r  r%  ri  rh  rj  rk  r}  r  r  r  r  r  r  rl  r  r  r  r  rw  ry  rx  r  r  r  r  r  r  r  r  r  r  r   rI  r  r   r   r  rf  r  r  r  r  r>  rW  rX  rU  r  rp   rp   rp   rq   <module>   s  PT
 

!                        u	     l W            f
g2     

 B
	 $/
~
P
"
$


*
 
'
8%
1









 

G>@;