U
    /ea                  	   @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZ d dlmZmZmZ d dlZd dlZd dlmZmZ d dlmZmZ d d	lmZ d d
lmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0 ej1ej2ej3ej4fZ5de6d< zd dl7m8Z9 e5e9j:f7 Z5W n e;k
r\   Y nX dd Z<dd Z=dd Z>dZ?dZ@ededZAedddddZBeddd d!dZBd"d ZBed]d$d%ZCd&ZDd'd( ZEd^d)d*ZFd_d,d-ZGd`d.d/ZHeId+eJd0eKd1eLd2eMd3eMd3eNd3d3d4ZOd5d6 ZPd7d8 ZQdad9d:ZRd;d< ZSdbd=d>ZTdcd?d@ZUdAdBdCdDZVdddFdGZWdHdI ZXdedJdKZYdLdM ZZdfdNdOZ[dgdPdQZ\dRdS Z]dhdUdVZ^G dWdX dXe_e`ZadYdZ Zbd[d\ ZcdS )i    )annotationsN)IteratorMapping)contextmanager)Number)CallableTypeVaroverload)is_categorical_dtypeis_dtype_equal)get_scheduleris_dask_collection)get_deps)_dtypesmethods)PANDAS_GT_110PANDAS_GT_120tm)	make_metamake_meta_objmeta_nonempty)make_scalar)
asciitableis_dataframe_likeis_index_likeis_series_liketypenameztuple[type, ...]meta_object_typesc                 C  s:   t | d| }tjtjtjtjtjtjtjtj	f}t
||S )Ndtype)getattrpdZ	Int8DtypeZ
Int16DtypeZ
Int32DtypeZ
Int64DtypeZ
UInt8DtypeZUInt16DtypeZUInt32DtypeZUInt64Dtype
isinstancetr   types r%   8/tmp/pip-unpacked-wheel-dbjnr7gq/dask/dataframe/utils.pyis_integer_na_dtype/   s    
r'   c                 C  s*   t sdS t| d| }tjtjf}t||S )NFr   )r   r   r    ZFloat32DtypeZFloat64Dtyper!   r"   r%   r%   r&   is_float_na_dtype>   s    r(   c                 c  s   t |trt|}t|s"| V  nt|}|  } | j}t|rJ|	 }|
|}| jd|d  V  tt|d D ] }| j|| ||d   V  qx| j|d d V  dS )aL  Shard a DataFrame by ranges on its index

    Examples
    --------

    >>> df = pd.DataFrame({'a': [0, 10, 20, 30, 40], 'b': [5, 4 ,3, 2, 1]})
    >>> df
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1

    >>> shards = list(shard_df_on_index(df, [2, 4]))
    >>> shards[0]
        a  b
    0   0  5
    1  10  4

    >>> shards[1]
        a  b
    2  20  3
    3  30  2

    >>> shards[2]
        a  b
    4  40  1

    >>> list(shard_df_on_index(df, []))[0]  # empty case
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1
    Nr      )r!   r   listlennparray
sort_indexindexr
   Z
as_orderedZsearchsortedilocrange)df	divisionsr0   indicesir%   r%   r&   shard_df_on_indexJ   s    '


r7   z?meta : pd.DataFrame, pd.Series, dict, iterable, tuple, optionala  An empty ``pd.DataFrame`` or ``pd.Series`` that matches the dtypes and
column names of the output. This metadata is necessary for many algorithms
in dask dataframe to work.  For ease of use, some alternative inputs are
also available. Instead of a ``DataFrame``, a ``dict`` of ``{name: dtype}``
or iterable of ``(name, dtype)`` can be provided (note that the order of
the names should match the order of the columns). Instead of a series, a
tuple of ``(name, dtype)`` can be used. If not provided, dask will try to
infer the metadata. This may lead to unexpected results, so providing
``meta`` is recommended. For more information, see
``dask.dataframe.utils.make_meta``.
T)bound)funcreturnc                 C  s   d S Nr%   )r:   r%   r%   r&   insert_meta_param_description   s    r=   intzCallable[[T], T])padr;   c                 C  s   d S r<   r%   )r?   r%   r%   r&   r=      s    c                    s   | s fddS | d }d  dd }tjt||dd}d	td
|}|jrd|jkrn|jd||_nPd|dd  }t	
d|j\}}|
dd\}	}
d|||	|dd ||
|_|S )zReplace `$META` in docstring with param description.

    If pad keyword is provided, will pad description by that number of
    spaces (default is 8).c                   s   t | f S r<   )r=   )fkwargsr%   r&   <lambda>       z/insert_meta_param_description.<locals>.<lambda>r    r?      N   )initial_indentsubsequent_indentwidthz{}
{}
z$METAzParameters
%s----------   NzParameters\n[ ]*----------z

r)   z{}{}{}
{}{}

{})gettextwrapwrap_META_DESCRIPTIONformat_META_TYPESjoin__doc__replaceresplit)argsrB   r@   indentbodydescrZparameter_headerfirstlast
parametersrestr%   rA   r&   r=      s4       
   
  Fc              
   c  s   z
dV  W n t k
r } zft \}}}dt|}d}|rJ|d7 }|d7 }|| rfd|  dndt||}t||W 5 d}~X Y nX dS )zReraise errors in this block to show metadata inference failure.

    Parameters
    ----------
    funcname : str, optional
        If provided, will be added to the error message to indicate the
        name of the method that failed.
    N zMetadata inference failed{0}.

zYou have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

zOOriginal error is below:
------------------------
{1}

Traceback:
---------
{2}z in ``)		Exceptionsysexc_inforS   	traceback	format_tbrQ   repr
ValueError)funcnameZudfeexc_type	exc_valueexc_tracebacktbmsgr%   r%   r&   raise_on_meta_error   s    

"rp   Z__UNKNOWN_CATEGORIES__c                 C  sH   t | d| } t| r t| jjkS t| r<t| dr<t| jkS tddS )zwReturns whether the categories in `x` are known.

    Parameters
    ----------
    x : Series or CategoricalIndex
    _meta
categoriesz#Expected Series or CategoricalIndexN)r   r   UNKNOWN_CATEGORIEScatrr   r   hasattr	TypeErrorxr%   r%   r&   has_known_categories   s    
ry   c                 C  s   t | tjtjfr|  } t | tjr| jdk}| r|| j}|D ]>}t| | sF|rp| | j	j
tdd qF| | j	g | |< qFn*t | tjrt| jrt| s| j	g } t | jtjrt| js| jg | _nt | tjrt| s| g } | S )zReplace any unknown categoricals with empty categoricals.

    Useful for preventing ``UNKNOWN_CATEGORIES`` from leaking into results.
    categoryT)inplace)r!   r    Series	DataFramecopydtypesanyr0   ry   rt   Zremove_categoriesrs   set_categoriesr
   r   CategoricalIndex)rx   Zjust_drop_unknownZcat_maskZcatscr%   r%   r&   strip_unknown_categories   s*    


r   Tc                 C  s   t | tjtjfr|  } t | tjr|| jdk}|dkrD|| j}n|j|  sZt	d|D ]}| | j
tg| |< q^n$t | tjrt| jr| j
tg} |rt | jtjr| jtg| _nt | tjr| tg} | S )a  Set categories to be unknown.

    Parameters
    ----------
    x : DataFrame, Series, Index
    cols : iterable, optional
        If x is a DataFrame, set only categoricals in these columns to unknown.
        By default, all categorical columns are set to unknown categoricals
    index : bool, optional
        If True and x is a Series or DataFrame, set the clear known categories
        in the index as well.
    rz   Nz Not all columns are categoricals)r!   r    r|   r}   r~   r   r0   locallrh   rt   r   rs   r
   r   r   )rx   colsr0   maskr   r%   r%   r&   clear_known_categories
  s$    

r   c                 C  sZ   t |trH|dkrHtjttg| djd d }|d k	rDt||_|S tjg || |dS )Nrz   )namer   )r   r   r0   )	r!   strr    r|   ZCategoricalrs   r1   r   r0   )r   r   r0   sr%   r%   r&   _empty_series+  s     
r       z
1970-01-01r)   Zfoo)bVMmSaUOc                 C  sl   | j dkr| dS | j dkr.| tddS | j tkrZt| j  }| j dkrV|| S |S td|  d S )N)r6   r@   ur)   r   r   )r   r   zCan't handle dtype: )kindtypecomplex_simple_fake_mappingZastyperv   )r   or%   r%   r&   _scalar_from_dtype@  s    




r   c                 C  sb   t | tjkrt| S t| rFt| dr0| jntt | }t|S tdtt |  dd S )Nr   zCan't handle meta of type '')	r   r   Z_lookupr-   isscalarru   r   rv   r   )rx   r   r%   r%   r&   _nonempty_scalarL  s    
r   c                   sT  |rdddhnt    fddt|s8t|s8t|r@t|rTtdtt| | j|jkr~d	tt|tt| }nt|rt
j| j|jgdd	d
}fdd|d D }|rd	tt|tdddg|}nt||  | S nB| j|jr| S d	tt|tddgd| jfd|jfg}td|rDd| nd|f dS )a  Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to be
        more helpful to users.
    numeric_equal : bool, optionl
        If True, integer and floating dtypes compare equal. This is useful due
        to panda's implicit conversion of integer to floating upon encountering
        missingness, which is hard to infer statically.
    r6   r@   r   c                   s   t | t |krdS t| tr&| dks8t|tr<|dkr<dS t | rlt |rlt| jks`t|jkrddS | |kS | j kr|j kpt| |S )NF-T)r
   r!   r   rs   rr   r   r   r   r   )eq_typesr%   r&   equal_dtypesm  s    $z check_meta.<locals>.equal_dtypesz>Expected partition to be DataFrame, Series, or Index, got `%s`z,Expected partition of type `{}` but got `{}`r)   T)axissortc                   s*   g | ]"\}}} ||st |||fqS r%   )rg   ).0colr   r   )r   r%   r&   
<listcomp>  s   
zcheck_meta.<locals>.<listcomp>r   zPartition type: `{}`
{}ZColumnFoundZExpectedr`   r   zMetadata mismatch found%s.

%sz in `%s`N)setr   r   r   r   rv   r   r   	__class__rQ   r    concatr   Zfillna
itertuplesr   check_matching_columnsr   rh   )rx   metari   numeric_equalerrmsgr   Z
bad_dtypesr%   )r   r   r&   
check_metaW  sV    






r   c                 C  st   t t | jt |jspt|j| j}t| j|j}|sL|r^d| d| }nd}td| d S )Nz  Extra:   z
  Missing: zOrder of columns does not matchzSThe columns in the computed data do not match the columns in the provided metadata
)r-   Zarray_equalZ
nan_to_numcolumnsr   tolist
differencerh   )r   actualextramissingZ
extra_infor%   r%   r&   r     s    r   c                 C  sV   t | }|dkr| jj}|r>| d }| d }d| d| }nd}| d| d| S )	z&Summarized representation of an Index.Nr   r*   z, z to r`   z: z entries)r,   r   __name__)idxr   nheadtailsummaryr%   r%   r&   index_summary  s    r   c                 C  s  dd l m} t| dr|  }t|dr2|  |d krF| j|d}t| |jrdt|j	kslt
t||r| j|jkst
| jj|jkst
t|tjr|j| jjkst
|rt| | nt| |jrldt|j	kst
t|t| jt|kst
t| j|rD| j|jks0t
| j|jf| jj|jksDt
|rTt| | t| j|||jd nt| |jr$dt|j	kst
t|t| jtjst
t| jt| jt|kst
t| j|rt| j|j t| jj|j |rt| | t| j|||jd n^t| |jjrjt|sXt|tjtjfsXt
|rt| | nd	t|  d
}t
||S | S )Nr   __dask_graph__validate	schedulerIndexr|   )check_namescheck_dtypesresultr}   zUnsupported dask instance z found)dask.dataframeZ	dataframeru   r   r   Zcomputer!   r   r   r   AssertionErrorr   rq   r    
MultiIndexnamesassert_dask_dtypesr|   _check_daskr0   r}   r   r   assert_index_equalcoreZScalarr-   r   Z	TimestampZ	Timedelta)Zdskr   r   r   r   ddgraphro   r%   r%   r&   r     sp    
"

"
 

r   bool)check_indexc              
   C  s   z^t | rTt| jjt| j@ r>dd tt| jjD | j_| jt	| jd} n|  } W n t
ttfk
rx   Y nX |r|  S | S )Nc                 S  s   g | ]}d | qS )z-overlapped-index-name-%dr%   )r   r6   r%   r%   r&   r   
  s    z_maybe_sort.<locals>.<listcomp>)Zby)r   r   r0   r   r   r2   r,   Zsort_valuesr   r   rv   
IndexErrorrh   r/   )r   r   r%   r%   r&   _maybe_sort  s    
r   syncc                 K  s  |rtt | |d t ||d t| drtt|drttt| j d }	tt|j d }
|	|
kstt|	|
ft|  t| t	| |||d} t	||||d}t| dr| 
 } t|dr|
 }t| tjtjfr|rt| |} t||}|s| jdd} |jdd}t| tjr:tj| |f||d| nt| tjrdtj| |f||d| nft| tjrtj| |fd	|i| n>| |krdS t| rt|stnt| |stdS )
Nr   r4   r   )r   r   r   	to_pandasT)drop)r   check_dtypeexact)assert_divisionsru   r   r-   Zasarrayr4   r   r   assert_sane_keynamesr   r   r!   r    r}   r|   r   Zreset_indexr   Zassert_frame_equalZassert_series_equalr   r   isnanZallclose)r   r   r   r   Zcheck_divisionsr   Zsort_resultsr   rB   atZbtr%   r%   r&   	assert_eq  sv          



    
r   c                 C  s\   t | dr| j} t| tst| D ]&}t|tr8|d }||r" dS q"td| d S )Ndaskr   Tz(given dask graph doesn't contain label: )ru   r   r!   r   r   tuple
startswith)r   labelkr%   r%   r&   assert_dask_graphN  s    


r   c                 C  s   t | dsd S t| jtstt| dds.d S dd }t|t| gd}|| j| 	 }t
|d d D ]H\}}t|rh|| | j| kst|| | j|d  k shtqht|d r||d  | jd	 kst||d  | jd kstd S )
Nr4   Zknown_divisionsFc                 S  s:   t | r| S z| jdW S  tk
r4   | j Y S X d S )Nr   )r   r0   Zget_level_valuesAttributeErrorrw   r%   r%   r&   r0   c  s    zassert_divisions.<locals>.index)r   collectionsr*   r)   )ru   r!   r4   r   r   r   r   r   r   Z__dask_keys__	enumerater,   minmax)ddfr   r0   rM   resultsr6   r3   r%   r%   r&   r   Z  s    
 r   c                 C  s~   t | dsd S | j D ]`}t|tr0|d }qt|ttfsBtt|dk sRtd|ks^t|	dd 
 st|qd S )Nr   r   d   rE   r   )ru   r   keysr!   r   r   bytesr   r,   rW   isidentifier)r   r   r%   r%   r&   r   w  s    


r   c                   s   ddddhg |r"  dddh  fdd	}t|sxt|rxtj| jj|jgd
djddD ]\}}|||s^tq^nt|st	|st
|r| jj}|j}|||stnnt| jdr| jj}t|dst|sttt|}n|j}|||stnt| jt|kstdS )a  Check that the dask metadata matches the result.

    If `numeric_equal`, integer and floating dtypes compare equal. This is
    useful due to the implicit conversion of integer to floating upon
    encountering missingness, which is hard to infer statically.r   r   r   r   r6   r@   r   c                   s    t  fddD p kS )Nc                 3  s"   | ]} j |koj |kV  qd S r<   )r   )r   r   r   r%   r&   	<genexpr>  s    z8assert_dask_dtypes.<locals>.eq_dtypes.<locals>.<genexpr>)r   r   Zeq_type_setsr   r&   	eq_dtypes  s
    z%assert_dask_dtypes.<locals>.eq_dtypesr)   )r   F)r0   r   N)appendr   r   r    r   rq   r   r   r   r   r   r   ru   r-   r   r   )r   resr   r   r   r   r%   r   r&   r     s*    
r   c                 C  sL   t | j\}}|r.ttt| |ksHtnttt| |ksHtd S r<   )r   r   r   mapr,   valuesr   )rx   r   eqZdependenciesZ
dependentsr%   r%   r&   assert_max_deps  s    r   c                 C  s   t | ttfsdS t| dd D ]8\}}|| |d  kr@ dS t |tr"t|r" dS q"| dd D ]}t |trht|rh dS qh| d | d krdS dS )aR  Are the provided divisions valid?

    Examples
    --------
    >>> valid_divisions([1, 2, 3])
    True
    >>> valid_divisions([3, 2, 1])
    False
    >>> valid_divisions([1, 1, 1])
    False
    >>> valid_divisions([0, 1, 1])
    True
    >>> valid_divisions(123)
    False
    >>> valid_divisions([0, float('nan'), 1])
    False
    FNr   r)   r*   T)r!   r   r+   r   r   mathr   )r4   r6   rx   r%   r%   r&   valid_divisions  s    r   raisec                 C  s4   | j dd}tjj|s |g}|j|d|d |S )z)Use shallow copy to drop columns in placeF)deepT)r   r{   errors)r~   r    apir$   Zis_list_liker   )r3   r   r   Zdf2r%   r%   r&   drop_by_shallow_copy  s
    r  c                   @  s   e Zd ZdZdS )AttributeNotImplementedErrorz&NotImplementedError and AttributeErrorN)r   
__module____qualname__rT   r%   r%   r%   r&   r    s   r  c                 C  s~   t | r:z
| j} W n& tk
r8   tt|  dY nX t| rH| jS t| rV| jS t	| rh| 
 jS tt|  ddS )zReturn a serial DataFrame constructor

    Parameters
    ----------
    like :
        Any series-like, Index-like or dataframe-like object.
    z( not supported by meta_frame_constructorN)r   rq   r   rv   r   r   _constructorr   Z_constructor_expanddimr   to_frameliker%   r%   r&   meta_frame_constructor  s    

r
  c                 C  s~   t | r:z
| j} W n& tk
r8   tt|  dY nX t| rH| jS t| rV| jS t	| rh| 
 jS tt|  ddS )zReturn a serial Series constructor

    Parameters
    ----------
    like :
        Any series-like, Index-like or dataframe-like object.
    z) not supported by meta_series_constructorN)r   rq   r   rv   r   r   Z_constructor_slicedr   r  r   r  r  r%   r%   r&   meta_series_constructor  s    

r  )NF)F)NT)N)NT)N)TTNN)TTTTTr   )N)T)T)r   )d
__future__r   r   rV   rc   rN   re   collections.abcr   r   
contextlibr   Znumbersr   typingr   r   r	   Znumpyr-   Zpandasr    Zpandas.api.typesr
   r   Z	dask.baser   r   Z	dask.corer   r   r   r   Zdask.dataframe._compatr   r   r   Zdask.dataframe.dispatchr   r   r   Zdask.dataframe.extensionsr   Z
dask.utilsr   r   r   r   r   r|   r}   r   r   r   __annotations__Zscipy.sparsesparsespZspmatrixImportErrorr'   r(   r7   rR   rP   r8   r=   rp   rs   ry   r   r   r   Zbool_voidZ
datetime64Ztimedelta64Zstr_Zunicode_r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  NotImplementedErrorr   r  r
  r  r%   r%   r%   r&   <module>   s    8"

!


M

=      
9

&
%
	