U
    f/ebu                     @  s
  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	m
Z
mZ ddlZddlmZmZ ddlZddlmZmZ dd	lmZ dd
lmZ ddlm  m  mZ ddlmZ dddddZ ddddddZ!G dd dZ"G dd dZ#G dd deej$Z%dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)IOAnycast)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r   :/tmp/pip-unpacked-wheel-tiezk1ph/pandas/io/sas/sas7bdat.py_parse_datetime1   s    r   z	pd.Series)sas_datetimesr   returnc                 C  sJ   zt j| |ddW S  tk
rD   | jt|d}tt j|}| Y S X dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r   Zto_datetimer
   applyr   r   ZSeries)r   r   Zs_seriesr   r   r   _convert_datetimes?   s    r$   c                   @  sB   e Zd ZU ded< ded< ded< ded< dddddddZd	S )
_SubheaderPointerintoffsetlengthcompressionptyper'   r(   r)   r*   c                 C  s   || _ || _|| _|| _d S Nr+   )selfr'   r(   r)   r*   r   r   r   __init___   s    z_SubheaderPointer.__init__N__name__
__module____qualname____annotations__r.   r   r   r   r   r%   Y   s
   
r%   c                   @  sV   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
ddZdS )_Columnr&   col_idzstr | bytesnamelabelformatbytesctyper(   r5   r6   r7   r8   r:   r(   c                 C  s(   || _ || _|| _|| _|| _|| _d S r,   r;   )r-   r5   r6   r7   r8   r:   r(   r   r   r   r.   n   s    	z_Column.__init__Nr/   r   r   r   r   r4   f   s   
r4   c                   @  s  e Zd ZU dZded< ded< dTdd	Zd
dddZd
dddZd
dddZddddZ	ddddZ
dd ZdddddZddddddZdddd d!Zddd"d#Zd$dd%d&Zd'd( Zddd)d*Zd+dd,d-d.Zddd/d0d1d2Zdd+d3d4d5Zdd/dd6d7d8Zdddd9d:d;Zdddd9d<d=Zdddd9d>d?Zdddd9d@dAZdddd9dBdCZdddd9dDdEZdddd9dFdGZdddd9dHdIZdUdJdKdLdMdNZdOdP Z dQddRdSZ!dS )VSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r&   _int_lengthzbytes | None_cached_pageNTc	           	      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|ddd| _ttt | jj| _z|   |   W n tk
r   |    Y nX d S )Nzlatin-1    r   rbF)Zis_text)indexconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textdefault_encodingr)   column_names_stringscolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersr>   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handlesr   r   r   handle_path_or_buf_get_properties_parse_metadata	Exceptionclose)	r-   Zpath_or_bufrA   rB   rC   rD   rE   rF   rG   r   r   r   r.      s:    zSAS7BDATReader.__init__z
np.ndarray)r    c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrN   int64r-   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsrY   )r[   r\   rO   r]   r^   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1rY   )r[   r\   rP   rZ   r^   r   r   r   column_types   s    zSAS7BDATReader.column_typesNonec                 C  s   | j   d S r,   )rR   rX   r^   r   r   r   rX      s    zSAS7BDATReader.closec                 C  s  | j d tt| j d| _| jdttj tjkrBt	dd\}}| 
tjtj}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| 
tjtj}|tjkrtj}|| }| 
tjtj}|d	krd
| _nd| _| 
tjtjd }|tjkr tj| | _ nd| d| _ | 
tj!tj"}|dkrPd| _#n|dkrbd| _#nd| _#| 
tj$tj%}|&d| _'| j(r| j')| j*p| j+| _'| 
tj,tj-}|&d| _.| j(r| j.)| j*p| j+| _.t/ddd}| 0tj1| tj2}|t3j4|dd | _5| 0tj6| tj7}|t3j4|dd | _8| 9tj:| tj;| _<tt| j | j<d }|  j|7  _t| j| j<krt	d| 9tj=| tj>| _?| 9tj@| tjA| _B| 
tjC| tjD}|&d| _E| j(r| jE)| j*p| j+| _E| 
tjF| tjG}|&d| _H| j(r>| jH)| j*p8| j+| _H| 
tjI| tjJ}|&d| _K| j(r~| jK)| j*px| j+| _K| 
tjL| tjM}|&d}t|dkr|)| j*p| j+| _Nn@| 
tjO| tjP}|&d| _N| j(r| jN)| j*p| j+| _Nd S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2Zwindowsunknown     r   r   r   r"   z*The SAS7BDAT file appears to be truncated.)QrT   seekr   r9   readr>   lenconstmagicr   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64r=   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatformZdataset_offsetZdataset_lengthrstripr6   rG   decoderE   rH   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r-   Zalign1Zalign2bufZtotal_alignepochxr   r   r   rU      s    




        
 zSAS7BDATReader._get_propertiesc                 C  s*   | j | jpdd}|d kr&|   t|S )Nr   )nrows)rp   rD   rX   StopIteration)r-   dar   r   r   __next__g  s
    zSAS7BDATReader.__next__)r'   widthc                 C  sJ   |dkr|    td| ||}|dkr0dnd}t| j| |d S )N)re   rd   zinvalid float widthre   fr   r   rX   r   rt   structunpackrx   )r-   r'   r   r   fdr   r   r   r~   o  s    zSAS7BDATReader._read_float)r'   r   r    c                 C  sP   |dkr|    td| ||}ddddd| }t| j| |d }|S )N)r      re   rd   zinvalid int widthbhlqr   r   )r-   r'   r   r   itZivr   r   r   r   x  s    zSAS7BDATReader._read_int)r'   r(   c                 C  s   | j d krX| j| | j|}t||k rT|   d|dd|dd}t||S || t| j krz|   td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r>   rT   ro   rp   rq   rX   r   )r-   r'   r(   r   msgr   r   r   rt     s    
zSAS7BDATReader._read_bytesc                 C  sT   d}|sPt t| j| j| _t| jdkr.qPt| j| jkrFtd|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)	r   r9   rT   rp   r   r>   rq   r   _process_page_meta)r-   doner   r   r   rV     s    zSAS7BDATReader._parse_metadataboolc                 C  sZ   |    tjtjgtj }| j|kr,|   | jtj@ }| jtjk}t|pV|pV| j	g kS r,   )
_read_page_headerrr   page_meta_typeZpage_amd_typepage_mix_types_current_page_type_process_page_metadatapage_data_typer   rM   )r-   ptis_data_pageZis_mix_pager   r   r   r     s    
z!SAS7BDATReader._process_page_metac                 C  sX   | j }tj| }| |tj| _tj| }| |tj| _tj	| }| |tj
| _d S r,   )rv   rr   Zpage_type_offsetr   Zpage_type_lengthr   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r-   
bit_offsetZtxr   r   r   r     s    


 z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]Z}| tj| |}|jdkr2q|jtjkr@q| 	|j
}| ||j|j}| || qd S )Nr   )rv   ranger   _process_subheader_pointersrr   Zsubheader_pointers_offsetr(   r)   Ztruncated_subheader_id_read_subheader_signaturer'   _get_subheader_indexr*   _process_subheader)r-   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s"     
  z%SAS7BDATReader._process_page_metadatar9   )	signaturer    c                 C  s`   t j|}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|   t	d|S )Nr   r?   zUnknown subheader signature)
rr   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer)   SASIndexdata_subheader_indexrX   r   )r-   r   r)   r*   rA   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexr%   )r'   subheader_pointer_indexr    c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )rw   r   r=   r%   )
r-   r'   r   Zsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typer   r   r   r   r     s     

   z*SAS7BDATReader._process_subheader_pointers)r'   r    c                 C  s   |  || j}|S r,   )rt   r=   )r-   r'   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signature)r   r   r    c                 C  s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| j| d S td||| d S )Nzunknown subheader index)r'   r(   rr   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   rM   appendr   )r-   r   r   r'   r(   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheader)r'   r(   r    c                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r=   ru   r   rr   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r-   r'   r(   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r   
  s8    
    
z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r=   r   column_countr   r   print)r-   r'   r(   r   r   r   r   r   '  s    z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r,   r   r-   r'   r(   r   r   r   r   2  s    z(SAS7BDATReader._process_subheader_countsc           
      C  s  || j 7 }| |tj}| ||}|d| d}|}| jrR|| jpN| j	}| j
| t| j
dkrd}tjD ]}||krx|}qx|| _|| j 8 }|d }	| jr|	d7 }	| |	| j}|d}|dkrd| _|d }	| jr|	d7 }	| |	| j}|d| j | _n|tjkrR|d	 }	| jr2|	d7 }	| |	| j}|d| j | _nH| jdkrd| _|d }	| jr||	d7 }	| |	| j}|d| j | _| jrt| d
r| j| jp| j	| _d S )Nr   rn   r   r?      re           (   creator_proc)r=   r   rr   Ztext_block_size_lengthrt   r{   rG   r|   rE   rH   rI   r   rq   Zcompression_literalsr)   ru   r   r   r   Zrle_compressionhasattr)
r-   r'   r(   Ztext_block_sizer   Z	cname_rawcnameZcompression_literalZclZoffset1r   r   r   r   5  sZ    




z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }| j||
|
|   q*d S )Nr      rd   r   )r=   r   rr   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrI   rJ   r   )r-   r'   r(   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_strr   r   r   r   i  sB      
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   rd   r      d   s)r=   r   rr   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rO   r   Zcolumn_data_length_lengthrN   Zcolumn_type_lengthrP   )
r-   r'   r(   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r   r   r   r     s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r,   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sl  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }||||  }| j| }||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r=   rr   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminrq   rI   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthrL   r4   rJ   rP   rN   rK   r   )r-   r'   r(   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r     sR       


	z(SAS7BDATReader._process_format_subheaderz
int | NonezDataFrame | None)r   r    c                 C  s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF|   td| j| jkrVd S | j| j }||krn|}| jd}| jd}tj	||ft
d| _tj|d| ftjd| _d| _t| }|| |  }| jd k	r|| j}|S )Nr   zNo columns to parse from filer   r   rY   rd   )rD   r   rq   rP   rX   r	   rQ   countr[   emptyobject_string_chunkzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rp   _chunk_to_dataframerA   Z	set_index)r-   r   mZndnsprsltr   r   r   rp     s.    

zSAS7BDATReader.readc                 C  s   g | _ tt| j| j| _t| jdkr.dS t| j| jkrl|   dt| jdd| jdd}t	|| 
  | j}|tjkr|   |tj@ }tjgtj }|s| j|kr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rM   r   r9   rT   rp   r   r>   rq   rX   r   r   r   rr   r   r   r   r   _read_next_page)r-   r   Z	page_typer   r   r   r   r   r     s$    

zSAS7BDATReader._read_next_pager   c           
      C  s  | j }| j}t|| |}t|d}d\}}t| jD ]V}| j| }| j| dkr| j|d d f j| j	d d||< t
j|| t
jd||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q6| j| dkrr| j|d d f ||< | jr>| jd k	r>|| j| jp6| j||< | jrh|| j d	k}	t
j|j|	|f< |d7 }q6|   td
| j|  q6|S )N)rA   rc   r   r   rY   r   r   r   r   zunknown column type )r   rQ   r   r   r   rJ   rP   r   viewrx   r[   r\   Zfloat64rB   rK   rr   Zsas_date_formatsr$   Zsas_datetime_formatsr   rF   rE   r   r|   rH   rC   rq   nanlocrX   r   )
r-   nr   ixr   ZjsZjbjr6   iir   r   r   r     s:    

$


z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)"r0   r1   r2   __doc__r3   r.   r_   r`   ra   rX   rU   r   r~   r   rt   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rp   r   r   r   r   r   r   r<      sP   
       
0 		
4 1"r<   )&r   
__future__r   collectionsr   r   r   r   typingr   r   r   Znumpyr[   Zpandas.errorsr	   r
   Zpandasr   r   r   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsrr   Zpandas.io.sas.sasreaderr   r   r$   r%   r4   Iteratorr<   r   r   r   r   <module>   s$   