# This is a version to reduce scraping time but might face IP blocking problem.
"""Resumable scraper for company details from dataforthai.com.

Reads ./dataforthai_test2.csv, fills in rows whose `name_en` column is '-',
and persists the last processed row index to ./last_index.txt so an
interrupted run can resume where it left off.

NOTE: aggressive scraping of this site risks IP blocking; randomized sleeps
between requests are used to mitigate that.
"""

import json
import random
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from parsel import Selector

CSV_PATH = './dataforthai_test2.csv'
INDEX_FILE = './last_index.txt'
# Session cookie required by the site to serve company pages.
COOKIES = {"PHPSESSID": "vgr9hc8bnc803cnhi7dnd7fkta"}


def _load_last_index(path):
    """Return the last processed row index, or None if no progress file exists."""
    try:
        with open(path, 'r') as fh:
            return int(fh.read().strip())
    except FileNotFoundError:
        return None


def _save_last_index(path, i):
    """Persist the index of the most recently processed row for resuming."""
    with open(path, 'w') as fh:
        fh.write(str(i))


def _text(xpath_result):
    """Return the stripped first text of an xpath result, or '' when absent.

    The original code called .get().strip() unguarded, which raised
    AttributeError whenever a selector matched nothing (e.g. a blocked or
    restructured page); this helper makes every field extraction safe.
    """
    value = xpath_result.get()
    return value.strip() if value is not None else ""


def _scrape_company(session, tax):
    """Fetch and parse one company page; return a dict of scraped fields.

    Raises RuntimeError when the page does not look like a company page
    (missing <h2> title), so a blocked response fails loudly instead of
    silently writing empty data over rows that still need scraping.
    """
    url = f'https://dataforthai.com/company/{tax}/'
    # NOTE: the original passed params=json.dumps({}) which appended a
    # garbage "{}" query string to every URL; the payload is empty, so no
    # params are sent at all.
    resp = session.get(url, cookies=COOKIES)
    selector = Selector(text=resp.text)

    name = selector.xpath('//*/div[@id="main"]//h2/text()').get()
    if name is None:
        raise RuntimeError(f'Unexpected page layout for {url} - possibly blocked.')

    body = selector.xpath('//*/div[@id="main"]/div[3]/table[1]/tr/td')[0]

    # Committee members come from two adjacent tables; concatenate them.
    committees = [c.strip() for c in body.xpath('//*/tr/td/table[8]/tr/td/text()').getall()]
    committees += [c.strip() for c in body.xpath('//*/tr/td/table[9]/tr/td/text()').getall()]

    return {
        'name_en': name.strip(),
        'business': _text(body.xpath('//*/table[2]/tr[1]/td[2]/text()[1]')),
        'status': _text(body.xpath('//*/table[3]/tr/td[2]/text()')),
        'date_register_thai': _text(body.xpath('//*/table[4]/tr/td[2]/text()')),
        'capital': _text(body.xpath('//*/table[5]/tr/td[2]/text()')),
        'address': _text(body.xpath('//*/table[6]/tr/td[2]/a/text()')),
        'list_committee': committees,
    }


def main():
    """Loop forever: scrape rows not yet processed, then idle and re-check."""
    # One session for the whole run so HTTP connections are reused
    # (the original rebuilt the session every outer iteration).
    session = requests.Session()
    while True:
        df = pd.read_csv(CSV_PATH, dtype={'tax': str, 'tsic': str})
        last_index = _load_last_index(INDEX_FILE)

        # iterrows() already yields the row index; with read_csv's default
        # RangeIndex it equals the positional index the original got from
        # enumerate().
        for i, row in df.iterrows():
            # Skip rows already handled in a previous (resumed) run.
            if last_index is not None and i <= last_index:
                continue
            # '-' in name_en marks a row that still needs scraping.
            if row['name_en'] != '-':
                continue

            fields = _scrape_company(session, row['tax'])
            for column, value in fields.items():
                df.at[i, column] = value

            # Rewrite the whole CSV after every row so a crash loses at most
            # the row in flight; then record progress for resuming.
            df.to_csv(CSV_PATH, encoding='utf-8', index=False)
            print(f'Process for row {i} is done.')
            _save_last_index(INDEX_FILE, i)

            # Randomized delay between requests to reduce the chance of
            # IP blocking.
            time.sleep(random.uniform(4, 8))

        # All rows handled (or skipped) this pass; idle before re-checking.
        time.sleep(8)


if __name__ == '__main__':
    main()