# This is a version to reduce scraping time but might face IP blocking problem.
"""Resumable scraper for company details from dataforthai.com.

Reads ./dataforthai_test2.csv, fills in rows whose `name_en` column is '-',
and persists the last processed row index to ./last_index.txt so an
interrupted run can resume where it left off.

NOTE: aggressive scraping of this site risks IP blocking; randomized sleeps
between requests are used to mitigate that.
"""

import json
import random
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from parsel import Selector

CSV_PATH = './dataforthai_test2.csv'
INDEX_FILE = './last_index.txt'
# Session cookie required by the site to serve company pages.
COOKIES = {"PHPSESSID": "vgr9hc8bnc803cnhi7dnd7fkta"}


def _load_last_index(path):
    """Return the last processed row index, or None if no progress file exists."""
    try:
        with open(path, 'r') as fh:
            return int(fh.read().strip())
    except FileNotFoundError:
        return None


def _save_last_index(path, i):
    """Persist the index of the most recently processed row for resuming."""
    with open(path, 'w') as fh:
        fh.write(str(i))


def _text(xpath_result):
    """Return the stripped first text of an xpath result, or '' when absent.

    The original code called .get().strip() unguarded, which raised
    AttributeError whenever a selector matched nothing (e.g. a blocked or
    restructured page); this helper makes every field extraction safe.
    """
    value = xpath_result.get()
    return value.strip() if value is not None else ""


def _scrape_company(session, tax):
    """Fetch and parse one company page; return a dict of scraped fields.

    Raises RuntimeError when the page does not look like a company page
    (missing <h2> title), so a blocked response fails loudly instead of
    silently writing empty data over rows that still need scraping.
    """
    url = f'https://dataforthai.com/company/{tax}/'
    # NOTE: the original passed params=json.dumps({}) which appended a
    # garbage "{}" query string to every URL; the payload is empty, so no
    # params are sent at all.
    resp = session.get(url, cookies=COOKIES)
    selector = Selector(text=resp.text)

    name = selector.xpath('//*/div[@id="main"]//h2/text()').get()
    if name is None:
        raise RuntimeError(f'Unexpected page layout for {url} - possibly blocked.')

    body = selector.xpath('//*/div[@id="main"]/div[3]/table[1]/tr/td')[0]

    # Committee members come from two adjacent tables; concatenate them.
    committees = [c.strip() for c in body.xpath('//*/tr/td/table[8]/tr/td/text()').getall()]
    committees += [c.strip() for c in body.xpath('//*/tr/td/table[9]/tr/td/text()').getall()]

    return {
        'name_en': name.strip(),
        'business': _text(body.xpath('//*/table[2]/tr[1]/td[2]/text()[1]')),
        'status': _text(body.xpath('//*/table[3]/tr/td[2]/text()')),
        'date_register_thai': _text(body.xpath('//*/table[4]/tr/td[2]/text()')),
        'capital': _text(body.xpath('//*/table[5]/tr/td[2]/text()')),
        'address': _text(body.xpath('//*/table[6]/tr/td[2]/a/text()')),
        'list_committee': committees,
    }


def main():
    """Loop forever: scrape rows not yet processed, then idle and re-check."""
    # One session for the whole run so HTTP connections are reused
    # (the original rebuilt the session every outer iteration).
    session = requests.Session()
    while True:
        df = pd.read_csv(CSV_PATH, dtype={'tax': str, 'tsic': str})
        last_index = _load_last_index(INDEX_FILE)

        # iterrows() already yields the row index; with read_csv's default
        # RangeIndex it equals the positional index the original got from
        # enumerate().
        for i, row in df.iterrows():
            # Skip rows already handled in a previous (resumed) run.
            if last_index is not None and i <= last_index:
                continue
            # '-' in name_en marks a row that still needs scraping.
            if row['name_en'] != '-':
                continue

            fields = _scrape_company(session, row['tax'])
            for column, value in fields.items():
                df.at[i, column] = value

            # Rewrite the whole CSV after every row so a crash loses at most
            # the row in flight; then record progress for resuming.
            df.to_csv(CSV_PATH, encoding='utf-8', index=False)
            print(f'Process for row {i} is done.')
            _save_last_index(INDEX_FILE, i)

            # Randomized delay between requests to reduce the chance of
            # IP blocking.
            time.sleep(random.uniform(4, 8))

        # All rows handled (or skipped) this pass; idle before re-checking.
        time.sleep(8)


if __name__ == '__main__':
    main()