import csv
import random
import time
from datetime import datetime

import pandas as pd
import requests
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


# Read username/password pairs from a CSV file with "username" and "password" columns.
def read_credentials_from_csv(file_path):
    credentials = []
    with open(file_path, "r") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            credentials.append((row["username"], row["password"]))
    return credentials


# Read usernames and passwords from CSV
credentials_file = "credentials.csv"
users_credentials = read_credentials_from_csv(credentials_file)

# Loop through each set of credentials
for username, password in users_credentials:
    chromium_driver_path = "./chromedriver"
    # chrome_options = webdriver.ChromeOptions()
    # chrome_options.headless = True
    # Selenium 4 style; older versions passed executable_path= directly.
    driver = webdriver.Chrome(service=Service(chromium_driver_path))
    driver.get("https://www.dataforthai.com/login")

    # Find the username and password input fields and enter the credentials
    # (replace "username"/"password" with the actual name attributes if they differ).
    username_input = driver.find_element(By.NAME, 'username')
    password_input = driver.find_element(By.NAME, 'password')
    username_input.send_keys(username)
    password_input.send_keys(password)

    # Submit the login form (replace the XPath with the correct one for the login button).
    login_button = driver.find_element(By.XPATH, '//*[@id="btn-login"]')
    login_button.click()

    # After login, read the "PHPSESSID" session cookie so it can be reused with requests.
    phpsessid_cookie = driver.get_cookie("PHPSESSID")
    if phpsessid_cookie:
        phpsessid_value = phpsessid_cookie["value"]
        print(f"Value of 'PHPSESSID' cookie: {phpsessid_value}")
    else:
        print("PHPSESSID cookie not found.")

    time.sleep(1)
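    # Hypothetical guard (an addition, not in the original script): if login did not
    # yield a PHPSESSID cookie, the scraping below cannot authenticate, and
    # phpsessid_value would be undefined when building the cookies dict. Close the
    # browser and move on to the next credential pair instead.
    if not phpsessid_cookie:
        driver.quit()
        continue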
    # Scraping part
    try:
        df = pd.read_csv('./300_400.csv', dtype={'tax': str, 'tsic': str})

        # File used to persist the index of the last processed row between runs.
        index_file = './last_index_fix.txt'

        # Load the last index value if the file exists
        try:
            with open(index_file, 'r') as file:
                last_index = int(file.read().strip())
        except FileNotFoundError:
            last_index = None

        # Stop marker: 15 rows past the last processed index (None on a fresh run).
        break_parameter = last_index + 15 if last_index is not None else None

        # Payload and cookies for an authenticated session on dataforthai.com
        payload = {}
        cookies = {"PHPSESSID": phpsessid_value}

        # Walk the rows, skipping everything at or before the last saved index.
        for i, (_, row) in enumerate(df.iterrows()):
            if last_index is not None and i <= last_index:
                continue
            if row['name_en'] == '-':
                tax = row['tax']
                # tsic_code = row['tsic']
                # Prepend '0' because rows after 131072 lost the leading zero of the tax ID.
                url = f'https://dataforthai.com/company/0{tax}/'
                proxies = {
                    "http": "http://sbtmgbme-rotate:tujc456rgvgf@p.webshare.io:80/",
                    "https": "http://sbtmgbme-rotate:tujc456rgvgf@p.webshare.io:80/",
                }

                req = requests.get(url, cookies=cookies, proxies=proxies)
                selector = Selector(text=req.text)

                # Skip companies whose page shows the Thai "page not found" message.
                check_url = selector.xpath('//*[@id="main"]/div/div/h3/text()').get()
                if check_url == "ขออภัยค่ะ ไม่พบหน้าที่คุณต้องการ":  # "Sorry, the page you requested was not found."
                    continue

                created_at = datetime.now().isoformat()
                df.at[i, 'name_en'] = selector.xpath('//*/div[@id="main"]//h2/text()').get().strip()
                container_body = selector.xpath('//*/div[@id="main"]/div[3]/table[1]/tr/td')[0]
                df.at[i, 'business'] = container_body.xpath('//*/table[2]/tr[1]/td[2]/text()[1]').get().strip()
                df.at[i, 'status'] = container_body.xpath('//*/table[3]/tr/td[2]/text()').get().strip()
                df.at[i, 'date_register_thai'] = container_body.xpath('//*/table[4]/tr/td[2]/text()').get().strip()
                df.at[i, 'capital'] = container_body.xpath('//*/table[5]/tr/td[2]/text()').get().strip()
                address_element = container_body.xpath('//*/table[6]/tr/td[2]/a/text()')
                df.at[i, 'address'] = address_element.get().strip() if address_element else ""
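                # Note on the selectors above: table[2]..table[6] index the page's
                # tables by position, so any layout change on dataforthai.com will
                # silently shift these fields to the wrong columns. The committee
                # tables below (table[8]..table[11]) rely on the same positional
                # assumption.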
                # Collect committee names from the four positional committee tables.
                list_committee = [committee.strip() for committee in container_body.xpath('//*/tr/td/table[8]/tr/td/text()').getall()]
                list_committee_other = [committee.strip() for committee in container_body.xpath('//*/tr/td/table[9]/tr/td/text()').getall()]
                list_committee_other2 = [committee.strip() for committee in container_body.xpath('//*/tr/td/table[10]/tr/td/text()').getall()]
                list_committee_other3 = [committee.strip() for committee in container_body.xpath('//*/tr/td/table[11]/tr/td/text()').getall()]
                df.at[i, 'list_committee'] = list_committee + list_committee_other + list_committee_other2 + list_committee_other3

                df.to_csv('./300_400.csv', encoding='utf-8', index=False)
                print(f'Process for row {i} is done.')

                # Save the current index value to the file so a restart resumes here.
                with open(index_file, 'w') as file:
                    file.write(str(i))

                # Random delay between requests to avoid hammering the site.
                sleep_time = random.uniform(1, 3)
                time.sleep(sleep_time)

    except Exception as e:
        # Handle the error, report it, and move on to the next account.
        print(f"An error occurred: {e}")
        print("Got an error in the loop. Changing account.")
        time.sleep(1)
        continue
    finally:
        # Always close this credential's browser, even when an error forces a skip.
        driver.quit()

print("All IDs in the credentials file have been used. Exiting.")
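# A possible hardening step (sketch only, not part of the original flow): rotating
# proxies through p.webshare.io can fail transiently, and a single failed GET
# currently aborts the whole account via the broad except above. A small retry
# wrapper like the hypothetical fetch_with_retry below could absorb those blips.
#
# def fetch_with_retry(url, cookies, proxies, attempts=3):
#     for attempt in range(attempts):
#         try:
#             return requests.get(url, cookies=cookies, proxies=proxies, timeout=30)
#         except requests.RequestException:
#             time.sleep(2 ** attempt)  # simple exponential backoff
#     return None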