Data-scraping, PDF-parsing Python bot

Example quality report
Manually pulling the numbers out of reports like this is not a fun process.

So what’s the solution?

Bots. Two packages do the heavy lifting here: Selenium to drive the browser and download the reports, and tabula-py to pull the tables out of the PDFs (tabula-py wraps tabula-java, so a Java runtime is also required).
pip install tabula-py
pip install -U selenium
Selenium-controlled bot
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import pandas as pd
import tabula
import time
import os

# Configure Chrome to download PDFs into the reports folder instead of
# opening them in the built-in viewer
options = webdriver.ChromeOptions()
download_folder = "reports"
profile = {"plugins.plugins_list": [{"enabled": False,
                                     "name": "Chrome PDF Viewer"}],
           "download.default_directory": download_folder,
           "download.extensions_to_open": ""}
options.add_experimental_option("prefs", profile)

driver = webdriver.Chrome(chrome_options=options)
driver.get("http://cottonhost.com/96726/")
actions = ActionChains(driver)

# Log in
driver.find_element_by_name("LOGINBUTTON").click()
driver.find_element_by_name("PRODID").send_keys("login")
driver.find_element_by_name("PRODPASS").send_keys("password")
driver.find_element_by_name("LOGINBUTTON").click()
time.sleep(1)

# Filter to the report
Select(driver.find_element_by_xpath("/html/body/form/table/tbody/tr[3]/td[2]/select")).select_by_value("PRODLOADRPT")
Select(driver.find_element_by_xpath("/html/body/form/table/tbody/tr[4]/td[2]/select")).select_by_value("W2017")
driver.find_element_by_xpath("/html/body/form/input[1]").click()
time.sleep(1)

# Run the report, then grab the generated PDF out of the result iframe
driver.find_element_by_xpath("/html/body/form[@id='FILTERS']/input[10]").click()
driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
pdf = driver.find_element_by_xpath("/html/body/object[@type='application/pdf']")
print(pdf.get_attribute("data"))
# Navigating to the PDF's URL triggers the download into the reports folder
driver.get(pdf.get_attribute("data"))
time.sleep(4)
driver.close()
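The fixed time.sleep calls get the job done, but they are brittle. The WebDriverWait import above hints at the more robust option of waiting for a specific element to appear. A minimal sketch (the ten-second timeout is just an example):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the report iframe to appear instead of sleeping blindly
wait = WebDriverWait(driver, 10)
frame = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe")))
driver.switch_to.frame(frame)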

Selenium

These are the locator methods Selenium provides for finding elements on the page; a short usage sketch follows the list.

find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_tag_name
find_element_by_link_text
find_element_by_class_name
find_element_by_css_selector
find_element_by_partial_link_text
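All of these point at the same underlying find_element call, so the same field can usually be located several different ways. A quick sketch using the login field from the bot above (the XPath and CSS variants are equivalents written here for illustration):

# Three ways to grab the same input; pick whichever attribute the page exposes reliably
field = driver.find_element_by_name("PRODID")
field = driver.find_element_by_xpath("//input[@name='PRODID']")
field = driver.find_element_by_css_selector("input[name='PRODID']")

# Newer Selenium releases fold these helpers into a single find_element(By..., ...) call
from selenium.webdriver.common.by import By
field = driver.find_element(By.NAME, "PRODID")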

Tabula

With the PDFs downloaded, tabula-py extracts the table from each report into a CSV:

tabula.convert_into("report.pdf", "report.csv", output_format="csv",
                    area=(72, 42, 590, 755), guess=False,
                    options="--pages all --columns 35,55,71,89,115,159,180.2,200,271,322,379,487,520,593,618,647,671,687,710,732,773")
Table that tabula extracted
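Dialing in the area and columns values takes some trial and error. One way to iterate (a sketch, assuming the same report layout as above) is to read the PDF back into pandas DataFrames with tabula.read_pdf and eyeball the result before committing the boundaries to convert_into:

import tabula

# Read the same region into DataFrames instead of a CSV so the column
# boundaries can be checked interactively
tables = tabula.read_pdf("report.pdf", pages="all", multiple_tables=True,
                         area=(72, 42, 590, 755), guess=False,
                         columns=[35, 55, 71, 89, 115, 159, 180.2, 200, 271, 322, 379,
                                  487, 520, 593, 618, 647, 671, 687, 710, 732, 773])
print(tables[0].head())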
for filename in os.listdir('reports'):
    if filename.endswith(".PDF"):
        print(filename)
        filestring = str(filename)
        filecsv = filestring[:-4] + '.csv'
        tabula.convert_into(os.path.join(download_folder, filename), filecsv,
                            output_format="csv", area=(72, 42, 590, 755), guess=False,
                            options="--pages all --columns 35,55,71,89,115,159,180.2,200,271,322,379,487,520,593,618,647,671,687,710,732,773")
Data in database
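The loading step isn't shown in the code above; a minimal sketch, assuming a local SQLite file and a single destination table (the file and table names are made up), could look like this:

import os
import sqlite3
import pandas as pd

conn = sqlite3.connect("reports.db")  # hypothetical database file
for filename in os.listdir("."):
    if filename.endswith(".csv"):
        df = pd.read_csv(filename)
        # Append each report's rows to one table; the table name is an assumption
        df.to_sql("quality_reports", conn, if_exists="append", index=False)
conn.close()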

--

Working to solve the world's problems, starting with technology in agriculture.
