Restart scrape if invalid source

pull/11/head
Anson Lai 3 years ago
parent a8cf18b0c2
commit b6f543ace6

@ -38,8 +38,9 @@ class Webdriver:
renderer="Intel Iris OpenGL Engine", renderer="Intel Iris OpenGL Engine",
fix_hairline=True, fix_hairline=True,
) )
def quit(self): def restart_scrape(self):
self.driver.quit() self.driver.quit()
run()
def get_index(self): def get_index(self):
# Step 2: Login to Tesla # Step 2: Login to Tesla
driver = tesla_login(self.driver) driver = tesla_login(self.driver)
@ -116,7 +117,7 @@ class Webdriver:
# Step 6: Loop to get all the html pages, and store information about images to be downloaded later. # Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
while upcoming_urls: while upcoming_urls:
for url in upcoming_urls: for url in upcoming_urls:
if len(visited_urls) % 5 == 0: if len(visited_urls) % 50 == 0:
save_session() save_session()
if url.startswith('GUID') and url.endswith('.html'): if url.startswith('GUID') and url.endswith('.html'):
self.driver.get(base_url + url) self.driver.get(base_url + url)
@ -125,6 +126,9 @@ class Webdriver:
continue continue
source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML') source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML')
if not check_source_validity(source):
self.restart_scrape()
with open('docs/' + url, 'w', encoding='utf-8') as f: with open('docs/' + url, 'w', encoding='utf-8') as f:
source = re.sub(mpulse_tracker, '', source) source = re.sub(mpulse_tracker, '', source)
source = re.sub(google_tracker, '', source) source = re.sub(google_tracker, '', source)

Loading…
Cancel
Save