refactor, setup for restarts

pull/11/head
Anson Lai 3 years ago
parent ffa9b1d0af
commit a8cf18b0c2

@@ -1,3 +1,4 @@
from distutils.command.clean import clean
from selenium import webdriver
from selenium_stealth import stealth
from bs4 import BeautifulSoup

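The script also relies on a local secrets module exposing tesla_login() (visible in the next hunk's context line). Based only on how it is called later in this diff, driver = tesla_login(self.driver), the expected shape is roughly the stub below; the URL and body are assumptions, and the real module performs the actual credential handling.

# secrets.py -- hypothetical stub of the interface this script assumes.
# Inferred from `driver = tesla_login(self.driver)`; the real module completes the login.
def tesla_login(driver):
    # Navigate to the Tesla service site and finish the login flow with your own
    # credentials here, then hand the authenticated driver back.
    driver.get("https://service.tesla.com/")
    return driver
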
@@ -14,18 +15,22 @@ from secrets import tesla_login
service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
base_url = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/"

visited_urls = []
banned_urls = []
upcoming_urls = []
img_urls = []
visited_img_urls = []
mpulse_tracker = r'<script.+go.mpulse.+</script>'
google_tracker = r'<script.+googletag.+</script>'

# Step 1: Set up the webdriver
class Webdriver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        # You can run this in headless mode, but it is not advised because Tesla might kick you out after ~250 pages or so
        # options.add_argument("--headless")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(options=options)
        stealth(self.driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",

@@ -33,9 +38,11 @@ stealth(driver,
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )

    def quit(self):
        self.driver.quit()

    def get_index(self):
        # Step 2: Login to Tesla
        driver = tesla_login(self.driver)

        # Step 3: Get to the index page
        driver.get(service_manual_index)

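The excludeSwitches / useAutomationExtension options together with selenium_stealth are there so the site does not flag the Chrome session as automated. A quick way to sanity-check that setup (a throwaway sketch using the Webdriver class above, not part of this commit) is to ask the browser what it reports for navigator.webdriver:

# Hypothetical smoke test for the stealth configuration.
wd = Webdriver()
wd.driver.get("https://service.tesla.com/")
# With the stealth options applied this should print None/False rather than True.
print(wd.driver.execute_script("return navigator.webdriver"))
wd.quit()
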
@@ -43,34 +50,16 @@ time.sleep(10)
        window1 = driver.window_handles[1]
        driver.switch_to.window(window1)
        source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
        append_upcoming_and_img_urls(source)

        os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
        with open('docs/index.html', 'w', encoding='utf-8') as f:
            source = re.sub(mpulse_tracker, '', source)
            source = re.sub(google_tracker, '', source)
            f.write(source)

        if 'index.html' not in visited_urls:
            visited_urls.append('index.html')

    def get_support_files(self):
        # Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
        headers = {
            "User-Agent":

@@ -79,7 +68,7 @@ headers = {
        s = requests.session()
        s.headers.update(headers)

        for cookie in self.driver.get_cookies():
            c = {cookie['name']: cookie['value']}
            s.cookies.update(c)

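The loop above is what keeps the direct downloads authenticated: Selenium's logged-in cookies are copied into the requests session before index.json and the other support files are fetched. The same hand-off as a standalone helper (a sketch with a hypothetical name, not code from this commit):

import requests

def session_from_driver(driver, headers=None):
    # Hypothetical helper: build a requests.Session that reuses the Selenium cookies.
    s = requests.session()
    if headers:
        s.headers.update(headers)
    for cookie in driver.get_cookies():
        s.cookies.update({cookie['name']: cookie['value']})
    return s

# Usage, assuming `wd` is an instance of the Webdriver class above:
# s = session_from_driver(wd.driver)
# r = s.get(base_url + 'index.json', allow_redirects=True)
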
@@ -123,51 +112,23 @@ open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
        r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
        open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)

    def get_html(self):
        # Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
        while upcoming_urls:
            for url in upcoming_urls:
                if len(visited_urls) % 5 == 0:
                    save_session()
                if url.startswith('GUID') and url.endswith('.html'):
                    self.driver.get(base_url + url)
                else:
                    upcoming_urls.remove(url)
                    continue
                source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML')
                with open('docs/' + url, 'w', encoding='utf-8') as f:
                    source = re.sub(mpulse_tracker, '', source)
                    source = re.sub(google_tracker, '', source)
                    # TODO: Check if this is an error page, if yes, break out
                    f.write(source)
                visited_urls.append(url)

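One thing to be aware of in the loop above: it calls upcoming_urls.remove(url) while iterating over upcoming_urls, which can skip neighbouring entries; the surrounding while loop compensates by re-scanning until the list is empty. A more conventional work-queue shape (an alternative sketch, not what this commit does) drains the list explicitly:

from collections import deque

def drain(upcoming, visited, fetch):
    # Sketch of an explicit work queue; `fetch` stands in for the per-page body of get_html().
    queue = deque(upcoming)
    while queue:
        url = queue.popleft()
        if url in visited or not (url.startswith('GUID') and url.endswith('.html')):
            continue
        fetch(url)
        visited.append(url)

# Example with a stand-in fetch:
# drain(['GUID-1234.html', 'notes.pdf'], [], fetch=print)
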
@@ -176,6 +137,36 @@ while upcoming_urls:
                print("upcoming: " + str(len(upcoming_urls)))
                print("images: " + str(len(set(img_urls))))
                append_upcoming_and_img_urls(source)

    def get_imgs(self):
        number_of_images = len(set(img_urls))
        number_of_images_downloaded = len(set(visited_img_urls))

        # Step 9: Download images with direct requests
        headers = {
            "User-Agent":
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
        }
        s = requests.session()
        s.headers.update(headers)

        for url in set(img_urls):
            if url not in visited_img_urls:
                if number_of_images_downloaded % 200 == 0:
                    save_session()
                for cookie in self.driver.get_cookies():
                    c = {cookie['name']: cookie['value']}
                    s.cookies.update(c)
                r = s.get(base_url + url, allow_redirects=True)
                open("docs/" + url, 'wb').write(r.content)
                visited_img_urls.append(url)
                print("images: " + str(number_of_images))
                print("downloaded: " + str(number_of_images_downloaded))
                number_of_images_downloaded += 1


def append_upcoming_and_img_urls(source):
    soup = BeautifulSoup(source, 'html.parser')
    for link in soup.find_all('a'):
        if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:

@@ -186,7 +177,37 @@ while upcoming_urls:
        if img.get('src') not in img_urls:
            img_urls.append(img.get('src'))

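append_upcoming_and_img_urls() is the single place where new work is discovered: only relative GUID-*.html links are queued, everything else is ignored, and image sources are recorded once. A toy run against a small snippet (assumes the module globals defined at the top of this file with their initial empty values; not part of the commit):

sample = '''
<a href="GUID-1234ABCD.html">Brake bleed</a>
<a href="https://www.tesla.com/legal">Legal</a>
<img src="GUID-IMG-0001.png">
'''
append_upcoming_and_img_urls(sample)
print(upcoming_urls)   # expected: ['GUID-1234ABCD.html']
print(img_urls)        # expected: ['GUID-IMG-0001.png']
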
def check_source_validity(source):
    if 'design-system/4.x/index.css' in source and '<title>Tesla Service</title>' in source:
        return False
    else:
        return True

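check_source_validity() looks like the intended answer to the TODO in get_html(): a response that still carries the generic '<title>Tesla Service</title>' shell and the design-system stylesheet is not a real manual page, typically because the session was bounced back to the landing/login page. One possible wiring (an assumption about intent; the commit only defines the check) would be:

def save_page_if_valid(url, source):
    # Hypothetical wiring for check_source_validity(); uses the globals defined in this file.
    if not check_source_validity(source):
        save_session()
        raise RuntimeError("Got a non-manual page for " + url + "; log in again and rerun to resume from dict.pickle.")
    with open('docs/' + url, 'w', encoding='utf-8') as f:
        f.write(re.sub(google_tracker, '', re.sub(mpulse_tracker, '', source)))
    visited_urls.append(url)
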
def new_session():
    global visited_urls, banned_urls, upcoming_urls, img_urls, visited_img_urls
    # Step 5: Set up Python pickle to save session. You can stop the script and run it again to resume where you left off.
    try:
        pickle_in = open("dict.pickle","rb")
        url_dict = pickle.load(pickle_in)
        visited_urls = url_dict['visited_urls']
        banned_urls = url_dict['banned_urls']
        upcoming_urls = url_dict['upcoming_urls']
        img_urls = url_dict['img_urls']
        visited_img_urls = url_dict['visited_img_urls']
        print("****** SESSION LOADED ******")
    except:
        pickle_out = open("dict.pickle","wb")
        pickle.dump({
            'visited_urls': visited_urls,
            'banned_urls': banned_urls,
            'upcoming_urls': upcoming_urls,
            'img_urls': img_urls,
            'visited_img_urls': visited_img_urls
        }, pickle_out)
        pickle_out.close()
        print("****** SESSION CREATED ******")


def save_session():
    pickle_out = open("dict.pickle","wb")
    pickle.dump({
        'visited_urls': visited_urls,

@@ -198,14 +219,15 @@ pickle.dump({
    pickle_out.close()
    print("****** SESSION SAVED ******")

def clean_img_urls():
    # Step 7: Clean image URLs
    for url in img_urls:
        if not isinstance(url, str):
            img_urls.remove(url)
        elif not url.startswith('GUID'):
            img_urls.remove(url)

    # Step 8: Sanity check on image URLs
    for url in img_urls:
        if url.endswith('jpg'):
            continue

@@ -215,41 +237,18 @@ for url in img_urls:
            continue
        print(url)


def run():
    driver = Webdriver()
    new_session()
    driver.get_index()
    save_session()
    driver.get_support_files()
    save_session()
    driver.get_html()
    save_session()
    clean_img_urls()
    driver.get_imgs()
    time.sleep(15)
    driver.driver.quit()


run()

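A final caveat: clean_img_urls() removes items from img_urls while iterating over it, so two bad entries in a row can leave one behind after a single pass. A rebuild-style filter (an alternative sketch, not what the commit does) sidesteps that:

def clean_img_urls_alt(urls):
    # Keep only string URLs that look like GUID assets, without mutating the list mid-iteration.
    return [u for u in urls if isinstance(u, str) and u.startswith('GUID')]

# img_urls[:] = clean_img_urls_alt(img_urls)   # would replace the contents in place
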
