From a8cf18b0c2f12d9a04b7cf6bb23c89ab47595111 Mon Sep 17 00:00:00 2001
From: Anson Lai
Date: Tue, 24 May 2022 20:07:13 -0700
Subject: [PATCH] refactor, setup for restarts

---
 scrape.py | 446 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 224 insertions(+), 222 deletions(-)

diff --git a/scrape.py b/scrape.py
index 9cd6def..90f816f 100644
--- a/scrape.py
+++ b/scrape.py
@@ -14,125 +14,200 @@ from secrets import tesla_login
 service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
 base_url = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/"
-
-
-# Step 1: Set up the webdriver
-options = webdriver.ChromeOptions()
-# You can run this in headless mode, but it is not advised because Tesla might kick you out after ~250 pages or so
-# options.add_argument("--headless")
-
-options.add_experimental_option("excludeSwitches", ["enable-automation"])
-options.add_experimental_option('useAutomationExtension', False)
-driver = webdriver.Chrome(options=options)
-
-stealth(driver,
-        languages=["en-US", "en"],
-        vendor="Google Inc.",
-        platform="Win32",
-        webgl_vendor="Intel Inc.",
-        renderer="Intel Iris OpenGL Engine",
-        fix_hairline=True,
-)
-
-# Step 2: Login to Tesla
-driver = tesla_login(driver)
-
-# Step 3: Get to the index page
-driver.get(service_manual_index)
-time.sleep(10)
-
-window1 = driver.window_handles[1]
-driver.switch_to.window(window1)
-
-source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
-
-mpulse_tracker = r''
-google_tracker = r''
-
-os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
-with open('docs/index.html', 'w', encoding='utf-8') as f:
-    source = re.sub(mpulse_tracker, '', source)
-    source = re.sub(google_tracker, '', source)
-    f.write(source)
-
-visited_urls = ['index.html']
+visited_urls = []
 banned_urls = []
 upcoming_urls = []
 img_urls = []
 visited_img_urls = []
-soup = BeautifulSoup(source, 'html.parser')
-for link in soup.find_all('a'):
-    if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
-        if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-            upcoming_urls.append(link.get('href'))
-
-for img in soup.find_all('img'):
-    if img.get('src') not in img_urls:
-        img_urls.append(img.get('src'))
-
-# Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
-headers = {
-"User-Agent":
-    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
-}
-s = requests.session()
-s.headers.update(headers)
-
-for cookie in driver.get_cookies():
-    c = {cookie['name']: cookie['value']}
-    s.cookies.update(c)
-
-r = s.get(base_url + 'index.json', allow_redirects=True)
-open("docs/index.json", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
-r = s.get(base_url + 'css/custom.css', allow_redirects=True)
-open("docs/css/custom.css", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
-r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
-open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
-open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
-open("docs/js/vendor/lunr.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/search.js', allow_redirects=True)
-open("docs/js/search.js", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
-r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
-open("docs/img/spritemap.svg", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
-r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
-open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
-
-r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
-open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
-
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
-
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
-
-# Step 5: Set up Python pickle to save session. You can stop the script and run it again to resume where you left off.
-try:
-    pickle_in = open("dict.pickle","rb")
-    url_dict = pickle.load(pickle_in)
-    visited_urls = url_dict['visited_urls']
-    banned_urls = url_dict['banned_urls']
-    upcoming_urls = url_dict['upcoming_urls']
-    img_urls = url_dict['img_urls']
-except:
+mpulse_tracker = r''
+google_tracker = r''
+
+class Webdriver:
+    def __init__(self):
+        options = webdriver.ChromeOptions()
+        options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        options.add_experimental_option('useAutomationExtension', False)
+        self.driver = webdriver.Chrome(options=options)
+        stealth(self.driver,
+                languages=["en-US", "en"],
+                vendor="Google Inc.",
+                platform="Win32",
+                webgl_vendor="Intel Inc.",
+                renderer="Intel Iris OpenGL Engine",
+                fix_hairline=True,
+        )
+    def quit(self):
+        self.driver.quit()
+    def get_index(self):
+        # Step 2: Login to Tesla
+        driver = tesla_login(self.driver)
+
+        # Step 3: Get to the index page
+        driver.get(service_manual_index)
+        time.sleep(10)
+
+        window1 = driver.window_handles[1]
+        driver.switch_to.window(window1)
+        source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+        append_upcoming_and_img_urls(source)
+        os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
+        with open('docs/index.html', 'w', encoding='utf-8') as f:
+            source = re.sub(mpulse_tracker, '', source)
+            source = re.sub(google_tracker, '', source)
+            f.write(source)
+        if 'index.html' not in visited_urls:
+            visited_urls.append('index.html')
+    def get_support_files(self):
+        # Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
+        headers = {
+            "User-Agent":
+                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+        }
+        s = requests.session()
+        s.headers.update(headers)
+
+        for cookie in self.driver.get_cookies():
+            c = {cookie['name']: cookie['value']}
+            s.cookies.update(c)
+
+        r = s.get(base_url + 'index.json', allow_redirects=True)
+        open("docs/index.json", 'wb').write(r.content)
+
+        os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
+        r = s.get(base_url + 'css/custom.css', allow_redirects=True)
+        open("docs/css/custom.css", 'wb').write(r.content)
+
+        os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
+        r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
+        open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
+
+        r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
+        open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
+
+        r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
+        open("docs/js/vendor/lunr.js", 'wb').write(r.content)
+
+        r = s.get(base_url + 'js/search.js', allow_redirects=True)
+        open("docs/js/search.js", 'wb').write(r.content)
+
+        os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
+        r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
+        open("docs/img/spritemap.svg", 'wb').write(r.content)
+
+        os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
+        r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
+        open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
+
+        r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
+        open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
+
+        os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
+        r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
+        open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
+
+        r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
+        open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
+
+        r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
+        open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
+    def get_html(self):
+        # Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
+        while upcoming_urls:
+            # iterate over a snapshot: the remove() calls below would otherwise skip entries
+            for url in list(upcoming_urls):
+                if len(visited_urls) % 5 == 0:
+                    save_session()
+                if url.startswith('GUID') and url.endswith('.html'):
+                    self.driver.get(base_url + url)
+                else:
+                    upcoming_urls.remove(url)
+                    continue
+                source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+
+                with open('docs/' + url, 'w', encoding='utf-8') as f:
+                    source = re.sub(mpulse_tracker, '', source)
+                    source = re.sub(google_tracker, '', source)
+                    # TODO: Check if this is an error page, if yes, break out
+
+                    f.write(source)
+                visited_urls.append(url)
+                upcoming_urls.remove(url)
+                print("visited: " + str(len(visited_urls)))
+                print("upcoming: " + str(len(upcoming_urls)))
+                print("images: " + str(len(set(img_urls))))
+
+                append_upcoming_and_img_urls(source)
+
+    def get_imgs(self):
+        number_of_images = len(set(img_urls))
+        number_of_images_downloaded = len(set(visited_img_urls))
+        # Step 9: Download images with direct requests
+        headers = {
+            "User-Agent":
+                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+        }
+        s = requests.session()
+        s.headers.update(headers)
+        for url in set(img_urls):
+            if url not in visited_img_urls:
+                if number_of_images_downloaded % 200 == 0:
+                    save_session()
+
+                for cookie in self.driver.get_cookies():
+                    c = {cookie['name']: cookie['value']}
+                    s.cookies.update(c)
+
+                r = s.get(base_url + url, allow_redirects=True)
+                open("docs/" + url, 'wb').write(r.content)
+                visited_img_urls.append(url)
+
+                print("images: " + str(number_of_images))
+                print("downloaded: " + str(number_of_images_downloaded))
+                number_of_images_downloaded += 1
+
+def append_upcoming_and_img_urls(source):
+    soup = BeautifulSoup(source, 'html.parser')
+    for link in soup.find_all('a'):
+        if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
+            if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
+                upcoming_urls.append(link.get('href'))
+
+    for img in soup.find_all('img'):
+        if img.get('src') not in img_urls:
+            img_urls.append(img.get('src'))
+
+def check_source_validity(source):
+    if 'design-system/4.x/index.css' in source and 'Tesla Service' in source:
+        return False
+    else:
+        return True
+
+def new_session():
+    global visited_urls, banned_urls, upcoming_urls, img_urls, visited_img_urls
+    # Step 5: Set up Python pickle to save session. You can stop the script and run it again to resume where you left off.
+    try:
+        pickle_in = open("dict.pickle","rb")
+        url_dict = pickle.load(pickle_in)
+        visited_urls = url_dict['visited_urls']
+        banned_urls = url_dict['banned_urls']
+        upcoming_urls = url_dict['upcoming_urls']
+        img_urls = url_dict['img_urls']
+        visited_img_urls = url_dict['visited_img_urls']
+        print("****** SESSION LOADED ******")
+    except Exception:
+        pickle_out = open("dict.pickle","wb")
+        pickle.dump({
+            'visited_urls': visited_urls,
+            'banned_urls': banned_urls,
+            'upcoming_urls': upcoming_urls,
+            'img_urls': img_urls,
+            'visited_img_urls': visited_img_urls
+        }, pickle_out)
+        pickle_out.close()
+        print("****** SESSION CREATED ******")
+
+def save_session():
     pickle_out = open("dict.pickle","wb")
     pickle.dump({
         'visited_urls': visited_urls,
@@ -144,112 +219,39 @@ except:
     pickle_out.close()
     print("****** SESSION SAVED ******")
 
-# Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
-while upcoming_urls:
-    for url in upcoming_urls:
-        if len(visited_urls) % 50 == 0:
-            pickle_out = open("dict.pickle","wb")
-            pickle.dump({
-                'visited_urls': visited_urls,
-                'banned_urls': banned_urls,
-                'upcoming_urls': upcoming_urls,
-                'img_urls': img_urls,
-                'visited_img_urls': visited_img_urls
-            }, pickle_out)
-            pickle_out.close()
-            print("****** SESSION SAVED ******")
-        if url.startswith('GUID') and url.endswith('.html'):
-            driver.get(base_url + url)
-        else:
-            upcoming_urls.remove(url)
+def clean_img_urls():
+    # Step 7: Clean image URLs
+    # (iterate over a copy: remove() would otherwise skip the entry after each hit)
+    for url in img_urls[:]:
+        if not isinstance(url, str):
+            img_urls.remove(url)
+        elif not url.startswith('GUID'):
+            img_urls.remove(url)
+
+    # Step 8: Sanity check on image URLs
+    for url in img_urls:
+        if url.endswith('jpg'):
             continue
-        source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
-
-        with open('docs/' + url, 'w', encoding='utf-8') as f:
-            source = re.sub(mpulse_tracker, '', source)
-            source = re.sub(google_tracker, '', source)
-
-            f.write(source)
-        visited_urls.append(url)
-        upcoming_urls.remove(url)
-        print("visited: " + str(len(visited_urls)))
-        print("upcoming: " + str(len(upcoming_urls)))
-        print("images: " + str(len(set(img_urls))))
-
-        soup = BeautifulSoup(source, 'html.parser')
-        for link in soup.find_all('a'):
-            if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
-                if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-                    upcoming_urls.append(link.get('href'))
-
-        for img in soup.find_all('img'):
-            if img.get('src') not in img_urls:
-                img_urls.append(img.get('src'))
-
-# Step 7: Save session after all html files collected
-pickle_out = open("dict.pickle","wb")
-pickle.dump({
-    'visited_urls': visited_urls,
-    'banned_urls': banned_urls,
-    'upcoming_urls': upcoming_urls,
-    'img_urls': img_urls,
-    'visited_img_urls': visited_img_urls
-}, pickle_out)
-pickle_out.close()
-print("****** SESSION SAVED ******")
-
-# Step 8: Clean image URLs
-for url in img_urls:
-    if not isinstance(url, str):
-        img_urls.remove(url)
-    elif not url.startswith('GUID'):
-        img_urls.remove(url)
-
-# Step 9: Sanity check on image URLs
-for url in img_urls:
-    if url.endswith('jpg'):
-        continue
-    elif url.endswith('png'):
-        continue
-    elif url.endswith('gif'):
-        continue
-    print(url)
-
-number_of_images = len(set(img_urls))
-number_of_images_downloaded = 0
-
-# Step 10: Download images with direct requests
-headers = {
-"User-Agent":
-    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
Safari/537.36" -} -s = requests.session() -s.headers.update(headers) -for url in set(img_urls): - if url not in visited_img_urls: - if number_of_images_downloaded % 200 == 0: - pickle_out = open("dict.pickle","wb") - pickle.dump({ - 'visited_urls': visited_urls, - 'banned_urls': banned_urls, - 'upcoming_urls': upcoming_urls, - 'img_urls': img_urls, - 'visited_img_urls': visited_img_urls - }, pickle_out) - pickle_out.close() - print("****** SESSION SAVED ******") - - for cookie in driver.get_cookies(): - c = {cookie['name']: cookie['value']} - s.cookies.update(c) - - r = s.get(base_url + url, allow_redirects=True) - open("docs/" + url, 'wb').write(r.content) - visited_img_urls.append(url) - - print("images: " + str(number_of_images)) - print("downloaded: " + str(number_of_images_downloaded)) - number_of_images_downloaded += 1 - -time.sleep(25) -driver.quit() + elif url.endswith('png'): + continue + elif url.endswith('gif'): + continue + print(url) + +def run(): + driver = Webdriver() + new_session() + driver.get_index() + save_session() + driver.get_support_files() + save_session() + driver.get_html() + save_session() + clean_img_urls() + driver.get_imgs() + time.sleep(15) + driver.driver.quit() + +run()