From c6df0fbbc5f87f4dae5401959f3c894ecb6439b8 Mon Sep 17 00:00:00 2001
From: Anson Lai
Date: Sun, 22 May 2022 12:46:06 -0700
Subject: [PATCH] Init, working files

---
 .gitignore | 166 ++++++++++++++++++++++++++++++++++++++++++++++
 README.md  |  23 +++++++
 scrape.py  | 190 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 secrets.py |  18 +++++
 4 files changed, 397 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 scrape.py
 create mode 100644 secrets.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0740d2d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,166 @@
+docs/
+chromedriver.exe
+dict.pickle
+
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+# and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file. For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..842e179
--- /dev/null
+++ b/README.md
@@ -0,0 +1,23 @@
+# Tesla Service Manual Scraper
+
+## Setup
+
+1. Open `secrets.py` and fill in `tesla_account_email` and `tesla_account_password` with your Tesla account credentials.
+2. Open `scrape.py` and set the index URL of the manual you want saved; it defaults to the Model 3 (see the sketch at the bottom of this README).
+3. Set up Selenium for Python. The required stealth module only works with the Chromium webdriver.
+4. Install the required packages with pip, including:
+   1. `selenium-stealth`
+   2. `beautifulsoup4`
+5. Run `scrape.py`.
+
+## Tips
+
+* A full scrape of the Model 3 service manual **took over 30 minutes**. The script saves its progress, so you can stop it and resume where you left off.
+* Tesla's website seems to log you out after about 250 pages or roughly 20 minutes, so it is worth running the script on the side while keeping an eye on your login status.
+* The total file size of the Model 3 service manual is roughly **2.2 GB**.
+* Only minimal styling is applied to the downloaded manual, and this script does not fetch the styling assets. For the full experience, download the following folders (visible in your browser's developer tools, under the Sources tab):
+  * css
+  * design-system
+  * img
+  * js
+    * *This one is useful; they use jQuery*
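+
+### Scraping a different manual
+
+The index URL in `scrape.py` drives the whole crawl. A minimal sketch of
+pointing it at another manual (the `ModelY` path segment below is a guess;
+copy the real index URL from your browser's address bar):
+
+```python
+# scrape.py -- hypothetical example; verify the path for your model
+service_manual_index = "https://service.tesla.com/docs/ModelY/ServiceManual/en-us/index.html"
+```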
\ No newline at end of file
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..31874c5
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,190 @@
+from selenium import webdriver
+from selenium_stealth import stealth
+from bs4 import BeautifulSoup
+import time
+
+import requests
+import pickle
+
+from secrets import tesla_login
+
+# Step 0: Indicate which manual you plan to scrape, currently set to Model 3
+service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
+
+# Step 1: Set up the webdriver
+options = webdriver.ChromeOptions()
+# Headless mode works but is not advised: Tesla may kick you out after ~250 pages,
+# and it helps to see when that happens.
+# options.add_argument("--headless")
+
+options.add_experimental_option("excludeSwitches", ["enable-automation"])
+options.add_experimental_option('useAutomationExtension', False)
+driver = webdriver.Chrome(options=options, executable_path=r"chromedriver.exe")
+
+stealth(driver,
+        languages=["en-US", "en"],
+        vendor="Google Inc.",
+        platform="Win32",
+        webgl_vendor="Intel Inc.",
+        renderer="Intel Iris OpenGL Engine",
+        fix_hairline=True,
+        )
+
+# Step 2: Login to Tesla
+driver = tesla_login(driver)
+
+# Step 3: Get to the index page (the manual opens in a second browser tab)
+driver.get(service_manual_index)
+time.sleep(10)
+
+window1 = driver.window_handles[1]
+driver.switch_to.window(window1)
+
+source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+
+with open('docs/index.html', 'w', encoding='utf-8') as f:
+    f.write(source)
+
+visited_urls = ['index.html']
+banned_urls = []
+upcoming_urls = []
+img_urls = []
+visited_img_urls = []
+
+# Queue every in-manual page (GUID*.html) linked from the index, and note its images
+soup = BeautifulSoup(source, 'html.parser')
+for link in soup.find_all('a'):
+    href = link.get('href')
+    if href and href not in visited_urls and href not in banned_urls and href not in upcoming_urls:
+        if href.startswith('GUID') and href.endswith('.html'):
+            upcoming_urls.append(href)
+
+for img in soup.find_all('img'):
+    if img.get('src') not in img_urls:
+        img_urls.append(img.get('src'))
+
+# Step 4: Persist the session with pickle, so you can stop the script and resume where you left off.
+def save_session():
+    with open("dict.pickle", "wb") as pickle_out:
+        pickle.dump({
+            'visited_urls': visited_urls,
+            'banned_urls': banned_urls,
+            'upcoming_urls': upcoming_urls,
+            'img_urls': img_urls,
+            'visited_img_urls': visited_img_urls
+        }, pickle_out)
+    print("****** SESSION SAVED ******")
+
+try:
+    with open("dict.pickle", "rb") as pickle_in:
+        url_dict = pickle.load(pickle_in)
+    visited_urls = url_dict['visited_urls']
+    banned_urls = url_dict['banned_urls']
+    upcoming_urls = url_dict['upcoming_urls']
+    img_urls = url_dict['img_urls']
+    visited_img_urls = url_dict['visited_img_urls']
+except (FileNotFoundError, EOFError):
+    save_session()
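+
+# For reference, a quick way to eyeball a saved session from a Python REPL
+# (a sketch; it only assumes dict.pickle exists in the working directory):
+#
+#     import pickle
+#     with open("dict.pickle", "rb") as f:
+#         session = pickle.load(f)
+#     print({key: len(urls) for key, urls in session.items()})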
+
+# Step 5: Loop to get all the html pages, and store information about images to be downloaded later.
+while upcoming_urls:
+    # Iterate over a snapshot, because the loop body mutates upcoming_urls
+    for url in list(upcoming_urls):
+        if len(visited_urls) % 50 == 0:
+            save_session()
+        if url.startswith('GUID') and url.endswith('.html'):
+            driver.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url)
+        else:
+            upcoming_urls.remove(url)
+            continue
+        source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+
+        with open('docs/' + url, 'w', encoding='utf-8') as f:
+            f.write(source)
+        visited_urls.append(url)
+        upcoming_urls.remove(url)
+        print("visited: " + str(len(visited_urls)))
+        print("upcoming: " + str(len(upcoming_urls)))
+        print("images: " + str(len(set(img_urls))))
+
+        # Queue any newly discovered pages and images
+        soup = BeautifulSoup(source, 'html.parser')
+        for link in soup.find_all('a'):
+            href = link.get('href')
+            if href and href not in visited_urls and href not in banned_urls and href not in upcoming_urls:
+                if href.startswith('GUID') and href.endswith('.html'):
+                    upcoming_urls.append(href)
+
+        for img in soup.find_all('img'):
+            if img.get('src') not in img_urls:
+                img_urls.append(img.get('src'))
+
+# Step 6: Save session after all html files collected
+save_session()
+
+# Step 7: Keep only string image URLs that belong to the manual
+img_urls = [url for url in img_urls if isinstance(url, str) and url.startswith('GUID')]
+
+# Step 8: Sanity check: print any image URL with an unexpected extension
+for url in img_urls:
+    if not url.endswith(('jpg', 'png', 'gif')):
+        print(url)
+
+number_of_images = len(set(img_urls))
+number_of_images_downloaded = 0
+
+# Step 9: Download images with direct requests, reusing the browser's login cookies
+headers = {
+    "User-Agent":
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+}
+s = requests.session()
+s.headers.update(headers)
+for url in set(img_urls):
+    if url not in visited_img_urls:
+        if number_of_images_downloaded % 200 == 0:
+            save_session()
+
+        for cookie in driver.get_cookies():
+            s.cookies.update({cookie['name']: cookie['value']})
+
+        r = s.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url, allow_redirects=True)
+        with open('docs/' + url, 'wb') as f:
+            f.write(r.content)
+        visited_img_urls.append(url)
+
+        number_of_images_downloaded += 1
+        print("images: " + str(number_of_images))
+        print("downloaded: " + str(number_of_images_downloaded))
+
+time.sleep(25)
+driver.quit()
\ No newline at end of file
diff --git a/secrets.py b/secrets.py
new file mode 100644
index 0000000..d357608
--- /dev/null
+++ b/secrets.py
@@ -0,0 +1,18 @@
+import time
+
+# Note: this file shadows Python's standard-library `secrets` module within
+# this project; consider renaming it if a dependency needs the stdlib version.
+
+# Step 0: Input your tesla account details
+tesla_account_email = "YOUR TESLA EMAIL HERE"
+tesla_account_password = "YOUR TESLA PASSWORD HERE"
+
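+# Optional sketch: pull the credentials from environment variables instead of
+# hard-coding them (TESLA_EMAIL and TESLA_PASSWORD are names of our choosing):
+#
+#     import os
+#     tesla_account_email = os.environ["TESLA_EMAIL"]
+#     tesla_account_password = os.environ["TESLA_PASSWORD"]
+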
+def tesla_login(driver):
+    driver.get("https://tesla.com/teslaaccount")
+    driver.find_element_by_css_selector("#form-input-identity").send_keys(tesla_account_email)
+    time.sleep(2)
+    driver.find_element_by_css_selector("#form-submit-continue").click()
+    time.sleep(2)
+    driver.find_element_by_css_selector("#form-input-credential").send_keys(tesla_account_password)
+    time.sleep(2)
+    driver.find_element_by_css_selector("#form-submit-continue").click()
+
+    return driver
\ No newline at end of file