Init, working files

Anson Lai, 3 years ago
commit c6df0fbbc5

.gitignore
@@ -0,0 +1,166 @@
docs/
chromedriver.exe
dict.pickle
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

README.md
@@ -0,0 +1,23 @@
# Tesla Service Manual Scraper
## Setup
1. Go into `secrets.py` and fill out `tesla_account_email` and `tesla_account_password` with your account email and password.
2. Go into `scrape.py` and enter the index URL of the manual you want saved. It defaults to the Model 3.
3. Set up Selenium for Python. To use the required stealth module, you must use the Chrome webdriver (`chromedriver.exe`). See the tutorial at: <https://blog.testproject.io/2019/07/16/installing-selenium-webdriver-using-python-chrome/>
4. Install the required packages with pip, including:
    1. `selenium-stealth`
    2. `beautifulsoup4`
5. Run `scrape.py`. (A minimal smoke test for the Selenium setup is sketched after this list.)
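
Before a full run, you can sanity-check your Selenium and `selenium-stealth` install with a minimal sketch like the one below. The driver options and stealth arguments mirror what `scrape.py` uses; the chromedriver path and target URL are assumptions to adjust for your setup:

```python
from selenium import webdriver
from selenium_stealth import stealth

# Mirror the options scrape.py uses; point executable_path at your own chromedriver
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r"chromedriver.exe")
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )
driver.get("https://tesla.com")
print(driver.title)  # if a page title prints, the driver is working
driver.quit()
```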
## Tips
* A full scrape of the Model 3 service manual **took over 30 minutes**. The script is set up so that you can stop it and resume later; a sketch for inspecting saved progress follows this list.
* Keep an eye out: Tesla's website seems to log you out after about 250 pages or 20 minutes, so it might be worthwhile to run this on the side while keeping an eye on your login status.
* The total file size of the Model 3 service manual is roughly **2.2GB**.
* There is minimal styling applied on the scraped manual, because this script does not download the styling assets. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab):
    * css
    * design-system
    * img
    * js
        * *This one is useful; they use jQuery.*
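
Since progress is checkpointed to `dict.pickle`, a short sketch like this (using the same dictionary keys that `scrape.py` saves) can show how far a stopped run got:

```python
import pickle

# Load the checkpoint that scrape.py writes periodically
with open("dict.pickle", "rb") as f:
    state = pickle.load(f)

print("visited pages:", len(state['visited_urls']))
print("queued pages:", len(state['upcoming_urls']))
print("images found:", len(set(state['img_urls'])))
print("images downloaded:", len(state['visited_img_urls']))
```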

scrape.py
@@ -0,0 +1,190 @@
from selenium import webdriver
from selenium_stealth import stealth
from bs4 import BeautifulSoup
import time
import requests
import pickle
from secrets import tesla_login
# Step 0: Indicate which manual you plan to scrape, currently set to Model 3
service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
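# Note (editor's assumption, unverified): other manuals appear to follow the same URL
# pattern, e.g. "https://service.tesla.com/docs/ModelY/ServiceManual/en-us/index.html".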
# Step 1: Set up the webdriver
options = webdriver.ChromeOptions()
# You can run this in headless mode, but it is not advised because Tesla might kick you out after ~250 pages or so
# options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r"chromedriver.exe")
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )
# Step 2: Log in to Tesla
driver = tesla_login(driver)
# Step 3: Get to the index page
driver.get(service_manual_index)
time.sleep(10)
window1 = driver.window_handles[1]
driver.switch_to.window(window1)
source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
with open('docs/index.html', 'w', encoding='utf-8') as f:
    f.write(source)
visited_urls = ['index.html']
banned_urls = []
upcoming_urls = []
img_urls = []
visited_img_urls = []
soup = BeautifulSoup(source, 'html.parser')
for link in soup.find_all('a'):
    href = link.get('href')
    if href not in visited_urls and href not in banned_urls and href not in upcoming_urls:
        # Only queue manual pages, which all follow the GUID*.html naming pattern
        if href and href.startswith('GUID') and href.endswith('.html'):
            upcoming_urls.append(href)
for img in soup.find_all('img'):
    if img.get('src') not in img_urls:
        img_urls.append(img.get('src'))
# Step 4: Set up Python pickle to save the session. You can stop the script and run it again to resume where you left off.
try:
    with open("dict.pickle", "rb") as pickle_in:
        url_dict = pickle.load(pickle_in)
    visited_urls = url_dict['visited_urls']
    banned_urls = url_dict['banned_urls']
    upcoming_urls = url_dict['upcoming_urls']
    img_urls = url_dict['img_urls']
    # Restore downloaded-image progress too, so images are not re-downloaded on resume
    visited_img_urls = url_dict['visited_img_urls']
except (FileNotFoundError, EOFError, KeyError, pickle.UnpicklingError):
    # No usable saved session yet; create one from the freshly scraped index
    with open("dict.pickle", "wb") as pickle_out:
        pickle.dump({
            'visited_urls': visited_urls,
            'banned_urls': banned_urls,
            'upcoming_urls': upcoming_urls,
            'img_urls': img_urls,
            'visited_img_urls': visited_img_urls
        }, pickle_out)
    print("****** SESSION SAVED ******")
# Step 5: Loop to get all the HTML pages, and record image URLs to be downloaded later.
while upcoming_urls:
    # Iterate over a copy: removing items from a list while iterating over it skips elements
    for url in list(upcoming_urls):
        # Checkpoint the session every 50 pages
        if len(visited_urls) % 50 == 0:
            with open("dict.pickle", "wb") as pickle_out:
                pickle.dump({
                    'visited_urls': visited_urls,
                    'banned_urls': banned_urls,
                    'upcoming_urls': upcoming_urls,
                    'img_urls': img_urls,
                    'visited_img_urls': visited_img_urls
                }, pickle_out)
            print("****** SESSION SAVED ******")
        if url.startswith('GUID') and url.endswith('.html'):
            driver.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url)
        else:
            upcoming_urls.remove(url)
            continue
        source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
        with open('docs/' + url, 'w', encoding='utf-8') as f:
            f.write(source)
        visited_urls.append(url)
        upcoming_urls.remove(url)
        print("visited: " + str(len(visited_urls)))
        print("upcoming: " + str(len(upcoming_urls)))
        print("images: " + str(len(set(img_urls))))
        soup = BeautifulSoup(source, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href not in visited_urls and href not in banned_urls and href not in upcoming_urls:
                if href and href.startswith('GUID') and href.endswith('.html'):
                    upcoming_urls.append(href)
        for img in soup.find_all('img'):
            if img.get('src') not in img_urls:
                img_urls.append(img.get('src'))
# Step 6: Save the session after all HTML files are collected
with open("dict.pickle", "wb") as pickle_out:
    pickle.dump({
        'visited_urls': visited_urls,
        'banned_urls': banned_urls,
        'upcoming_urls': upcoming_urls,
        'img_urls': img_urls,
        'visited_img_urls': visited_img_urls
    }, pickle_out)
print("****** SESSION SAVED ******")
# Step 7: Clean image URLs. Filter with a list comprehension: calling remove() on a list
# while iterating over it skips elements, so some bad entries would survive.
img_urls = [url for url in img_urls if isinstance(url, str) and url.startswith('GUID')]
# Step 8: Sanity check on image URLs: print any that do not end in jpg/png/gif
for url in img_urls:
    if url.endswith(('jpg', 'png', 'gif')):
        continue
    print(url)
number_of_images = len(set(img_urls))
number_of_images_downloaded = 0
# Step 9: Download images with direct requests
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
}
s = requests.session()
s.headers.update(headers)
for url in set(img_urls):
    if url not in visited_img_urls:
        # Checkpoint the session every 200 images
        if number_of_images_downloaded % 200 == 0:
            with open("dict.pickle", "wb") as pickle_out:
                pickle.dump({
                    'visited_urls': visited_urls,
                    'banned_urls': banned_urls,
                    'upcoming_urls': upcoming_urls,
                    'img_urls': img_urls,
                    'visited_img_urls': visited_img_urls
                }, pickle_out)
            print("****** SESSION SAVED ******")
        # Reuse the logged-in browser session's cookies so the direct request is authenticated
        for cookie in driver.get_cookies():
            s.cookies.update({cookie['name']: cookie['value']})
        r = s.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url, allow_redirects=True)
        with open("docs/" + url, 'wb') as f:
            f.write(r.content)
        visited_img_urls.append(url)
        print("images: " + str(number_of_images))
        print("downloaded: " + str(number_of_images_downloaded))
        number_of_images_downloaded += 1
        time.sleep(25)
driver.quit()

secrets.py
@@ -0,0 +1,18 @@
import time

# Step 0: Input your Tesla account details
tesla_account_email = "YOUR TESLA EMAIL HERE"
tesla_account_password = "YOUR TESLA PASSWORD HERE"

# Logs in to the Tesla account portal and returns the authenticated driver
def tesla_login(driver):
    driver.get("https://tesla.com/teslaaccount")
    driver.find_element_by_css_selector("#form-input-identity").send_keys(tesla_account_email)
    time.sleep(2)
    driver.find_element_by_css_selector("#form-submit-continue").click()
    time.sleep(2)
    driver.find_element_by_css_selector("#form-input-credential").send_keys(tesla_account_password)
    time.sleep(2)
    driver.find_element_by_css_selector("#form-submit-continue").click()
    return driver