Fetch support files, and method to view search

pull/5/head
Anson Lai 2 years ago
parent 94503b389b
commit 9fc2a23100

@ -5,7 +5,7 @@ This script will download the Tesla Service Manual onto a local doc folder for o
## Setup
1. Go into `secrets.py` and fill out `tesla_account_email` and `tesla_account_password` with your account and password.
2. Go into `scrape.py` and enter the index URL of the manual you want saved. It is defaulted to the Model 3.
2. Go into `scrape.py` and enter the index URL of the manual you want saved by changing the `service_manual_index` and `base_url` variables. It defaults to the Model 3.
3. Setup Python 3. See tutorial at: <https://wiki.python.org/moin/BeginnersGuide/Download>
4. Setup selenium for Python. To use the required stealth module, you **must** use the Chromium webdriver. See tutorial at: <https://blog.testproject.io/2019/07/16/installing-selenium-webdriver-using-python-chrome/>
5. Pip install the required packages (including `requests`, `selenium`, `selenium-stealth`, and `beautifulsoup4`). On Windows, run the following commands in Command Prompt (CMD):
@ -13,24 +13,21 @@ This script will download the Tesla Service Manual onto a local doc folder for o
2. Run `pip install -r requirements.txt`
6. Run `scrape.py` by typing `python scrape.py`
## Viewing offline
### Option 1: Easy Way
1. Go into `docs/` folder and open up `index.html`. You're going to get 99% of the service manual just like that, but no search functionality.
### Option 2: HTTP Server (thanks to TheNexusAvenger)
1. Run CMD on Windows, and change the directory to the `docs` folder, for example: `cd C:\Users\Anson\Desktop\TeslaServiceManualScraper\docs`
2. Run the following command: `python -m http.server` (Python obviously needs to be installed)
3. Use a web browser and navigate to: `http://localhost:8000/` to see the full service manual including search.
## Tips
* A full scrape of the Model 3 service manual **took over 30 minutes**. This script is set up so that you can stop the script, and then continue later on.
* Keep an eye out, Tesla's website seems to boot you out of logged in status after about 250 pages or 20 minutes. So it might be worthwhile to run this on the side while keeping an eye on your login status.
* Keep an eye out: Tesla's website seems to log you out after about 250 pages or 20 minutes of continuous refreshing, so it might be worthwhile to run this on the side while keeping an eye on your login status.
* Total file size of the Model 3 service manual is roughly **2.2GB**.
* There is minimal styling applied on the service manual. This script does not download those files. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab). The JS folder is probably the most helpful.
* css/
* custom.css
* design-system/
* 5.4.1/
* index.css
* index.js
* img/
* spritemap.svg
* js/
* vendor/
* jquery.magnific-popup.min.js
* jquery-3.5.1.min.js
* lunr.js
* search.js
* This script can likely be modified for MacOS easily, but I'm not familiar with how to install Selenium and chromedriver on MacOS. **Windows only for now.**
* This script can likely be modified for MacOS easily, but I'm not familiar with how to install Selenium and chromedriver on MacOS. See issues below. **Windows only for now.**

@ -64,7 +64,60 @@ for img in soup.find_all('img'):
if img.get('src') not in img_urls:
img_urls.append(img.get('src'))
# Step 4: Set up Python pickle to save session. You can stop the script and run it again to resume where you left off.
# Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
}

# Copy the cookies held by the Selenium driver into a requests session so the
# direct downloads below are sent with the same cookies as the browser.
s = requests.session()
s.headers.update(headers)
for cookie in driver.get_cookies():
    c = {cookie['name']: cookie['value']}
    s.cookies.update(c)

# Each entry is a path relative to base_url on the server and to docs/ locally.
support_files = [
    'index.json',
    'css/custom.css',
    'js/vendor/jquery-3.5.1.min.js',
    'js/vendor/jquery.magnific-popup.min.js',
    'js/vendor/lunr.js',
    'js/search.js',
    'img/spritemap.svg',
    'design-system/5.4.1/index.css',
    'design-system/5.4.1/index.js',
    'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2',
    'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2',
    'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2',
]

for support_file in support_files:
    local_path = 'docs/' + support_file
    # Create the target folder (including docs/ itself) before writing.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    r = s.get(base_url + support_file, allow_redirects=True)
    if not r.ok:
        # Do not save an error page under the asset's name; report and move on.
        print("WARNING: could not download " + support_file + " (HTTP " + str(r.status_code) + ")")
        continue
    # The context manager closes the file as soon as the content is written.
    with open(local_path, 'wb') as support_out:
        support_out.write(r.content)
# Step 5: Set up Python pickle to save session. You can stop the script and run it again to resume where you left off.
try:
pickle_in = open("dict.pickle","rb")
url_dict = pickle.load(pickle_in)
@ -84,7 +137,7 @@ except:
pickle_out.close()
print("****** SESSION SAVED ******")
# Step 5: Loop to get all the html pages, and store information about images to be downloaded later.
# Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
while upcoming_urls:
for url in upcoming_urls:
if len(visited_urls) % 50 == 0:
@ -123,7 +176,7 @@ while upcoming_urls:
if img.get('src') not in img_urls:
img_urls.append(img.get('src'))
# Step 6: Save session after all html files collected
# Step 7: Save session after all html files collected
pickle_out = open("dict.pickle","wb")
pickle.dump({
'visited_urls': visited_urls,
@ -135,14 +188,14 @@ pickle.dump({
pickle_out.close()
print("****** SESSION SAVED ******")
# Step 7: Clean image URLs
# Step 8: Clean image URLs
# Keep only entries that are strings starting with 'GUID'. The list is rebuilt
# in a single pass and assigned in place, because calling remove() on a list
# while looping over it makes the loop skip the element after each removal,
# which left some invalid entries behind.
img_urls[:] = [
    url for url in img_urls
    if isinstance(url, str) and url.startswith('GUID')
]
# Step 8: Sanity check on image URLs
# Step 9: Sanity check on image URLs
for url in img_urls:
if url.endswith('jpg'):
continue
@ -155,7 +208,7 @@ for url in img_urls:
number_of_images = len(set(img_urls))
number_of_images_downloaded = 0
# Step 9: Download images with direct requests
# Step 10: Download images with direct requests
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
@ -188,20 +241,5 @@ for url in set(img_urls):
print("downloaded: " + str(number_of_images_downloaded))
number_of_images_downloaded += 1
# Step 10: Get the index.json for search functionality (thanks to TheNexusAvenger!)
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
}
# Copy the cookies held by the Selenium driver into the requests session.
s = requests.session()
s.headers.update(headers)
for cookie in driver.get_cookies():
    c = {cookie['name']: cookie['value']}
    s.cookies.update(c)
r = s.get(base_url + 'index.json', allow_redirects=True)
# Write through a context manager so the file is closed right after writing.
with open("docs/index.json", 'wb') as index_out:
    index_out.write(r.content)
time.sleep(25)
driver.quit()
Loading…
Cancel
Save