diff --git a/README.md b/README.md
index 34f1f0e..7b1002e 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,18 @@ This script will download the Tesla Service Manual onto a local doc folder for o
 * A full scrape of the Model 3 service manual **took over 30 minutes**. This script is set up so that you can stop the script, and then continue later on.
 * Keep an eye out, Tesla's website seems to boot you out of logged in status after about 250 pages or 20 minutes. So it might be worthwhile to run this on the side while keeping an eye on your login status.
 * Total file size of the Model 3 service manual is roughly **2.2GB**.
-* There is minimal styling applied on the service manual. This script does not download those files. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab):
-  * css
-  * design-system
-  * img
-  * js
-    * *This one is useful, they use jQuery*
\ No newline at end of file
+* There is minimal styling applied on the service manual. This script does not download those files. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab). The JS folder is probably the most helpful.
+  * css/
+    * custom.css
+  * design-system/
+    * 5.4.1/
+      * index.css
+      * index.js
+  * img/
+    * spritemap.svg
+  * js/
+    * vendor/
+      * jquery.magnific-popup.min.js
+      * jquery-3.5.1.min.js
+    * lunr.js
+    * search.js
\ No newline at end of file
diff --git a/scrape.py b/scrape.py
index 31874c5..f933f09 100644
--- a/scrape.py
+++ b/scrape.py
@@ -3,6 +3,7 @@ from selenium_stealth import stealth
 from bs4 import BeautifulSoup
 import time
+import os
 import requests
 import pickle
 
@@ -41,6 +42,8 @@ driver.switch_to.window(window1)
 
 source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+
+os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
 with open('docs/index.html', 'w', encoding='utf-8') as f:
     f.write(source)
 
@@ -54,8 +57,6 @@ soup = BeautifulSoup(source, 'html.parser')
 for link in soup.find_all('a'):
     if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
         if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-            # TODO: Remove this
-            pass
             upcoming_urls.append(link.get('href'))
 
 for img in soup.find_all('img'):