Quick fixes, more detail to readme

pull/1/head
Anson Lai 3 years ago
parent cf3fcd3714
commit cb02932f73

@@ -17,9 +17,18 @@ This script will download the Tesla Service Manual onto a local doc folder for o
 * A full scrape of the Model 3 service manual **took over 30 minutes**. The script is set up so that you can stop it and continue later.
 * Keep an eye out: Tesla's website seems to log you out after about 250 pages or 20 minutes, so it is worth running this on the side while keeping an eye on your login status.
 * Total file size of the Model 3 service manual is roughly **2.2GB**.
-* There is minimal styling applied on the service manual. This script does not download those files. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab):
-    * css
-    * design-system
-    * img
-    * js
-        * *This one is useful, they use jQuery*
+* There is minimal styling applied on the service manual because this script does not download the site's static assets. If you want the full experience, you should download the following folders (seen in your browser's developer tools, under the Sources tab); the js/ folder is probably the most helpful. A download sketch follows this hunk.
+    * css/
+        * custom.css
+    * design-system/
+        * 5.4.1/
+            * index.css
+            * index.js
+    * img/
+        * spritemap.svg
+    * js/
+        * vendor/
+            * jquery.magnific-popup.min.js
+            * jquery-3.5.1.min.js
+        * lunr.js
+        * search.js
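
If you would rather script that asset download than save files through DevTools, something like the following works. This is a minimal sketch, not part of the commit: the `BASE_URL` is a placeholder, the asset paths are taken from the README list above, and an authenticated session may be required if the files sit behind the login wall.

```python
import os
import requests

BASE_URL = "https://example.com/service-manual/"  # placeholder, not the real URL

# Asset paths from the README list above.
ASSETS = [
    "css/custom.css",
    "design-system/5.4.1/index.css",
    "design-system/5.4.1/index.js",
    "img/spritemap.svg",
    "js/vendor/jquery.magnific-popup.min.js",
    "js/vendor/jquery-3.5.1.min.js",
    "js/lunr.js",
    "js/search.js",
]

for path in ASSETS:
    local = os.path.join("docs", path)
    os.makedirs(os.path.dirname(local), exist_ok=True)
    resp = requests.get(BASE_URL + path, timeout=30)
    resp.raise_for_status()
    with open(local, "wb") as f:
        f.write(resp.content)  # save the asset alongside the scraped pages
```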

@@ -3,6 +3,7 @@ from selenium_stealth import stealth
 from bs4 import BeautifulSoup
 import time
 import os
+import requests
 import pickle
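
The `pickle` import is what backs the stop-and-resume behavior mentioned in the README. A minimal sketch of that kind of checkpointing, assuming the `visited_urls` and `upcoming_urls` names that appear later in this diff and a hypothetical checkpoint filename:

```python
import os
import pickle

CHECKPOINT = "checkpoint.pickle"  # hypothetical filename, not from the commit

def load_progress():
    # Resume from an earlier run if a checkpoint file exists.
    if os.path.exists(CHECKPOINT):
        with open(CHECKPOINT, "rb") as f:
            return pickle.load(f)
    return set(), []  # fresh start: no visited URLs, empty queue

def save_progress(visited_urls, upcoming_urls):
    # Persist progress so the scrape can be stopped and continued later.
    with open(CHECKPOINT, "wb") as f:
        pickle.dump((visited_urls, upcoming_urls), f)

visited_urls, upcoming_urls = load_progress()
```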
@@ -41,6 +42,8 @@ driver.switch_to.window(window1)
 source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
 with open('docs/index.html', 'w', encoding='utf-8') as f:
     f.write(source)
@@ -54,8 +57,6 @@ soup = BeautifulSoup(source, 'html.parser')
 for link in soup.find_all('a'):
     if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
         if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-            # TODO: Remove this
-            pass
             upcoming_urls.append(link.get('href'))
 for img in soup.find_all('img'):
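
The `for img in soup.find_all('img')` loop is cut off by the diff view, but the newly added `requests` import suggests the images are fetched over plain HTTP. A sketch of what such a loop could look like, with a hypothetical `base_url`, a simplified `visited_urls`, and the assumption that image paths are relative:

```python
import os
import requests
from bs4 import BeautifulSoup

base_url = "https://example.com/service-manual/"  # hypothetical
visited_urls = set()  # simplified; the real script loads this via pickle

# Re-parse the page saved earlier in the script.
with open('docs/index.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for img in soup.find_all('img'):
    src = img.get('src')
    if not src or src in visited_urls:
        continue
    local_path = os.path.join('docs', src)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    resp = requests.get(base_url + src, timeout=30)
    if resp.ok:
        with open(local_path, 'wb') as f:
            f.write(resp.content)  # store the image next to the saved pages
    visited_urls.add(src)
```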
