From 11a557f5823a39ddd68bb3d5bf42e901bd4a2fc4 Mon Sep 17 00:00:00 2001
From: Anson Lai <PharaohsVizier@gmail.com>
Date: Mon, 23 May 2022 14:14:48 -0700
Subject: [PATCH] Update to fetch index.json as well

---
 scrape.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/scrape.py b/scrape.py
index f933f09..1332422 100644
--- a/scrape.py
+++ b/scrape.py
@@ -11,6 +11,7 @@ from secrets import tesla_login
 
 # Step 0: Indicate which manual you plan to scrape, currently set to Model 3
 service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
+base_url = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/"
 
 # Step 1: Set up the webdriver
 options = webdriver.ChromeOptions()
@@ -98,7 +99,7 @@ while upcoming_urls:
       pickle_out.close()
       print("****** SESSION SAVED ******")
     if url.startswith('GUID') and url.endswith('.html'):
-      driver.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url)
+      driver.get(base_url + url)
     else:
       upcoming_urls.remove(url)
       continue
@@ -179,7 +180,7 @@ for url in set(img_urls):
         c = {cookie['name']: cookie['value']}
         s.cookies.update(c)
 
-    r = s.get('https://service.tesla.com/docs/Model3/ServiceManual/en-us/' + url, allow_redirects=True)
+    r = s.get(base_url + url, allow_redirects=True)
     open("docs/" + url, 'wb').write(r.content)
     visited_img_urls.append(url)
 
@@ -187,5 +188,20 @@ for url in set(img_urls):
     print("downloaded: " + str(number_of_images_downloaded))
     number_of_images_downloaded += 1
 
+# Step 10: Get the index.json for search functionality (thanks to TheNexusAvenger!)
+headers = {
+"User-Agent":
+  "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+}
+s = requests.session()
+s.headers.update(headers)
+
+for cookie in driver.get_cookies():  # copy the webdriver's cookies into the requests session
+    s.cookies.update({cookie['name']: cookie['value']})
+
+r = s.get(base_url + 'index.json', allow_redirects=True)
+with open("docs/index.json", 'wb') as index_file:  # close (and flush) the file deterministically
+    index_file.write(r.content)
+
 time.sleep(25)
 driver.quit()
\ No newline at end of file