Update readme

Merge branch 'master' of https://github.com/AnsonLai/TeslaServiceManualScraper
clean up comment convention
3 changed files with 232 additions and 229 deletions
--- a/README.md
+++ b/README.md
@ -14,6 +14,7 @@ This script will download the Tesla Service Manual onto a local doc folder for o
    2.  Run `pip install -r requirements.txt`
 7. Before scraping, it is always a good idea to use a VPN of some sort to avoid any issues with your account.  I didn't run into any issues personally, but you can never be too safe.  It is also worthwhile to open a new account to claim the manuals instead of using a personal account.
 8. Run `scrape.py` by typing `python scrape.py`
+9. The browser will automatically restart when it encounters problems with the files or login status.

 ## Viewing offline

@ -33,8 +34,3 @@ This script will download the Tesla Service Manual onto a local doc folder for o
 * Keep an eye out: Tesla's website seems to log you out after about 250 pages or 20 minutes of continuous refreshing, so it might be worthwhile to run this on the side while keeping an eye on your login status.
 * Total file size of the Model 3 service manual is roughly **2.2GB**.
 * On your first run, Tesla might throw a Captcha or lead to an error page.  Most of the time, just rerun and it'll work.
-
-## Roadmap
-
-1. Run a check to ensure no "no access" html files are downloaded.
-2. Restart the script every 200 files or so.
--- a/scrape.py
+++ b/scrape.py
@ -1,3 +1,4 @@
+from distutils.command.clean import clean
 from selenium import webdriver
 from selenium_stealth import stealth
 from bs4 import BeautifulSoup
@ -10,129 +11,211 @@ import pickle

 from secrets import tesla_login

-# Step 0: Indicate which manual you plan to scrape, currently set to Model 3.  Also increase the login delay to give yourself time to login if you have 2FA or encounter other login issues.
+# TODO: Indicate which manual you plan to scrape, currently set to Model 3.  Also increase the login delay in secrets.py to give yourself time to login if you have 2FA or encounter other login issues.
 service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
 base_url = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/"

-
-
-# Step 1: Set up the webdriver
-options = webdriver.ChromeOptions()
-# You can run this in headless mode, but it is not advised because Tesla might kick you out after ~250 pages or so
-# options.add_argument("--headless")
-
-options.add_experimental_option("excludeSwitches", ["enable-automation"])
-options.add_experimental_option('useAutomationExtension', False)
-driver = webdriver.Chrome(options=options)
-
-stealth(driver,
-  languages=["en-US", "en"],
-  vendor="Google Inc.",
-  platform="Win32",
-  webgl_vendor="Intel Inc.",
-  renderer="Intel Iris OpenGL Engine",
-  fix_hairline=True,
-)
-
-# Step 2: Login to Tesla
-driver = tesla_login(driver)
-
-# Step 3: Get to the index page
-driver.get(service_manual_index)
-time.sleep(10)
-
-window1 = driver.window_handles[1]
-driver.switch_to.window(window1)
-
-source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
-
-mpulse_tracker = r'<script.+go.mpulse.+</script>'
-google_tracker = r'<script.+googletag.+</script>'
-
-os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
-with open('docs/index.html', 'w', encoding='utf-8') as f:
-  source = re.sub(mpulse_tracker, '', source)
-  source = re.sub(google_tracker, '', source)
-  f.write(source)
-
-visited_urls = ['index.html']
+visited_urls = []
 banned_urls = []
 upcoming_urls = []
 img_urls = []
 visited_img_urls = []

-soup = BeautifulSoup(source, 'html.parser')
-for link in soup.find_all('a'):
-  if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
-    if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-      upcoming_urls.append(link.get('href'))
-
-for img in soup.find_all('img'):
-  if img.get('src') not in img_urls:
-    img_urls.append(img.get('src'))
-
-# Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
-headers = {
-"User-Agent":
-  "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
-}
-s = requests.session()
-s.headers.update(headers)
-
-for cookie in driver.get_cookies():
-    c = {cookie['name']: cookie['value']}
-    s.cookies.update(c)
-
-r = s.get(base_url + 'index.json', allow_redirects=True)
-open("docs/index.json", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
-r = s.get(base_url + 'css/custom.css', allow_redirects=True)
-open("docs/css/custom.css", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
-r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
-open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
-open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
-open("docs/js/vendor/lunr.js", 'wb').write(r.content)
-
-r = s.get(base_url + 'js/search.js', allow_redirects=True)
-open("docs/js/search.js", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
-r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
-open("docs/img/spritemap.svg", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
-r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
-open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
-
-r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
-open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
-
-os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
-
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
-
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
-
-# Step 5: Set up Python pickle to save session.  You can stop the script and run it again to resume where you left off.
-try:
-  pickle_in = open("dict.pickle","rb")
-  url_dict = pickle.load(pickle_in)
-  visited_urls = url_dict['visited_urls']
-  banned_urls = url_dict['banned_urls']
-  upcoming_urls = url_dict['upcoming_urls']
-  img_urls = url_dict['img_urls']
-except:
+mpulse_tracker = r'<script.+go.mpulse.+</script>'
+google_tracker = r'<script.+googletag.+</script>'
+
+class Webdriver:
+  def __init__(self):
+    options = webdriver.ChromeOptions()
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    options.add_experimental_option('useAutomationExtension', False)
+    self.driver = webdriver.Chrome(options=options)
+    stealth(self.driver,
+      languages=["en-US", "en"],
+      vendor="Google Inc.",
+      platform="Win32",
+      webgl_vendor="Intel Inc.",
+      renderer="Intel Iris OpenGL Engine",
+      fix_hairline=True,
+    )
+  def restart_scrape(self):
+    self.driver.quit()
+    run()
+  def get_index(self):
+    # Login to Tesla
+    driver = tesla_login(self.driver)
+
+    # Get to the index page
+    driver.get(service_manual_index)
+    time.sleep(10)
+
+    window1 = driver.window_handles[1]
+    driver.switch_to.window(window1)
+    source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+    append_upcoming_and_img_urls(source)
+    os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
+    with open('docs/index.html', 'w', encoding='utf-8') as f:
+      source = re.sub(mpulse_tracker, '', source)
+      source = re.sub(google_tracker, '', source)
+      f.write(source)
+    if 'index.html' not in visited_urls:
+      visited_urls.append('index.html')
+  def get_support_files(self):
+    # Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
+    headers = {
+    "User-Agent":
+      "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+    }
+    s = requests.session()
+    s.headers.update(headers)
+
+    for cookie in self.driver.get_cookies():
+        c = {cookie['name']: cookie['value']}
+        s.cookies.update(c)
+
+    r = s.get(base_url + 'index.json', allow_redirects=True)
+    open("docs/index.json", 'wb').write(r.content)
+
+    os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
+    r = s.get(base_url + 'css/custom.css', allow_redirects=True)
+    open("docs/css/custom.css", 'wb').write(r.content)
+
+    os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
+    r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
+    open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
+
+    r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
+    open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
+
+    r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
+    open("docs/js/vendor/lunr.js", 'wb').write(r.content)
+
+    r = s.get(base_url + 'js/search.js', allow_redirects=True)
+    open("docs/js/search.js", 'wb').write(r.content)
+
+    os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
+    r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
+    open("docs/img/spritemap.svg", 'wb').write(r.content)
+
+    os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
+    r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
+    open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
+
+    r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
+    open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
+
+    os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
+    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
+    open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
+
+    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
+    open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
+
+    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
+    open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
+  def get_html(self):
+    # Loop to get all the html pages, and store information about images to be downloaded later.
+    while upcoming_urls:
+      for url in upcoming_urls:
+        if len(visited_urls) % 50 == 0:
+          save_session()
+        if len(visited_urls) % 175 == 0:
+          self.restart_scrape()
+        if url.startswith('GUID') and url.endswith('.html'):
+          self.driver.get(base_url + url)
+        else:
+          upcoming_urls.remove(url)
+          continue
+
+        source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+        if not check_source_validity(source):
+          self.restart_scrape()
+
+        with open('docs/' + url, 'w', encoding='utf-8') as f:
+          source = re.sub(mpulse_tracker, '', source)
+          source = re.sub(google_tracker, '', source)
+          # TODO: Check if this is an error page, if yes, break out
+
+          f.write(source)
+          visited_urls.append(url)
+          upcoming_urls.remove(url)
+          print("visited: " + str(len(visited_urls)))
+          print("upcoming: " + str(len(upcoming_urls)))
+          print("images: " + str(len(set(img_urls))))
+
+        append_upcoming_and_img_urls(source)
+
+  def get_imgs(self):
+    # Download images with direct requests
+    number_of_images = len(set(img_urls))
+    number_of_images_downloaded = len(set(visited_img_urls))
+    
+    headers = {
+    "User-Agent":
+      "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+    }
+    s = requests.session()
+    s.headers.update(headers)
+    for url in set(img_urls):
+      if url not in visited_img_urls:
+        if number_of_images_downloaded % 200 == 0:
+          save_session()
+
+        for cookie in self.driver.get_cookies():
+            c = {cookie['name']: cookie['value']}
+            s.cookies.update(c)
+
+        r = s.get(base_url + url, allow_redirects=True)
+        open("docs/" + url, 'wb').write(r.content)
+        visited_img_urls.append(url)
+
+        print("images: " + str(number_of_images))
+        print("downloaded: " + str(number_of_images_downloaded))
+        number_of_images_downloaded += 1
+
+def append_upcoming_and_img_urls(source):
+  soup = BeautifulSoup(source, 'html.parser')
+  for link in soup.find_all('a'):
+    if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
+      if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
+        upcoming_urls.append(link.get('href'))
+
+  for img in soup.find_all('img'):
+    if img.get('src') not in img_urls:
+      img_urls.append(img.get('src'))
+
+def check_source_validity(source):
+  if 'design-system/4.x/index.css' in source and '<title>Tesla Service</title>' in source:
+    return False
+  else:
+    return True
+
+def new_session():
+  global visited_urls, banned_urls, upcoming_urls, img_urls, visited_img_urls
+  # Set up Python pickle to start/load a session.  You can stop the script and run it again to resume where you left off.
+  try:
+    pickle_in = open("dict.pickle","rb")
+    url_dict = pickle.load(pickle_in)
+    visited_urls = url_dict['visited_urls']
+    banned_urls = url_dict['banned_urls']
+    upcoming_urls = url_dict['upcoming_urls']
+    img_urls = url_dict['img_urls']
+    visited_img_urls = url_dict['visited_img_urls']
+    print("****** SESSION LOADED ******")
+  except:
+    pickle_out = open("dict.pickle","wb")
+    pickle.dump({
+      'visited_urls': visited_urls,
+      'banned_urls': banned_urls,
+      'upcoming_urls': upcoming_urls,
+      'img_urls': img_urls,
+      'visited_img_urls': visited_img_urls
+    }, pickle_out)
+    pickle_out.close()
+    print("****** SESSION CREATED ******")
+  
+def save_session():
+  # Use pickle to save session
  pickle_out = open("dict.pickle","wb")
  pickle.dump({
    'visited_urls': visited_urls,
@ -144,112 +227,36 @@ except:
  pickle_out.close()
  print("****** SESSION SAVED ******")

-# Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
-while upcoming_urls:
-  for url in upcoming_urls:
-    if len(visited_urls) % 50 == 0:
-      pickle_out = open("dict.pickle","wb")
-      pickle.dump({
-        'visited_urls': visited_urls,
-        'banned_urls': banned_urls,
-        'upcoming_urls': upcoming_urls,
-        'img_urls': img_urls,
-        'visited_img_urls': visited_img_urls
-      }, pickle_out)
-      pickle_out.close()
-      print("****** SESSION SAVED ******")
-    if url.startswith('GUID') and url.endswith('.html'):
-      driver.get(base_url + url)
-    else:
-      upcoming_urls.remove(url)
+def clean_img_urls():
+  # Clean image URLs
+  for url in img_urls:
+    if not isinstance(url, str):
+      img_urls.remove(url)
+    elif not url.startswith('GUID'):
+      img_urls.remove(url)
+
+  # Sanity check on image URLs
+  for url in img_urls:
+    if url.endswith('jpg'):
      continue
-    source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
-
-    with open('docs/' + url, 'w', encoding='utf-8') as f:
-      source = re.sub(mpulse_tracker, '', source)
-      source = re.sub(google_tracker, '', source)
-
-      f.write(source)
-      visited_urls.append(url)
-      upcoming_urls.remove(url)
-      print("visited: " + str(len(visited_urls)))
-      print("upcoming: " + str(len(upcoming_urls)))
-      print("images: " + str(len(set(img_urls))))
-
-    soup = BeautifulSoup(source, 'html.parser')
-    for link in soup.find_all('a'):
-      if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
-        if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
-          upcoming_urls.append(link.get('href'))
-
-    for img in soup.find_all('img'):
-      if img.get('src') not in img_urls:
-        img_urls.append(img.get('src'))
-
-# Step 7: Save session after all html files collected
-pickle_out = open("dict.pickle","wb")
-pickle.dump({
-  'visited_urls': visited_urls,
-  'banned_urls': banned_urls,
-  'upcoming_urls': upcoming_urls,
-  'img_urls': img_urls,
-  'visited_img_urls': visited_img_urls
-}, pickle_out)
-pickle_out.close()
-print("****** SESSION SAVED ******")
-
-# Step 8: Clean image URLs
-for url in img_urls:
-  if not isinstance(url, str):
-    img_urls.remove(url)
-  elif not url.startswith('GUID'):
-    img_urls.remove(url)
-
-# Step 9: Sanity check on image URLs
-for url in img_urls:
-  if url.endswith('jpg'):
-    continue
-  elif url.endswith('png'):
-    continue
-  elif url.endswith('gif'):
-    continue
-  print(url)
-
-number_of_images = len(set(img_urls))
-number_of_images_downloaded = 0
-
-# Step 10: Download images with direct requests
-headers = {
-"User-Agent":
-  "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
-}
-s = requests.session()
-s.headers.update(headers)
-for url in set(img_urls):
-  if url not in visited_img_urls:
-    if number_of_images_downloaded % 200 == 0:
-      pickle_out = open("dict.pickle","wb")
-      pickle.dump({
-        'visited_urls': visited_urls,
-        'banned_urls': banned_urls,
-        'upcoming_urls': upcoming_urls,
-        'img_urls': img_urls,
-        'visited_img_urls': visited_img_urls
-      }, pickle_out)
-      pickle_out.close()
-      print("****** SESSION SAVED ******")
-
-    for cookie in driver.get_cookies():
-        c = {cookie['name']: cookie['value']}
-        s.cookies.update(c)
-
-    r = s.get(base_url + url, allow_redirects=True)
-    open("docs/" + url, 'wb').write(r.content)
-    visited_img_urls.append(url)
-
-    print("images: " + str(number_of_images))
-    print("downloaded: " + str(number_of_images_downloaded))
-    number_of_images_downloaded += 1
-
-time.sleep(25)
-driver.quit()
+    elif url.endswith('png'):
+      continue
+    elif url.endswith('gif'):
+      continue
+    print(url)
+
+def run():
+  driver = Webdriver()
+  new_session()
+  driver.get_index()
+  save_session()
+  driver.get_support_files()
+  save_session()
+  driver.get_html()
+  save_session()
+  clean_img_urls()
+  driver.get_imgs()
+  time.sleep(15)
+  driver.driver.quit()
+
+run()
--- a/secrets.py
+++ b/secrets.py
@ -1,6 +1,6 @@
 import time

-# Step 0: Input your tesla account details
+# TODO: Input your tesla account details
 tesla_account_email = "YOUR TESLA EMAIL HERE"
 tesla_account_password = "YOUR TESLA PASSWORD HERE"
 login_delay = 0
Author	SHA1	Message	Date
Anson Lai	5ff48e6cd4	Update readme	2 years ago
Anson Lai	a752cce492	Merge branch 'master' of https://github.com/AnsonLai/TeslaServiceManualScraper	2 years ago
Anson Lai	bc1707921c	clean up comment convention	2 years ago
Anson Lai	0ba2a26234	Restart scrape periodically	2 years ago
Anson Lai	b6f543ace6	Restart scrape if invalid source	2 years ago
Anson Lai	a8cf18b0c2	refactor, setup for restarts	2 years ago