refactor, setup for restarts

3 years ago · a8cf18b0c2
parent ffa9b1d0af
commit a8cf18b0c2
1 changed files with 221 additions and 222 deletions
--- a/scrape.py
+++ b/scrape.py
@ -1,3 +1,4 @@
 from distutils.command.clean import clean
 from selenium import webdriver
 from selenium_stealth import stealth
 from bs4 import BeautifulSoup
@ -14,125 +15,199 @@ from secrets import tesla_login
 service_manual_index = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/index.html"
 base_url = "https://service.tesla.com/docs/Model3/ServiceManual/en-us/"
-
+visited_urls = []
 # Step 1: Set up the webdriver
 options = webdriver.ChromeOptions()
 # You can run this in headless mode, but it is not advised because Tesla might kick you out after ~250 pages or so
 # options.add_argument("--headless")
 # Drop the "enable-automation" switch and the automation extension from the Chrome launch options.
 options.add_experimental_option("excludeSwitches", ["enable-automation"])
 options.add_experimental_option('useAutomationExtension', False)
 driver = webdriver.Chrome(options=options)
 # Configure selenium_stealth on the driver with the browser profile values listed below.
 stealth(driver,
  languages=["en-US", "en"],
  vendor="Google Inc.",
  platform="Win32",
  webgl_vendor="Intel Inc.",
  renderer="Intel Iris OpenGL Engine",
  fix_hairline=True,
 )
 # Step 2: Login to Tesla
 # tesla_login comes from the local `secrets` module and returns the driver to keep using.
 driver = tesla_login(driver)
 # Step 3: Get to the index page
 driver.get(service_manual_index)
 # Fixed 10 second pause before reading the window handles (presumably to let the page load -- confirm).
 time.sleep(10)
 # Switch to the second window handle; this raises IndexError if only one window is open.
 # NOTE(review): presumably the manual opens in a new window after login -- confirm.
 window1 = driver.window_handles[1]
 driver.switch_to.window(window1)
 # Grab the rendered page as a single HTML string.
 source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
 # Patterns matching <script> elements that mention go.mpulse / googletag; they are removed before saving.
 mpulse_tracker = r'<script.+go.mpulse.+</script>'
 google_tracker = r'<script.+googletag.+</script>'
 os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
 with open('docs/index.html', 'w', encoding='utf-8') as f:
  source = re.sub(mpulse_tracker, '', source)
  source = re.sub(google_tracker, '', source)
  f.write(source)
 # Crawl state. The index page was just written, so it starts out as visited.
 visited_urls = ['index.html']
 # banned_urls: links found here are never queued by the link scan.
 banned_urls = []
 # upcoming_urls: queue of pages still to fetch.
 upcoming_urls = []
 # img_urls: image sources collected from fetched pages.
 img_urls = []
 # visited_img_urls: images that have already been downloaded.
 visited_img_urls = []
-soup = BeautifulSoup(source, 'html.parser')
+mpulse_tracker = r'<script.+go.mpulse.+</script>'
-for link in soup.find_all('a'):
+google_tracker = r'<script.+googletag.+</script>'
-  if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
+
-    if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
+class Webdriver:
-      upcoming_urls.append(link.get('href'))
+  def __init__(self):
-
+    options = webdriver.ChromeOptions()
-for img in soup.find_all('img'):
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
-  if img.get('src') not in img_urls:
+    options.add_experimental_option('useAutomationExtension', False)
-    img_urls.append(img.get('src'))
+    self.driver = webdriver.Chrome(options=options)
-
+    stealth(self.driver,
-# Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
+      languages=["en-US", "en"],
-headers = {
+      vendor="Google Inc.",
-"User-Agent":
+      platform="Win32",
-  "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
+      webgl_vendor="Intel Inc.",
-}
+      renderer="Intel Iris OpenGL Engine",
-s = requests.session()
+      fix_hairline=True,
-s.headers.update(headers)
+    )
-
+  def quit(self):
-for cookie in driver.get_cookies():
+    self.driver.quit()
-    c = {cookie['name']: cookie['value']}
+  def get_index(self):
-    s.cookies.update(c)
+    # Step 2: Login to Tesla
-
+    driver = tesla_login(self.driver)
-r = s.get(base_url + 'index.json', allow_redirects=True)
+
-open("docs/index.json", 'wb').write(r.content)
+    # Step 3: Get to the index page
-
+    driver.get(service_manual_index)
-os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
+    time.sleep(10)
-r = s.get(base_url + 'css/custom.css', allow_redirects=True)
+
-open("docs/css/custom.css", 'wb').write(r.content)
+    window1 = driver.window_handles[1]
-
+    driver.switch_to.window(window1)
-os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
+    source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
-r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
+    append_upcoming_and_img_urls(source)
-open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
+    os.makedirs(os.path.dirname('docs/index.html'), exist_ok=True)
-
+    with open('docs/index.html', 'w', encoding='utf-8') as f:
-r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
+      source = re.sub(mpulse_tracker, '', source)
-open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
+      source = re.sub(google_tracker, '', source)
-
+      f.write(source)
-r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
+    if 'index.html' not in visited_urls:
-open("docs/js/vendor/lunr.js", 'wb').write(r.content)
+      visited_urls.append('index.html')
-
+  def get_support_files(self):
-r = s.get(base_url + 'js/search.js', allow_redirects=True)
+    # Step 4: Get the index.json for search functionality (thanks to TheNexusAvenger!) and other assorted supporting files
-open("docs/js/search.js", 'wb').write(r.content)
+    headers = {
-
+    "User-Agent":
-os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
+      "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
-r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
+    }
-open("docs/img/spritemap.svg", 'wb').write(r.content)
+    s = requests.session()
-
+    s.headers.update(headers)
-os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
+
-r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
+    for cookie in self.driver.get_cookies():
-open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
+        c = {cookie['name']: cookie['value']}
-
+        s.cookies.update(c)
-r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
+
-open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
+    r = s.get(base_url + 'index.json', allow_redirects=True)
-
+    open("docs/index.json", 'wb').write(r.content)
-os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
+
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
+    os.makedirs(os.path.dirname('docs/css/custom.css'), exist_ok=True)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
+    r = s.get(base_url + 'css/custom.css', allow_redirects=True)
-
+    open("docs/css/custom.css", 'wb').write(r.content)
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
+
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
+    os.makedirs(os.path.dirname('docs/js/vendor/jquery-3.5.1.min.js'), exist_ok=True)
-
+    r = s.get(base_url + 'js/vendor/jquery-3.5.1.min.js', allow_redirects=True)
-r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
+    open("docs/js/vendor/jquery-3.5.1.min.js", 'wb').write(r.content)
-open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
+
-
+    r = s.get(base_url + 'js/vendor/jquery.magnific-popup.min.js', allow_redirects=True)
-# Step 5: Set up Python pickle to save session.  You can stop the script and run it again to resume where you left off.
+    open("docs/js/vendor/jquery.magnific-popup.min.js", 'wb').write(r.content)
-try:
+
-  pickle_in = open("dict.pickle","rb")
+    r = s.get(base_url + 'js/vendor/lunr.js', allow_redirects=True)
-  url_dict = pickle.load(pickle_in)
+    open("docs/js/vendor/lunr.js", 'wb').write(r.content)
-  visited_urls = url_dict['visited_urls']
+
-  banned_urls = url_dict['banned_urls']
+    r = s.get(base_url + 'js/search.js', allow_redirects=True)
-  upcoming_urls = url_dict['upcoming_urls']
+    open("docs/js/search.js", 'wb').write(r.content)
-  img_urls = url_dict['img_urls']
+
-except:
+    os.makedirs(os.path.dirname('docs/img/spritemap.svg'), exist_ok=True)
    r = s.get(base_url + 'img/spritemap.svg', allow_redirects=True)
    open("docs/img/spritemap.svg", 'wb').write(r.content)
    os.makedirs(os.path.dirname('docs/design-system/5.4.1/index.css'), exist_ok=True)
    r = s.get(base_url + 'design-system/5.4.1/index.css', allow_redirects=True)
    open("docs/design-system/5.4.1/index.css", 'wb').write(r.content)
    r = s.get(base_url + 'design-system/5.4.1/index.js', allow_redirects=True)
    open("docs/design-system/5.4.1/index.js", 'wb').write(r.content)
    os.makedirs(os.path.dirname('docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2'), exist_ok=True)
    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2', allow_redirects=True)
    open("docs/tds-fonts/3.x/woff2/GothamSSm-Bold_web.woff2", 'wb').write(r.content)
    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2', allow_redirects=True)
    open("docs/tds-fonts/3.x/woff2/GothamSSm-Book_web.woff2", 'wb').write(r.content)
    r = s.get(base_url + 'tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2', allow_redirects=True)
    open("docs/tds-fonts/3.x/woff2/GothamSSm-Medium_web.woff2", 'wb').write(r.content)
  def get_html(self):
    """Fetch every queued manual page with the browser and save it under docs/.

    Works through the module-level ``upcoming_urls`` queue until it is empty.
    Each page whose name starts with 'GUID' and ends with '.html' is loaded,
    stripped of the tracker scripts, written to ``docs/<url>`` and then scanned
    for further links and images via ``append_upcoming_and_img_urls``. Any other
    queue entry is dropped without being fetched. The session is saved whenever
    the number of visited pages is a multiple of 5.
    """
    # Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
    while upcoming_urls:
      # Iterate over a snapshot: the body removes entries from upcoming_urls and the
      # link scan appends new ones, and mutating a list while iterating it directly
      # makes the iterator skip elements. Newly queued pages are picked up by the
      # next pass of the enclosing while loop.
      for url in list(upcoming_urls):
        if len(visited_urls) % 5 == 0:
          save_session()

        if not (url.startswith('GUID') and url.endswith('.html')):
          upcoming_urls.remove(url)
          continue

        self.driver.get(base_url + url)
        source = self.driver.find_element_by_css_selector("html").get_attribute('outerHTML')

        # Strip the tracker scripts before touching the file so a failure here
        # cannot leave a truncated page on disk.
        source = re.sub(mpulse_tracker, '', source)
        source = re.sub(google_tracker, '', source)

        # TODO: Check if this is an error page, if yes, break out
        with open('docs/' + url, 'w', encoding='utf-8') as f:
          f.write(source)

        visited_urls.append(url)
        upcoming_urls.remove(url)
        print("visited: " + str(len(visited_urls)))
        print("upcoming: " + str(len(upcoming_urls)))
        print("images: " + str(len(set(img_urls))))

        append_upcoming_and_img_urls(source)
  def get_imgs(self):
    """Download every collected image that has not been downloaded yet.

    Iterates over the distinct entries of the module-level ``img_urls`` and
    skips those already listed in ``visited_img_urls``. Each remaining image is
    requested from ``base_url`` with a requests session that carries the
    browser's current cookies, then written to ``docs/<url>``. The session is
    saved whenever the running download counter is a multiple of 200.
    """
    number_of_images = len(set(img_urls))
    number_of_images_downloaded = len(set(visited_img_urls))

    # Step 9: Download images with direct requests
    headers = {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    }
    s = requests.session()
    s.headers.update(headers)

    for url in set(img_urls):
      if url in visited_img_urls:
        continue

      if number_of_images_downloaded % 200 == 0:
        save_session()

      # Copy the browser's cookies into the requests session before each download.
      for cookie in self.driver.get_cookies():
        s.cookies.update({cookie['name']: cookie['value']})

      r = s.get(base_url + url, allow_redirects=True)

      # Make sure the target directory exists (same pattern as the support files),
      # and close the file deterministically instead of leaking the handle.
      target = "docs/" + url
      os.makedirs(os.path.dirname(target), exist_ok=True)
      with open(target, 'wb') as image_file:
        image_file.write(r.content)

      visited_img_urls.append(url)
      print("images: " + str(number_of_images))
      print("downloaded: " + str(number_of_images_downloaded))
      number_of_images_downloaded += 1
 def append_upcoming_and_img_urls(source):
  """Queue new manual pages and record image sources found in an HTML page.

  Parses ``source`` (an HTML string) and appends to the module-level lists:
  every ``<a href>`` that starts with 'GUID', ends with '.html' and is not
  already in ``visited_urls``, ``banned_urls`` or ``upcoming_urls`` goes to
  ``upcoming_urls``; every ``<img src>`` not yet recorded goes to ``img_urls``.
  Tags without the attribute (or with an empty one) are ignored.
  """
  soup = BeautifulSoup(source, 'html.parser')

  for link in soup.find_all('a'):
    href = link.get('href')
    # Tag.get returns None when the anchor has no href; calling .startswith on
    # it would raise AttributeError and abort the crawl.
    if not href:
      continue
    if href in visited_urls or href in banned_urls or href in upcoming_urls:
      continue
    if href.startswith('GUID') and href.endswith('.html'):
      upcoming_urls.append(href)

  for img in soup.find_all('img'):
    src = img.get('src')
    # Skip images without a src so None never ends up in the download list.
    if src and src not in img_urls:
      img_urls.append(src)
 def check_source_validity(source):
  """Return False when ``source`` contains both marker strings, else True.

  The markers are the 4.x design-system stylesheet path and the generic
  'Tesla Service' title (presumably the signature of a non-manual or error
  page -- confirm against a real response).
  """
  has_old_stylesheet = 'design-system/4.x/index.css' in source
  has_generic_title = '<title>Tesla Service</title>' in source
  return not (has_old_stylesheet and has_generic_title)
 def new_session():
  """Load the crawl state from dict.pickle, or create that file if loading fails.

  On success the five module-level lists (visited_urls, banned_urls,
  upcoming_urls, img_urls, visited_img_urls) are rebound to the stored values.
  If the file is missing, unreadable, or lacks a key, the current values of
  those lists are written to dict.pickle instead. Lists that were already
  restored before the failure keep their loaded values.
  """
  global visited_urls, banned_urls, upcoming_urls, img_urls, visited_img_urls
  # Step 5: Set up Python pickle to save session.  You can stop the script and run it again to resume where you left off.
  try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # dict.pickle that this script wrote itself.
    with open("dict.pickle", "rb") as pickle_in:
      url_dict = pickle.load(pickle_in)
    visited_urls = url_dict['visited_urls']
    banned_urls = url_dict['banned_urls']
    upcoming_urls = url_dict['upcoming_urls']
    img_urls = url_dict['img_urls']
    visited_img_urls = url_dict['visited_img_urls']
    print("****** SESSION LOADED ******")
  except Exception:
    # Any load failure still falls back to a fresh session file, but Ctrl-C and
    # SystemExit are no longer swallowed as they were by the bare except.
    with open("dict.pickle", "wb") as pickle_out:
      pickle.dump({
        'visited_urls': visited_urls,
        'banned_urls': banned_urls,
        'upcoming_urls': upcoming_urls,
        'img_urls': img_urls,
        'visited_img_urls': visited_img_urls
      }, pickle_out)
    print("****** SESSION CREATED ******")
 def save_session():
  pickle_out = open("dict.pickle","wb")
  pickle.dump({
    'visited_urls': visited_urls,
@ -144,112 +219,36 @@ except:
  pickle_out.close()
  print("****** SESSION SAVED ******")
-# Step 6: Loop to get all the html pages, and store information about images to be downloaded later.
+def clean_img_urls():
-while upcoming_urls:
+  # Step 7: Clean image URLs
-  for url in upcoming_urls:
+  for url in img_urls:
-    if len(visited_urls) % 50 == 0:
+    if not isinstance(url, str):
-      pickle_out = open("dict.pickle","wb")
+      img_urls.remove(url)
-      pickle.dump({
+    elif not url.startswith('GUID'):
-        'visited_urls': visited_urls,
+      img_urls.remove(url)
-        'banned_urls': banned_urls,
+
-        'upcoming_urls': upcoming_urls,
+  # Step 8: Sanity check on image URLs
-        'img_urls': img_urls,
+  for url in img_urls:
-        'visited_img_urls': visited_img_urls
+    if url.endswith('jpg'):
      }, pickle_out)
      pickle_out.close()
      print("****** SESSION SAVED ******")
    if url.startswith('GUID') and url.endswith('.html'):
      driver.get(base_url + url)
    else:
      upcoming_urls.remove(url)
      continue
-    source = driver.find_element_by_css_selector("html").get_attribute('outerHTML')
+    elif url.endswith('png'):
-
+      continue
-    with open('docs/' + url, 'w', encoding='utf-8') as f:
+    elif url.endswith('gif'):
-      source = re.sub(mpulse_tracker, '', source)
+      continue
-      source = re.sub(google_tracker, '', source)
+    print(url)
-
+
-      f.write(source)
+def run():
-      visited_urls.append(url)
+  driver = Webdriver()
-      upcoming_urls.remove(url)
+  new_session()
-      print("visited: " + str(len(visited_urls)))
+  driver.get_index()
-      print("upcoming: " + str(len(upcoming_urls)))
+  save_session()
-      print("images: " + str(len(set(img_urls))))
+  driver.get_support_files()
-
+  save_session()
-    soup = BeautifulSoup(source, 'html.parser')
+  driver.get_html()
-    for link in soup.find_all('a'):
+  save_session()
-      if link.get('href') not in visited_urls and link.get('href') not in banned_urls and link.get('href') not in upcoming_urls:
+  clean_img_urls()
-        if link.get('href').startswith('GUID') and link.get('href').endswith('.html'):
+  driver.get_imgs()
-          upcoming_urls.append(link.get('href'))
+  time.sleep(15)
-
+  driver.driver.quit()
-    for img in soup.find_all('img'):
+
-      if img.get('src') not in img_urls:
+run()
        img_urls.append(img.get('src'))
 # Step 7: Save session after all html files collected
 # The with-block closes dict.pickle even if pickling raises part-way through.
 with open("dict.pickle", "wb") as pickle_out:
  pickle.dump({
    'visited_urls': visited_urls,
    'banned_urls': banned_urls,
    'upcoming_urls': upcoming_urls,
    'img_urls': img_urls,
    'visited_img_urls': visited_img_urls
  }, pickle_out)
 print("****** SESSION SAVED ******")
 # Step 8: Clean image URLs
 # Keep only string entries that start with 'GUID'. The list is rebuilt in place
 # (slice assignment keeps the same list object) because calling remove() while
 # iterating the list skips the entry that follows each removal.
 img_urls[:] = [url for url in img_urls if isinstance(url, str) and url.startswith('GUID')]
 # Step 9: Sanity check on image URLs
 # Print every remaining URL that does not end in one of the expected image suffixes.
 for url in img_urls:
  if not url.endswith(('jpg', 'png', 'gif')):
    print(url)
 number_of_images = len(set(img_urls))
 number_of_images_downloaded = 0
 # Step 10: Download images with direct requests
 headers = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
 }
 s = requests.session()
 s.headers.update(headers)
 for url in set(img_urls):
  # Images fetched in an earlier session are skipped.
  if url in visited_img_urls:
    continue
  # Save the session whenever the download counter is a multiple of 200.
  if number_of_images_downloaded % 200 == 0:
    with open("dict.pickle", "wb") as pickle_out:
      pickle.dump({
        'visited_urls': visited_urls,
        'banned_urls': banned_urls,
        'upcoming_urls': upcoming_urls,
        'img_urls': img_urls,
        'visited_img_urls': visited_img_urls
      }, pickle_out)
    print("****** SESSION SAVED ******")
  # Copy the browser's cookies into the requests session before each download.
  for cookie in driver.get_cookies():
    s.cookies.update({cookie['name']: cookie['value']})
  r = s.get(base_url + url, allow_redirects=True)
  # Close the image file deterministically instead of leaking the handle.
  with open("docs/" + url, 'wb') as image_file:
    image_file.write(r.content)
  visited_img_urls.append(url)
  print("images: " + str(number_of_images))
  print("downloaded: " + str(number_of_images_downloaded))
  number_of_images_downloaded += 1
 time.sleep(25)
 driver.quit()