From 94f38f052ef7a12039f13d37a438cc277a1f076c Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Fri, 5 May 2023 21:58:08 +0200
Subject: [PATCH] Fetcher - playwright/browserless - Use builtin node puppeteer
 handler in browserless, scales way better, and is faster (#1559)

---
 changedetectionio/content_fetcher.py          | 192 ++++++++++++++++--
 .../res/xpath_element_scraper.js              |   6 +-
 2 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 1b31aef3..2d8a66ab 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -10,6 +10,7 @@ import time
 
 visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
 
+
 class Non200ErrorCodeReceived(Exception):
     def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
         # Set this so we can use it in other parts of the app
@@ -24,10 +25,12 @@ class Non200ErrorCodeReceived(Exception):
             self.page_text = html_tools.html_to_text(page_html)
         return
 
+
 class checksumFromPreviousCheckWasTheSame(Exception):
     def __init__(self):
         return
 
+
 class JSActionExceptions(Exception):
     def __init__(self, status_code, url, screenshot, message=''):
         self.status_code = status_code
@@ -36,6 +39,7 @@ class JSActionExceptions(Exception):
         self.message = message
         return
 
+
 class BrowserStepsStepTimout(Exception):
     def __init__(self, step_n):
         self.step_n = step_n
@@ -51,6 +55,7 @@ class PageUnloadable(Exception):
         self.message = message
         return
 
+
 class EmptyReply(Exception):
     def __init__(self, status_code, url, screenshot=None):
         # Set this so we can use it in other parts of the app
@@ -59,6 +64,7 @@ class EmptyReply(Exception):
         self.screenshot = screenshot
         return
 
+
 class ScreenshotUnavailable(Exception):
     def __init__(self, status_code, url, page_html=None):
         # Set this so we can use it in other parts of the app
@@ -69,6 +75,7 @@ class ScreenshotUnavailable(Exception):
             self.page_text = html_to_text(page_html)
         return
 
+
 class ReplyWithContentButNoText(Exception):
     def __init__(self, status_code, url, screenshot=None):
         # Set this so we can use it in other parts of the app
@@ -77,13 +84,14 @@ class ReplyWithContentButNoText(Exception):
         self.screenshot = screenshot
         return
 
+
 class Fetcher():
     browser_steps = None
     browser_steps_screenshot_path = None
     content = None
     error = None
     fetcher_description = "No description"
-    headers = None
+    headers = {}
     status_code = None
     webdriver_js_execute_code = None
     xpath_data = None
@@ -105,7 +113,6 @@ class Fetcher():
         self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
         self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8')
 
-
     @abstractmethod
     def get_error(self):
         return self.error
@@ -152,13 +159,15 @@ class Fetcher():
             interface = steppable_browser_interface()
             interface.page = self.page
 
-            valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps)
+            valid_steps = filter(
+                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
+                self.browser_steps)
 
             for step in valid_steps:
                 step_n += 1
                 print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
-                self.screenshot_step("before-"+str(step_n))
-                self.save_step_html("before-"+str(step_n))
+                self.screenshot_step("before-" + str(step_n))
+                self.save_step_html("before-" + str(step_n))
                 try:
                     optional_value = step['optional_value']
                     selector = step['selector']
@@ -177,8 +186,6 @@ class Fetcher():
                     # Stop processing here
                     raise BrowserStepsStepTimout(step_n=step_n)
 
-
-
     # It's always good to reset these
     def delete_browser_steps_screenshots(self):
         import glob
@@ -188,6 +195,7 @@ class Fetcher():
             for f in files:
                 os.unlink(f)
 
+
 #   Maybe for the future, each fetcher provides its own diff output, could be used for text, image
 #   the current one would return javascript output (as we use JS to generate the diff)
 #
@@ -205,6 +213,7 @@ def available_fetchers():
 
     return p
 
+
 class base_html_playwright(Fetcher):
     fetcher_description = "Playwright {}/Javascript".format(
         os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@@ -278,6 +287,156 @@ class base_html_playwright(Fetcher):
             current_include_filters=None,
             is_binary=False):
 
+        # For now, fall back to the old Playwright code path when any browser step has an operation set
+        # @todo - need to figure out how to get browsersteps (with images on each step) working here
+        if self.browser_steps:
+            for step in self.browser_steps:
+                if step.get('operation'):
+                    return self.run_playwright(
+                                   url,
+                                   timeout,
+                                   request_headers,
+                                   request_body,
+                                   request_method,
+                                   ignore_status_codes,
+                                   current_include_filters,
+                                   is_binary)
+        elif os.getenv('FORCE_PLAYWRIGHT_FETCH'):
+            # Temporary backup solution until we rewrite the playwright code
+            return self.run_playwright(
+                url,
+                timeout,
+                request_headers,
+                request_body,
+                request_method,
+                ignore_status_codes,
+                current_include_filters,
+                is_binary)
+
+
+        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
+        xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
+
+        code = f"""module.exports = async ({{ page, context }}) => {{
+          var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy}} = context;
+          
+          await page.setBypassCSP(true)
+          await page.setExtraHTTPHeaders(req_headers);          
+          await page.setUserAgent(user_agent);
+          
+          if(proxy) {{
+            await page.authenticate({{
+                username: proxy['username'],
+                password: proxy['password'],
+            }});
+          }}
+          
+          const r = await page.goto(url, wait_until='commit');                  
+          await page.waitForTimeout(extra_wait_ms)
+          
+          if(execute_js) {{
+            await page.evaluate(execute_js);
+            await page.waitForTimeout(200);
+          }}
+          
+          const xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
+          const instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
+      
+          const html = await page.content();
+          const b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
+          return {{
+            data: {{
+                'content': html, 
+                'headers': r.headers(), 
+                'instock_data': instock_data,
+                'screenshot': b64s,
+                'status_code': r.status(),
+                'xpath_data': xpath_data
+            }},
+            type: 'application/json',
+          }};
+        }};"""
+
+        from requests.exceptions import ConnectTimeout, ReadTimeout
+        wait_browserless_seconds = 120
+
+        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
+        from urllib.parse import urlparse
+        if not browserless_function_url:
+            # Not set explicitly - derive it from PLAYWRIGHT_DRIVER_URL (scheme becomes http, path becomes 'function')
+            o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
+            browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()
+
+
+        # Append proxy connect string
+        if self.proxy:
+            import urllib.parse
+            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
+            # Actual authentication handled by Puppeteer/node
+            o = urlparse(self.proxy.get('server'))
+            proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
+            browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
+
+
+        try:
+            response = requests.request(
+                method="POST",
+                json={
+                    "code": code,
+                    "context": {
+                        'execute_js': self.webdriver_js_execute_code,
+                        'extra_wait_ms': extra_wait_ms,
+                        'include_filters': current_include_filters,
+                        'proxy': self.proxy,
+                        'req_headers': request_headers,
+                        'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
+                        'url': url,
+                        'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
+                    }
+                },
+                # @todo the ws:// -> http:// rewrite used to build the /function URL needs reworking
+                url=browserless_function_url,
+                timeout=wait_browserless_seconds)
+
+        except ReadTimeout:
+            raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
+        except ConnectTimeout:
+            raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
+        else:
+            # A 200 here only means that communication with browserless worked; it says nothing about the page's own status
+            if response.status_code == 200:
+                import base64
+
+                x = response.json()
+                if not x.get('screenshot'):
+                    raise ScreenshotUnavailable(url=url, status_code=None)
+
+                if not x.get('content', '').strip():
+                    raise EmptyReply(url=url, status_code=None)
+
+                if x.get('status_code', 200) != 200 and not ignore_status_codes:
+                    raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
+
+                self.content = x.get('content')
+                self.headers = x.get('headers')
+                self.instock_data = x.get('instock_data')
+                self.screenshot = base64.b64decode(x.get('screenshot'))
+                self.xpath_data = x.get('xpath_data')
+
+            else:
+                # Some other error from browserless
+                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
+
+    def run_playwright(self,
+                       url,
+                       timeout,
+                       request_headers,
+                       request_body,
+                       request_method,
+                       ignore_status_codes=False,
+                       current_include_filters=None,
+                       is_binary=False):
+
         from playwright.sync_api import sync_playwright
         import playwright._impl._api_types
 
@@ -294,7 +453,7 @@ class base_html_playwright(Fetcher):
             # Set user agent to prevent Cloudflare from blocking the browser
             # Use the default one configured in the App.py model that's passed from fetch_site_status.py
             context = browser.new_context(
-                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
+                user_agent=request_headers.get('User-Agent', 'Mozilla/5.0'),
                 proxy=self.proxy,
                 # This is needed to enable JavaScript execution on GitHub and others
                 bypass_csp=True,
@@ -324,12 +483,12 @@ class base_html_playwright(Fetcher):
             except playwright._impl._api_types.Error as e:
                 # Retry once - https://github.com/browserless/chrome/issues/2485
                 # Sometimes errors related to invalid cert's and other can be random
-                print ("Content Fetcher > retrying request got error - ", str(e))
+                print("Content Fetcher > retrying request got error - ", str(e))
                 time.sleep(1)
                 response = self.page.goto(url, wait_until='commit')
 
             except Exception as e:
-                print ("Content Fetcher > Other exception when page.goto", str(e))
+                print("Content Fetcher > Other exception when page.goto", str(e))
                 context.close()
                 browser.close()
                 raise PageUnloadable(url=url, status_code=None, message=str(e))
@@ -348,7 +507,7 @@ class base_html_playwright(Fetcher):
                 # This can be ok, we will try to grab what we could retrieve
                 pass
             except Exception as e:
-                print ("Content Fetcher > Other exception when executing custom JS code", str(e))
+                print("Content Fetcher > Other exception when executing custom JS code", str(e))
                 context.close()
                 browser.close()
                 raise PageUnloadable(url=url, status_code=None, message=str(e))
@@ -356,7 +515,7 @@ class base_html_playwright(Fetcher):
             if response is None:
                 context.close()
                 browser.close()
-                print ("Content Fetcher > Response object was none")
+                print("Content Fetcher > Response object was none")
                 raise EmptyReply(url=url, status_code=None)
 
             # Run Browser Steps here
@@ -370,7 +529,7 @@ class base_html_playwright(Fetcher):
             if len(self.page.content().strip()) == 0:
                 context.close()
                 browser.close()
-                print ("Content Fetcher > Content was empty")
+                print("Content Fetcher > Content was empty")
                 raise EmptyReply(url=url, status_code=response.status)
 
             self.status_code = response.status
@@ -382,7 +541,8 @@ class base_html_playwright(Fetcher):
             else:
                 self.page.evaluate("var include_filters=''")
 
-            self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
+            self.xpath_data = self.page.evaluate(
+                "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
             self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
 
             # Bug 3 in Playwright screenshot handling
@@ -394,7 +554,8 @@ class base_html_playwright(Fetcher):
             # acceptable screenshot quality here
             try:
                 # The actual screenshot
-                self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
+                self.screenshot = self.page.screenshot(type='jpeg', full_page=True,
+                                                       quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
             except Exception as e:
                 context.close()
                 browser.close()
@@ -403,6 +564,7 @@ class base_html_playwright(Fetcher):
             context.close()
             browser.close()
 
+
 class base_html_webdriver(Fetcher):
     if os.getenv("WEBDRIVER_URL"):
         fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
diff --git a/changedetectionio/res/xpath_element_scraper.js b/changedetectionio/res/xpath_element_scraper.js
index effdd58b..e363d2a1 100644
--- a/changedetectionio/res/xpath_element_scraper.js
+++ b/changedetectionio/res/xpath_element_scraper.js
@@ -38,15 +38,15 @@ const findUpTag = (el) => {
     if (el.name !== undefined && el.name.length) {
         var proposed = el.tagName + "[name=" + el.name + "]";
         var proposed_element = window.document.querySelectorAll(proposed);
-        if(proposed_element.length) {
+        if (proposed_element.length) {
             if (proposed_element.length === 1) {
                 return proposed;
             } else {
                 // Some sites change ID but name= stays the same, we can hit it if we know the index
                 // Find all the elements that match and work out the input[n]
-                var n=Array.from(proposed_element).indexOf(el);
+                var n = Array.from(proposed_element).indexOf(el);
                 // Return a Playwright selector for nthinput[name=zipcode]
-                return proposed+" >> nth="+n;
+                return proposed + " >> nth=" + n;
             }
         }
     }