changedetection.io/changedetectionio/content_fetcher.py

from abc import abstractmethod
from distutils.util import strtobool
from urllib.parse import urlparse
import chardet
import hashlib
import json
import logging
import os
import requests
import sys
import time
import urllib.parse

# Comma-separated list of element selectors that replaces the '%ELEMENTS%' placeholder
# in the xPath element scraper JS before that script is handed to the browser.
# NOTE(review): 'section' is listed twice - presumably harmless, confirm before de-duplicating.
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'


class Non200ErrorCodeReceived(Exception):
    """Signals that the fetched page reported a status code other than 200.

    The constructor arguments are stored as attributes of the same name so that
    other parts of the app can inspect them.  ``page_text`` holds the text
    rendering of ``page_html`` (via ``html_tools.html_to_text``) or ``None``
    when no HTML was supplied.
    """

    def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
        self.url = url
        self.status_code = status_code
        self.xpath_data = xpath_data
        self.screenshot = screenshot

        if not page_html:
            self.page_text = None
        else:
            # Imported at call time rather than at module level
            from changedetectionio import html_tools
            self.page_text = html_tools.html_to_text(page_html)


class checksumFromPreviousCheckWasTheSame(Exception):
    """Marker exception that carries no data.

    NOTE(review): not raised inside this module; going by the name it presumably
    lets callers short-circuit when a checksum did not change - confirm at the call sites.
    """

    def __init__(self):
        # Takes no arguments and stores nothing
        pass


class JSActionExceptions(Exception):
    """Exception carrying a status code, URL, screenshot and an optional message.

    NOTE(review): not raised inside this module; presumably used by callers when
    a JavaScript action fails - confirm at the call sites.
    """

    def __init__(self, status_code, url, screenshot, message=''):
        self.message = message
        self.screenshot = screenshot
        self.url = url
        self.status_code = status_code


class BrowserStepsStepTimout(Exception):
    """Raised when a browser step hits a Playwright timeout.

    ``step_n`` is the 1-based number of the step that timed out.
    """

    def __init__(self, step_n):
        # Remember which step failed so it can be reported
        self.step_n = step_n


class PageUnloadable(Exception):
    """Raised when a page could not be loaded by the browser or the browser service.

    ``status_code``, ``url``, ``message`` and ``screenshot`` are stored as
    attributes so other parts of the app can use them.
    """

    def __init__(self, status_code, url, message, screenshot=False):
        self.message = message
        self.screenshot = screenshot
        self.url = url
        self.status_code = status_code


class EmptyReply(Exception):
    """Raised when a fetch produced no response object or an empty body.

    ``status_code``, ``url`` and the optional ``screenshot`` are stored as
    attributes so other parts of the app can use them.
    """

    def __init__(self, status_code, url, screenshot=None):
        self.screenshot = screenshot
        self.url = url
        self.status_code = status_code


class ScreenshotUnavailable(Exception):
    """Raised when no screenshot could be obtained for a page.

    Attributes:
        status_code: status code passed by the raiser (may be ``None``).
        url: URL that was being fetched.
        page_text: text rendering of ``page_html``, or ``None`` when no HTML
            was supplied.
    """

    def __init__(self, status_code, url, page_html=None):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
        # Always define the attribute so readers never hit AttributeError
        self.page_text = None

        if page_html:
            # Same package-qualified import as Non200ErrorCodeReceived; a bare
            # `html_tools` import only resolves when the package directory
            # itself happens to be on sys.path.
            from changedetectionio import html_tools
            self.page_text = html_tools.html_to_text(page_html)


class ReplyWithContentButNoText(Exception):
    """Exception carrying the status code, URL, screenshot, filter flag and HTML of a reply.

    NOTE(review): not raised inside this module; going by the name it presumably
    marks replies whose HTML yielded no text - confirm at the call sites.
    """

    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
        self.html_content = html_content
        self.has_filters = has_filters
        self.screenshot = screenshot
        self.url = url
        self.status_code = status_code


class Fetcher():
    """Shared base for the content fetchers in this module.

    Holds the result attributes of a fetch (content, headers, status code,
    screenshot, ...) plus the helper logic for running "browser steps".
    Subclasses supply ``run()``.
    """
    # --- Result of the most recent fetch ---
    content = None
    error = None
    headers = {}
    status_code = None
    xpath_data = None
    instock_data = None
    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False

    # --- Per-watch configuration ---
    browser_steps = None
    browser_steps_screenshot_path = None
    webdriver_js_execute_code = None
    # Time ONTOP of the system defined env minimum time
    render_extract_delay = 0

    # --- Static description / JS snippets (the snippets are loaded in __init__) ---
    fetcher_description = "No description"
    xpath_element_js = ""
    instock_data_js = ""

    # --- System wide proxies taken from the environment ---
    system_http_proxy = os.getenv('HTTP_PROXY')
    system_https_proxy = os.getenv('HTTPS_PROXY')

    def __init__(self):
        """Load the JS helper snippets that ship as package resources."""
        from pkg_resources import resource_string

        def _read_js(resource_path):
            return resource_string(__name__, resource_path).decode('utf-8')

        # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
        self.xpath_element_js = _read_js("res/xpath_element_scraper.js")
        self.instock_data_js = _read_js("res/stock-not-in-stock.js")

    @abstractmethod
    def get_error(self):
        """Return the stored error (``self.error``)."""
        return self.error

    @abstractmethod
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Perform the fetch; implementations should set self.error, self.status_code and self.content."""
        pass

    @abstractmethod
    def quit(self):
        """Release any resources held by the fetcher (nothing to do in the base class)."""
        return

    @abstractmethod
    def get_last_status_code(self):
        """Return the stored status code (``self.status_code``)."""
        return self.status_code

    @abstractmethod
    def screenshot_step(self, step_n):
        """Capture a screenshot for browser step ``step_n`` (the base class captures nothing)."""
        return None

    @abstractmethod
    def is_ready(self):
        """Return true/false if this checker is ready to run, in the case it needs todo some special config check etc."""
        return True

    def get_all_headers(self):
        """
        Get all headers but ensure all keys are lowercase
        :return: new dict mapping each lower-cased header name to its value
        """
        lowered = {}
        for header_name, header_value in self.headers.items():
            lowered[header_name.lower()] = header_value
        return lowered

    def iterate_browser_steps(self):
        """Run every configured browser step against ``self.page``.

        Steps whose operation is empty, 'Choose one' or 'Goto site' are skipped.
        A screenshot and an HTML snapshot are saved before and after each step.

        :raises BrowserStepsStepTimout: when a step hits a Playwright timeout.
        """
        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
        from playwright._impl._api_types import TimeoutError
        from jinja2 import Environment
        template_env = Environment(extensions=['jinja2_time.TimeExtension'])

        # Nothing configured, nothing to run
        if self.browser_steps is None or not len(self.browser_steps):
            return

        def _is_runnable(candidate):
            operation = candidate['operation']
            return operation and len(operation) and operation != 'Choose one' and operation != 'Goto site'

        def _render(raw_text):
            # Support for jinja2 template in step values, with date module added
            if '{%' in raw_text or '{{' in raw_text:
                return str(template_env.from_string(raw_text).render())
            return raw_text

        interface = steppable_browser_interface()
        interface.page = self.page

        runnable_steps = (candidate for candidate in self.browser_steps if _is_runnable(candidate))
        for step_n, step in enumerate(runnable_steps, start=1):
            print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
            before_label = "before-" + str(step_n)
            self.screenshot_step(before_label)
            self.save_step_html(before_label)
            try:
                raw_value = step['optional_value']
                raw_selector = step['selector']
                optional_value = _render(raw_value)
                selector = _render(raw_selector)

                interface.call_action(action_name=step['operation'],
                                      selector=selector,
                                      optional_value=optional_value)
                self.screenshot_step(step_n)
                self.save_step_html(step_n)
            except TimeoutError as e:
                print(str(e))
                # Stop processing here
                raise BrowserStepsStepTimout(step_n=step_n)

    def delete_browser_steps_screenshots(self):
        """Remove the 'step_*.jpeg' files from the browser steps screenshot path.

        It's always good to reset these. Does nothing when no path is configured.
        """
        import glob
        if self.browser_steps_screenshot_path is None:
            return

        pattern = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
        for candidate in glob.glob(pattern):
            if os.path.isfile(candidate):
                os.unlink(candidate)


#   Maybe for the future, each fetcher provides its own diff output, could be used for text, image
#   the current one would return javascript output (as we use JS to generate the diff)
#
def available_fetchers():
    """List the fetchers this module offers.

    :return: list of ``(class_name, fetcher_description)`` tuples, one for every
             class found in this module whose name starts with ``html_``.
    """
    # See the if statement at the bottom of this file for how we switch between playwright and webdriver
    import inspect
    this_module = sys.modules[__name__]

    # @todo html_ is maybe better as fetcher_ or something
    # In this case, make sure to edit the default one in store.py and fetch_site_status.py
    return [
        (class_name, klass.fetcher_description)
        for class_name, klass in inspect.getmembers(this_module, inspect.isclass)
        if class_name.startswith('html_')
    ]


class base_html_playwright(Fetcher):
    """Fetcher that renders pages in a remote browser.

    By default it connects to the browser at PLAYWRIGHT_DRIVER_URL over CDP with
    Playwright.  When USE_EXPERIMENTAL_PUPPETEER_FETCH is enabled (and the watch
    has no browser steps) the page is fetched through the browserless
    "function" HTTP endpoint instead.
    """
    fetcher_description = "Playwright {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']

    proxy = None

    def __init__(self, proxy_override=None):
        """Read browser type, driver URL and proxy settings from the environment.

        :param proxy_override: optional proxy server URL that replaces any
                               proxy configured through the environment.
        """
        super().__init__()
        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
        self.command_executor = os.getenv(
            "PLAYWRIGHT_DRIVER_URL",
            'ws://playwright-chrome:3000'
        ).strip('"')

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.playwright_proxy_settings_mappings:
            v = os.getenv('playwright_proxy_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        if proxy_args:
            self.proxy = proxy_args

        # allow per-watch proxy selection override
        if proxy_override:
            self.proxy = {'server': proxy_override}

        if self.proxy:
            # Playwright needs separate username and password values
            parsed = urlparse(self.proxy.get('server'))
            if parsed.username:
                self.proxy['username'] = parsed.username
                self.proxy['password'] = parsed.password

    def screenshot_step(self, step_n=''):
        """Take a full page JPEG screenshot and store it as 'step_<step_n>.jpeg' when a step path is configured."""
        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)

        if self.browser_steps_screenshot_path is not None:
            destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
            logging.debug("Saving step screenshot to {}".format(destination))
            with open(destination, 'wb') as f:
                f.write(screenshot)

    def save_step_html(self, step_n):
        """Store the current page HTML as 'step_<step_n>.html' when a step path is configured."""
        content = self.page.content()

        # Same guard as screenshot_step(): without a configured path there is nowhere to write to
        if self.browser_steps_screenshot_path is None:
            return

        destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
        logging.debug("Saving step HTML to {}".format(destination))
        with open(destination, 'w') as f:
            f.write(content)

    def run_fetch_browserless_puppeteer(self,
                                        url,
                                        timeout,
                                        request_headers,
                                        request_body,
                                        request_method,
                                        ignore_status_codes=False,
                                        current_include_filters=None,
                                        is_binary=False):
        """Fetch ``url`` by POSTing a puppeteer script to the browserless "function" endpoint.

        On success sets self.content, self.headers, self.instock_data,
        self.screenshot, self.status_code and self.xpath_data from the JSON reply.

        ``timeout``, ``request_body``, ``request_method`` and ``is_binary`` are
        accepted for signature compatibility with run() but are not used here.

        :raises PageUnloadable: browserless timed out or answered with a non-200 status.
        :raises ScreenshotUnavailable: the reply contained no screenshot.
        :raises EmptyReply: the reply contained no page content.
        :raises Non200ErrorCodeReceived: the page status was not 200 and ``ignore_status_codes`` is false.
        """
        from pkg_resources import resource_string

        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000

        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
        # In the future inject this is a proper JS package
        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
        code = code.replace('%instock_scrape_code%', self.instock_data_js)

        from requests.exceptions import ConnectTimeout, ReadTimeout
        wait_browserless_seconds = 240

        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
        if not browserless_function_url:
            # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
            o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
            browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()

        # Append proxy connect string
        if self.proxy:
            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
            # Actual authentication handled by Puppeteer/node
            o = urlparse(self.proxy.get('server'))
            # Remove scheme, socks5:// doesnt always work and it will autodetect anyway
            proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl().replace(f"{o.scheme}://", '', 1))
            # The first query argument has to be introduced with '?', only later ones with '&'
            separator = '&' if '?' in browserless_function_url else '?'
            browserless_function_url = f"{browserless_function_url}{separator}--proxy-server={proxy_url}&dumpio=true"

        try:
            amp = '&' if '?' in browserless_function_url else '?'
            response = requests.request(
                method="POST",
                json={
                    "code": code,
                    "context": {
                        # Very primitive disk cache - USE WITH EXTREME CAUTION
                        # Run browserless container  with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
                        'execute_js': self.webdriver_js_execute_code,
                        'extra_wait_ms': extra_wait_ms,
                        'include_filters': current_include_filters,
                        'req_headers': request_headers,
                        'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
                        'url': url,
                        'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                        'proxy_username': self.proxy.get('username', '') if self.proxy else False,
                        'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
                        'no_cache_list': [
                            'twitter',
                            '.pdf'
                        ],
                        # Could use https://github.com/easylist/easylist here, or install a plugin
                        'block_url_list': [
                            'adnxs.com',
                            'analytics.twitter.com',
                            'doubleclick.net',
                            'google-analytics.com',
                            'googletagmanager',
                            'trustpilot.com'
                        ]
                    }
                },
                # @todo /function needs adding ws:// to http:// rebuild this
                url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
                timeout=wait_browserless_seconds)

        except ReadTimeout:
            raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
        except ConnectTimeout:
            raise PageUnloadable(url=url, status_code=None, message="Timed out connecting to browserless, retrying..")
        else:
            # 200 Here means that the communication to browserless worked only, not the page state
            if response.status_code == 200:
                import base64

                x = response.json()
                if not x.get('screenshot'):
                    # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
                    # https://github.com/puppeteer/puppeteer/issues/1834
                    # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
                    # Check your memory is shared and big enough
                    raise ScreenshotUnavailable(url=url, status_code=None)

                # `or ''` also covers an explicit null for 'content' in the reply
                if not (x.get('content') or '').strip():
                    raise EmptyReply(url=url, status_code=None)

                if x.get('status_code', 200) != 200 and not ignore_status_codes:
                    raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])

                self.content = x.get('content')
                self.headers = x.get('headers')
                self.instock_data = x.get('instock_data')
                self.screenshot = base64.b64decode(x.get('screenshot'))
                self.status_code = x.get('status_code')
                self.xpath_data = x.get('xpath_data')

            else:
                # Some other error from browserless
                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch ``url`` in the remote browser and store the results on this fetcher.

        Sets self.content, self.status_code, self.headers, self.xpath_data,
        self.instock_data and self.screenshot.  The browser context and the
        browser connection are always closed before this method returns or raises.

        :raises PageUnloadable: navigation or the custom JS failed.
        :raises EmptyReply: no response object or an empty page.
        :raises BrowserStepsStepTimout: a browser step timed out.
        :raises ScreenshotUnavailable: the final screenshot could not be taken.
        """

        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
        has_browser_steps = self.browser_steps and list(filter(
                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
                self.browser_steps))

        if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                # Temporary backup solution until we rewrite the playwright code
                return self.run_fetch_browserless_puppeteer(
                    url,
                    timeout,
                    request_headers,
                    request_body,
                    request_method,
                    ignore_status_codes,
                    current_include_filters,
                    is_binary)

        from playwright.sync_api import sync_playwright
        import playwright._impl._api_types

        self.delete_browser_steps_screenshots()
        response = None

        with sync_playwright() as p:
            browser_type = getattr(p, self.browser_type)

            # Seemed to cause a connection Exception even tho I can see it connect
            # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
            # 60,000 connection timeout only
            browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
            context = None

            try:
                # SOCKS5 with authentication is not supported (yet)
                # https://github.com/microsoft/playwright/issues/10567

                # Set user agent to prevent Cloudflare from blocking the browser
                # Use the default one configured in the App.py model that's passed from fetch_site_status.py
                context = browser.new_context(
                    user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                    proxy=self.proxy,
                    # This is needed to enable JavaScript execution on GitHub and others
                    bypass_csp=True,
                    # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
                    service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
                    # Should never be needed
                    accept_downloads=False
                )

                self.page = context.new_page()
                if len(request_headers):
                    context.set_extra_http_headers(request_headers)

                # Timeouts and console logging apply to every fetch, not only to fetches that send extra headers
                self.page.set_default_navigation_timeout(90000)
                self.page.set_default_timeout(90000)

                # Listen for all console events and handle errors
                self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

                # Goto page
                try:
                    # Wait_until = commit
                    # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
                    # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
                    # This seemed to solve nearly all 'TimeoutErrors'
                    response = self.page.goto(url, wait_until='commit')
                except playwright._impl._api_types.Error as e:
                    # Retry once - https://github.com/browserless/chrome/issues/2485
                    # Sometimes errors related to invalid cert's and other can be random
                    print("Content Fetcher > retrying request got error - ", str(e))
                    time.sleep(1)
                    response = self.page.goto(url, wait_until='commit')
                except Exception as e:
                    print("Content Fetcher > Other exception when page.goto", str(e))
                    raise PageUnloadable(url=url, status_code=None, message=str(e)) from e

                # Wait for the page to settle, then run any custom JS
                try:
                    extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                    self.page.wait_for_timeout(extra_wait * 1000)

                    if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                        self.page.evaluate(self.webdriver_js_execute_code)

                except playwright._impl._api_types.TimeoutError:
                    # This can be ok, we will try to grab what we could retrieve.
                    # The context/browser must stay open because the page is still read below.
                    pass
                except Exception as e:
                    print("Content Fetcher > Other exception when executing custom JS code", str(e))
                    raise PageUnloadable(url=url, status_code=None, message=str(e)) from e

                if response is None:
                    print("Content Fetcher > Response object was none")
                    raise EmptyReply(url=url, status_code=None)

                # Run Browser Steps here
                self.iterate_browser_steps()

                # Second settle period, after the browser steps have run
                time.sleep(extra_wait)

                self.content = self.page.content()
                self.status_code = response.status
                if not self.content.strip():
                    print("Content Fetcher > Content was empty")
                    raise EmptyReply(url=url, status_code=response.status)

                self.headers = response.all_headers()

                # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
                if current_include_filters is not None:
                    self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
                else:
                    self.page.evaluate("var include_filters=''")

                self.xpath_data = self.page.evaluate(
                    "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
                self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")

                # Bug 3 in Playwright screenshot handling
                # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
                # JPEG is better here because the screenshots can be very very large

                # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
                # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
                # acceptable screenshot quality here
                try:
                    # The actual screenshot
                    self.screenshot = self.page.screenshot(type='jpeg', full_page=True,
                                                           quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
                except Exception as e:
                    raise ScreenshotUnavailable(url=url, status_code=None) from e

            finally:
                # Single cleanup point for the success path and for every error path
                if context is not None:
                    context.close()
                browser.close()


class base_html_webdriver(Fetcher):
    """Fetcher that renders pages through a remote Selenium WebDriver (Chrome)."""
    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    command_executor = ''

    # Set by run()/is_ready(); defined here so quit() is safe before a session exists
    driver = None

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

    def __init__(self, proxy_override=None):
        """Read the WebDriver URL and proxy settings from the environment.

        :param proxy_override: optional proxy URL for this request; replaces the
                               HTTP proxy and, for socks5:// URLs, is converted
                               to a Selenium SOCKS5 configuration.
        :raises Exception: a socks5:// proxy URL was given without a port.
        """
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy.
        # proxy_args is keyed WITHOUT the "webdriver_" prefix, so explicit webdriver_* settings
        # have to be looked up by the bare key or the system proxy would always replace them.
        if not proxy_args.get('httpProxy') and self.system_http_proxy:
            proxy_args['httpProxy'] = self.system_http_proxy
        # Not applied together with a per-request override, so the system HTTPS proxy
        # never takes HTTPS traffic away from the proxy chosen for this request.
        if not proxy_args.get('sslProxy') and self.system_https_proxy and proxy_override is None:
            proxy_args['sslProxy'] = self.system_https_proxy

        # Allows override the proxy on a per-request basis
        if proxy_override is not None:
            proxy_args['httpProxy'] = proxy_override

        if proxy_args:

            # Convert socks5:// to selenium SOCKS5 correct config
            o = urlparse(proxy_args.get('httpProxy', ''))
            # https://github.com/tebeka/selenium/issues/167
            # https://github.com/SeleniumHQ/selenium/issues/6917
            if o.scheme == 'socks5':
                proxy_args.pop('httpProxy', None)
                # NOTE(review): a non-empty o.path already starts with "/", so this yields "//..." - confirm intent
                path = "/" + o.path if o.path else ''
                if not o.port:
                    raise Exception("No Port number specific in this proxy config URL")
                proxy_args['socksProxy'] = f"{o.hostname}:{o.port}{path}"
                proxy_args['socksVersion'] = 5
                proxy_args['proxyType'] = 'MANUAL'
                if o.username:
                    proxy_args['socksUsername'] = o.username
                    proxy_args['socksPassword'] = o.password

            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Load ``url`` in a new remote Chrome session and store page source and screenshot.

        Sets self.content, self.headers (always empty), self.screenshot (PNG) and
        self.status_code (always 200, the real status is not available here).
        Only ``url`` is used; the other parameters are accepted for interface
        compatibility.  The session stays open - call quit() afterwards.

        :raises WebDriverException: loading the page failed (the session is closed first).
        """

        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
        from selenium.common.exceptions import WebDriverException
        # request_body, request_method unused for now, until some magic in the future happens.

        self.driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=DesiredCapabilities.CHROME,
            proxy=self.proxy)

        try:
            self.driver.get(url)
        except WebDriverException:
            # Be sure we close the session window
            self.quit()
            raise

        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        if self.webdriver_js_execute_code is not None:
            self.driver.execute_script(self.webdriver_js_execute_code)
            # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
            self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        # @todo - how to check this? is it possible?
        self.status_code = 200
        # @todo somehow we should try to get this working for WebDriver
        # raise EmptyReply(url=url, status_code=r.status_code)

        # @todo - dom wait loaded?
        time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
        self.content = self.driver.page_source
        self.headers = {}

        self.screenshot = self.driver.get_screenshot_as_png()

    # Does the connection to the webdriver work? run a test connection.
    def is_ready(self):
        """Open and immediately close a test session; returns True when that did not raise."""
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

        self.driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=DesiredCapabilities.CHROME)

        # driver.quit() seems to cause better exceptions
        self.quit()
        return True

    def quit(self):
        """Close the WebDriver session if there is one; shutdown errors are printed, not raised."""
        if self.driver:
            try:
                self.driver.quit()
            except Exception as e:
                print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))

# "html_requests" is listed as the default fetcher in store.py!
class html_requests(Fetcher):
    """Fetcher that retrieves pages with a plain HTTP request (``requests``), without a browser."""
    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None):
        # Only the optional per-request proxy is remembered here
        self.proxy_override = proxy_override

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Request ``url`` and store status code, content, headers and raw body on this fetcher.

        For binary requests ``self.content`` is the MD5 hex digest of the body,
        otherwise it is the decoded text.  A default User-Agent is added to
        ``request_headers`` when none is present.

        NOTE(review): the request is sent with ``verify=False``, i.e. TLS
        certificates are not verified.

        :raises EmptyReply: the response body was empty.
        :raises Non200ErrorCodeReceived: status was not 200 and ``ignore_status_codes`` is false.
        """

        # Make requests use a more modern looking user-agent
        headers_by_lowercase_name = {name.lower(): value for name, value in request_headers.items()}
        if not headers_by_lowercase_name.get('user-agent', None):
            request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
                                                      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = dict.fromkeys(('http', 'https', 'ftp'), self.proxy_override)
        else:
            proxies = {}
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=request_headers,
                             timeout=timeout,
                             proxies=proxies,
                             verify=False)

        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        if not is_binary:
            # Don't run this for PDF (and requests identified as binary) takes a _long_ time
            content_type = r.headers.get('content-type')
            if not content_type or 'charset=' not in content_type:
                detected_encoding = chardet.detect(r.content)['encoding']
                if detected_encoding:
                    r.encoding = detected_encoding

        if not r.content:
            raise EmptyReply(url=url, status_code=r.status_code)

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if r.status_code != 200 and not ignore_status_codes:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
        # Binary files just return their checksum until we add something smarter
        self.content = hashlib.md5(r.content).hexdigest() if is_binary else r.text
        self.headers = r.headers
        self.raw_content = r.content


# Pick the class that backs the 'html_webdriver' fetcher name. This is a system wide
# setting (driven by the PLAYWRIGHT_DRIVER_URL env var) rather than a per-site one.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
html_webdriver = base_html_playwright if use_playwright_as_chrome_fetcher else base_html_webdriver