changedetection.io/changedetectionio/content_fetcher.py

from abc import ABC, abstractmethod
import chardet
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
from selenium.common.exceptions import WebDriverException
import requests
import time
import urllib3.exceptions


class EmptyReply(Exception):
    """Exception carrying the status code and URL of a reply that was rejected.

    Both values are kept as attributes so other parts of the app can
    inspect them after catching the exception.
    """

    def __init__(self, status_code, url):
        # Deliberately only stores the two values; no message is built here.
        self.url = url
        self.status_code = status_code

class Fetcher():
    """Common base for the fetcher classes in this module.

    NOTE: the class does not inherit from ``ABC``, so the ``@abstractmethod``
    markers below are informational only - the class can still be
    instantiated and the default method bodies can be called.
    """

    # Human readable label for this fetcher
    fetcher_description = "No description"

    # Result state, expected to be filled in by run()
    error = None
    status_code = None
    content = None
    headers = None

    @abstractmethod
    def get_error(self):
        """Return whatever is stored in ``self.error`` (``None`` by default)."""
        return self.error

    @abstractmethod
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False):
        """Fetch ``url``; implementations should set ``self.error``,
        ``self.status_code`` and ``self.content``.

        The base implementation does nothing and returns ``None``.
        """

    @abstractmethod
    def get_last_status_code(self):
        """Return whatever is stored in ``self.status_code`` (``None`` by default)."""
        return self.status_code

    @abstractmethod
    def is_ready(self):
        """Return True/False depending on whether this fetcher is ready to run.

        Subclasses can override this when they need to do some special
        config check first; the base implementation always returns ``True``.
        """
        return True

#   Maybe for the future, each fetcher provides its own diff output, could be used for text, image
#   the current one would return javascript output (as we use JS to generate the diff)
#
#   Returns tuple(mime_type, stream)
#    @abstractmethod
#    def return_diff(self, stream_a, stream_b):
#        return

def available_fetchers():
    """List the fetcher classes defined in this module.

    Returns a list of ``(class_name, fetcher_description)`` tuples, one for
    every class found in ``changedetectionio.content_fetcher`` whose name
    contains ``"html_"``.
    """
    import inspect
    from changedetectionio import content_fetcher

    # @todo html_ is maybe better as fetcher_ or something
    # In this case, make sure to edit the default one in store.py and fetch_site_status.py
    return [
        (cls_name, cls.fetcher_description)
        for cls_name, cls in inspect.getmembers(content_fetcher, inspect.isclass)
        if "html_" in cls_name
    ]

class html_webdriver(Fetcher):
    """Fetch a page through a remote Selenium WebDriver (Chrome) session.

    Configuration is read from the environment:

    * ``WEBDRIVER_URL`` - address of the remote WebDriver hub
      (defaults to ``http://browser-chrome:4444/wd/hub``).
    * ``webdriver_<setting>`` - optional Selenium proxy settings, one per
      entry in ``selenium_proxy_settings_mappings``.
    * ``WEBDRIVER_DELAY_BEFORE_CONTENT_READY`` - seconds to wait after
      loading the page before reading its source (defaults to 5).
    """

    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    command_executor = ''

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']

    proxy = None

    def __init__(self):
        """Read the WebDriver URL and any proxy settings from the environment."""
        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False):
        """Load ``url`` in a remote browser and store the rendered page source.

        On success sets ``self.status_code`` (always 200, see the todo below),
        ``self.content`` (the page source) and ``self.headers`` (an empty
        dict). ``timeout``, ``request_headers``, ``request_body``,
        ``request_method`` and ``ignore_status_codes`` are accepted for
        signature compatibility but are not used here.

        Any exception raised while driving the browser is propagated to the
        caller; the remote session is always closed first.
        """

        # request_body, request_method unused for now, until some magic in the future happens.

        driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=DesiredCapabilities.CHROME,
            proxy=self.proxy)

        try:
            driver.get(url)

            # @todo - how to check this? is it possible?
            self.status_code = 200
            # @todo somehow we should try to get this working for WebDriver
            # raise EmptyReply(url=url, status_code=r.status_code)

            # @todo - dom wait loaded?
            time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
            self.content = driver.page_source
            self.headers = {}
        finally:
            # Be sure we close the session window, whatever went wrong above
            # (not only WebDriverException), so no browser session is leaked.
            driver.quit()

    def is_ready(self):
        """Check that a session can be opened on the configured WebDriver.

        Opens a remote session and immediately closes it. Returns ``True``
        when that works; otherwise the underlying exception propagates.
        """
        driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=DesiredCapabilities.CHROME)

        # driver.quit() seems to cause better exceptions
        driver.quit()

        return True

# "html_requests" is listed as the default fetcher in store.py!
class html_requests(Fetcher):
    """Fetch a page with a plain HTTP request via the ``requests`` library."""

    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False):
        """Perform the HTTP request and store the decoded reply.

        On success sets ``self.status_code``, ``self.content`` (decoded text)
        and ``self.headers``.

        Raises:
            EmptyReply: when the body is empty, or when the response is not
                OK and ``ignore_status_codes`` is false.

        Exceptions raised by ``requests`` itself are not caught here.
        """

        # NOTE(review): verify=False disables TLS certificate verification for
        # every request - presumably intentional, confirm before changing.
        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=request_headers,
                             timeout=timeout,
                             verify=False)

        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        #
        # The parameter name is matched case-insensitively so a header such as
        # "text/html; Charset=utf-8" still counts as declaring its encoding.
        content_type = r.headers.get('content-type') or ''
        if 'charset=' not in content_type.lower():
            encoding = chardet.detect(r.content)['encoding']
            if encoding:
                r.encoding = encoding

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        # `not r` is true for a non-OK response (requests defines truthiness that way)
        if (not ignore_status_codes and not r) or not r.content:
            raise EmptyReply(url=url, status_code=r.status_code)

        self.status_code = r.status_code
        self.content = r.text
        self.headers = r.headers
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`from abc import ABC, abstractmethod`
Detect byte-encoding when the server mishandles the content-type header reply (#472) 3 years ago			`import chardet`
			`import os`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`from selenium import webdriver`
			`from selenium.webdriver.common.desired_capabilities import DesiredCapabilities`
Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`from selenium.webdriver.common.proxy import Proxy as SeleniumProxy`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`from selenium.common.exceptions import WebDriverException`
Detect byte-encoding when the server mishandles the content-type header reply (#472) 3 years ago			`import requests`
			`import time`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`import urllib3.exceptions`


			`class EmptyReply(Exception):`
Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test 3 years ago			`def __init__(self, status_code, url):`
			`# Set this so we can use it in other parts of the app`
			`self.status_code = status_code`
			`self.url = url`
			`return`

Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`pass`

			`class Fetcher():`
			`error = None`
			`status_code = None`
Detect byte-encoding when the server mishandles the content-type header reply (#472) 3 years ago			`content = None`
Be sure that documents returned with a application/json header are not parsed with inscriptis (#337) * Auto-detect JSON by Content-Type header * Add test to not parse JSON responses with inscriptis 3 years ago			`headers = None`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`fetcher_description ="No description"`

			`@abstractmethod`
			`def get_error(self):`
			`return self.error`

			`@abstractmethod`
Allow changedetector to ignore status codes as a per-site setting (#479) (#485) Co-authored-by: Ara Hayrabedian <ara.hayrabedian@gmail.com> 3 years ago			`def run(self,`
			`url,`
			`timeout,`
			`request_headers,`
			`request_body,`
			`request_method,`
			`ignore_status_codes=False):`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`# Should set self.error, self.status_code and self.content`
			`pass`

			`@abstractmethod`
			`def get_last_status_code(self):`
			`return self.status_code`

			`@abstractmethod`
			`# Return true/false if this checker is ready to run, in the case it needs todo some special config check etc`
			`def is_ready(self):`
			`return True`

			`# Maybe for the future, each fetcher provides its own diff output, could be used for text, image`
			`# the current one would return javascript output (as we use JS to generate the diff)`
			`#`
			`# Returns tuple(mime_type, stream)`
			`# @abstractmethod`
			`# def return_diff(self, stream_a, stream_b):`
			`# return`

			`def available_fetchers():`
			`import inspect`
Installation via pip (#186) Builder for https://pypi.org/project/changedetection.io/ 3 years ago			`from changedetectionio import content_fetcher`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`p=[]`
			`for name, obj in inspect.getmembers(content_fetcher):`
			`if inspect.isclass(obj):`
			`# @todo html_ is maybe better as fetcher_ or something`
			`# In this case, make sure to edit the default one in store.py and fetch_site_status.py`
			`if "html_" in name:`
			`t=tuple([name,obj.fetcher_description])`
			`p.append(t)`

			`return p`

			`class html_webdriver(Fetcher):`
WebDriver fetcher - settings - when an alternative one is configured, show it in the label 3 years ago			`if os.getenv("WEBDRIVER_URL"):`
			`fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))`
			`else:`
			`fetcher_description = "WebDriver Chrome/Javascript"`

Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`command_executor = ''`

Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`# Configs for Proxy setup`
			`# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"`
Adding new proxyType to selenium mappings 3 years ago			`selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',`
Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`'proxyAutoconfigUrl', 'sslProxy', 'autodetect',`
Add socksVersion mapping (#331) 3 years ago			`'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']`
Adding new proxyType to selenium mappings 3 years ago


Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`proxy=None`

Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`def __init__(self):`
Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`# .strip('"') is going to save someone a lot of time when they accidently wrap the env value`
			`self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')`

			`# If any proxy settings are enabled, then we should setup the proxy object`
			`proxy_args = {}`
			`for k in self.selenium_proxy_settings_mappings:`
			`v = os.getenv('webdriver_' + k, False)`
			`if v:`
			`proxy_args[k] = v.strip('"')`

			`if proxy_args:`
			`self.proxy = SeleniumProxy(raw=proxy_args)`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
Allow changedetector to ignore status codes as a per-site setting (#479) (#485) Co-authored-by: Ara Hayrabedian <ara.hayrabedian@gmail.com> 3 years ago			`def run(self,`
			`url,`
			`timeout,`
			`request_headers,`
			`request_body,`
			`request_method,`
			`ignore_status_codes=False):`
Aligning call signatures #325 3 years ago
			`# request_body, request_method unused for now, until some magic in the future happens.`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`# check env for WEBDRIVER_URL`
			`driver = webdriver.Remote(`
			`command_executor=self.command_executor,`
Re #267 - Pass settings for the proxy setup for webdriver (#326) * Re #267 - Pass HTTP_PROXY as the proxy setup for webdriver * Update README.md 3 years ago			`desired_capabilities=DesiredCapabilities.CHROME,`
			`proxy=self.proxy)`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`try:`
			`driver.get(url)`
			`except WebDriverException as e:`
			`# Be sure we close the session window`
			`driver.quit()`
			`raise`

			`# @todo - how to check this? is it possible?`
			`self.status_code = 200`
Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test 3 years ago			`# @todo somehow we should try to get this working for WebDriver`
			`# raise EmptyReply(url=url, status_code=r.status_code)`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`# @todo - dom wait loaded?`
When using Env. FETCH_WORKERS or WEBDRIVER_DELAY_BEFORE_CONTENT_READY , it should be type int 3 years ago			`time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`self.content = driver.page_source`
Be sure that documents returned with a application/json header are not parsed with inscriptis (#337) * Auto-detect JSON by Content-Type header * Add test to not parse JSON responses with inscriptis 3 years ago			`self.headers = {}`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`driver.quit()`


			`def is_ready(self):`
			`from selenium import webdriver`
			`from selenium.webdriver.common.desired_capabilities import DesiredCapabilities`
			`from selenium.common.exceptions import WebDriverException`

			`driver = webdriver.Remote(`
WebDriver test fetch should use environment var too 3 years ago			`command_executor=self.command_executor,`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`desired_capabilities=DesiredCapabilities.CHROME)`

			`# driver.quit() seems to cause better exceptions`
			`driver.quit()`

			`return True`

			`# "html_requests" is listed as the default fetcher in store.py!`
			`class html_requests(Fetcher):`
			`fetcher_description = "Basic fast Plaintext/HTTP Client"`

Allow changedetector to ignore status codes as a per-site setting (#479) (#485) Co-authored-by: Ara Hayrabedian <ara.hayrabedian@gmail.com> 3 years ago			`def run(self,`
			`url,`
			`timeout,`
			`request_headers,`
			`request_body,`
			`request_method,`
			`ignore_status_codes=False):`
Let the fetcher throw an exception which will be caught and handed to the operator anyway 3 years ago
Extend Request Parameters to add Body & Method (#325) 3 years ago			`r = requests.request(method=request_method,`
			`data=request_body,`
			`url=url,`
Let the fetcher throw an exception which will be caught and handed to the operator anyway 3 years ago			`headers=request_headers,`
			`timeout=timeout,`
			`verify=False)`

Detect byte-encoding when the server mishandles the content-type header reply (#472) 3 years ago			# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
			`# For example - some sites don't tell us it's utf-8, but return utf-8 content`
			`# This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.`
			`# https://github.com/psf/requests/issues/1604 good info about requests encoding detection`
			`if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):`
			`encoding = chardet.detect(r.content)['encoding']`
			`if encoding:`
			`r.encoding = encoding`
Let the fetcher throw an exception which will be caught and handed to the operator anyway 3 years ago
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago			`# @todo test this`
Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test 3 years ago			`# @todo maybe you really want to test zero-byte return pages?`
Allow changedetector to ignore status codes as a per-site setting (#479) (#485) Co-authored-by: Ara Hayrabedian <ara.hayrabedian@gmail.com> 3 years ago			`if (not ignore_status_codes and not r) or not r.content or not len(r.content):`
Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test 3 years ago			`raise EmptyReply(url=url, status_code=r.status_code)`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago
			`self.status_code = r.status_code`
Detect byte-encoding when the server mishandles the content-type header reply (#472) 3 years ago			`self.content = r.text`
Be sure that documents returned with a application/json header are not parsed with inscriptis (#337) * Auto-detect JSON by Content-Type header * Add test to not parse JSON responses with inscriptis 3 years ago			`self.headers = r.headers`
Chrome/Webdriver support for Javascript websites (#114) JS Support via fetching the page over WebDriver/Selenium network Refactor forms (Split into logical tabs) 3 years ago