92 lines
3.8 KiB
92 lines
3.8 KiB
import hashlib
|
|
import os
|
|
|
|
import chardet
|
|
import requests
|
|
|
|
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
|
|
from changedetectionio.content_fetchers.base import Fetcher
|
|
|
|
|
|
# "html_requests" is listed as the default fetcher in store.py!
|
|
class fetcher(Fetcher):
    """Fetcher that retrieves a URL with a single plain ``requests`` HTTP call.

    No browser is driven by this fetcher, so a watch that has browser steps
    configured is rejected in :meth:`run`.
    """

    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Remember the optional proxy override for later requests.

        Args:
            proxy_override: Proxy URL used for http, https and ftp traffic.
                When falsy, ``self.system_http_proxy`` and
                ``self.system_https_proxy`` are consulted instead
                (presumably provided by the base class - verify).
            custom_browser_connection_url: Unused by this fetcher.
        """
        super().__init__()
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch ``url`` and store the result on this instance.

        On success the following attributes are set: ``headers``,
        ``status_code``, ``content`` (decoded text, or the MD5 hex digest of
        the body when ``is_binary`` is true) and ``raw_content`` (the body
        bytes).

        Args:
            url: Address to request.
            timeout: Passed straight through to ``requests.request``.
            request_headers: Mapping of headers to send. A default
                ``User-Agent`` is added to this mapping in place when it has
                no non-empty user-agent entry.
            request_body: Request payload. ``str`` payloads are sent as
                UTF-8 encoded bytes.
            request_method: HTTP method name.
            ignore_status_codes: When true, a non-200 status does not raise.
            current_include_filters: Not used by this fetcher.
            is_binary: When true, skip charset detection and store a checksum
                of the body as ``content`` instead of the text.

        Raises:
            BrowserStepsInUnsupportedFetcher: Browser steps are configured.
            EmptyReply: The response body is empty.
            Non200ErrorCodeReceived: The status code is not 200 and
                ``ignore_status_codes`` is false.
        """
        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)

        # Make requests use a more modern looking user-agent
        if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
            request_headers['User-Agent'] = os.getenv(
                "DEFAULT_SETTINGS_HEADERS_USERAGENT",
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

        # Encode text bodies explicitly. Left as `str`, the wire encoding is
        # picked by the underlying HTTP stack; http.client uses ISO-8859-1 and
        # raises UnicodeEncodeError for any character outside Latin-1.
        if isinstance(request_body, str):
            request_body = request_body.encode('utf-8')

        proxies = {}

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
        else:
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        # NOTE: certificate verification is switched off for this request.
        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=request_headers,
                             timeout=timeout,
                             proxies=proxies,
                             verify=False)

        # If the response did not tell us what encoding format to expect, then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        if not is_binary:
            # Don't run this for PDF (and requests identified as binary) takes a _long_ time
            content_type = r.headers.get('content-type')
            if not content_type or 'charset=' not in content_type:
                encoding = chardet.detect(r.content)['encoding']
                if encoding:
                    r.encoding = encoding

        self.headers = r.headers

        if not r.content:
            raise EmptyReply(url=url, status_code=r.status_code)

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if r.status_code != 200 and not ignore_status_codes:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
        if is_binary:
            # Binary files just return their checksum until we add something smarter
            self.content = hashlib.md5(r.content).hexdigest()
        else:
            self.content = r.text

        self.raw_content = r.content
|