|
|
@ -1,10 +1,12 @@
|
|
|
|
import os
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
|
|
|
import chardet
|
|
|
|
|
|
|
|
import os
|
|
|
|
from selenium import webdriver
|
|
|
|
from selenium import webdriver
|
|
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
|
|
|
|
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
|
|
|
|
from selenium.common.exceptions import WebDriverException
|
|
|
|
from selenium.common.exceptions import WebDriverException
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import time
|
|
|
|
import urllib3.exceptions
|
|
|
|
import urllib3.exceptions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -20,7 +22,7 @@ class EmptyReply(Exception):
|
|
|
|
class Fetcher():
|
|
|
|
class Fetcher():
|
|
|
|
error = None
|
|
|
|
error = None
|
|
|
|
status_code = None
|
|
|
|
status_code = None
|
|
|
|
content = None # Should always be bytes.
|
|
|
|
content = None
|
|
|
|
headers = None
|
|
|
|
headers = None
|
|
|
|
|
|
|
|
|
|
|
|
fetcher_description ="No description"
|
|
|
|
fetcher_description ="No description"
|
|
|
@ -146,7 +148,6 @@ class html_requests(Fetcher):
|
|
|
|
fetcher_description = "Basic fast Plaintext/HTTP Client"
|
|
|
|
fetcher_description = "Basic fast Plaintext/HTTP Client"
|
|
|
|
|
|
|
|
|
|
|
|
def run(self, url, timeout, request_headers, request_body, request_method):
|
|
|
|
def run(self, url, timeout, request_headers, request_body, request_method):
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r = requests.request(method=request_method,
|
|
|
|
r = requests.request(method=request_method,
|
|
|
|
data=request_body,
|
|
|
|
data=request_body,
|
|
|
@ -155,16 +156,21 @@ class html_requests(Fetcher):
|
|
|
|
timeout=timeout,
|
|
|
|
timeout=timeout,
|
|
|
|
verify=False)
|
|
|
|
verify=False)
|
|
|
|
|
|
|
|
|
|
|
|
# https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8
|
|
|
|
# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
|
|
|
|
# Return bytes here
|
|
|
|
# For example - some sites don't tell us it's utf-8, but return utf-8 content
|
|
|
|
html = r.text
|
|
|
|
# This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
|
|
|
|
|
|
|
|
# https://github.com/psf/requests/issues/1604 good info about requests encoding detection
|
|
|
|
|
|
|
|
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
|
|
|
|
|
|
|
|
encoding = chardet.detect(r.content)['encoding']
|
|
|
|
|
|
|
|
if encoding:
|
|
|
|
|
|
|
|
r.encoding = encoding
|
|
|
|
|
|
|
|
|
|
|
|
# @todo test this
|
|
|
|
# @todo test this
|
|
|
|
# @todo maybe you really want to test zero-byte return pages?
|
|
|
|
# @todo maybe you really want to test zero-byte return pages?
|
|
|
|
if not r or not html or not len(html):
|
|
|
|
if not r or not r.content or not len(r.content):
|
|
|
|
raise EmptyReply(url=url, status_code=r.status_code)
|
|
|
|
raise EmptyReply(url=url, status_code=r.status_code)
|
|
|
|
|
|
|
|
|
|
|
|
self.status_code = r.status_code
|
|
|
|
self.status_code = r.status_code
|
|
|
|
self.content = html
|
|
|
|
self.content = r.text
|
|
|
|
self.headers = r.headers
|
|
|
|
self.headers = r.headers
|
|
|
|
|
|
|
|
|
|
|
|