diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 33a38427..ce83ebe0 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -14,7 +14,7 @@ class EmptyReply(Exception): class Fetcher(): error = None status_code = None - content = None # Should be bytes? + content = None # Should always be bytes. fetcher_description ="No description" @@ -129,7 +129,6 @@ class html_webdriver(Fetcher): # driver.quit() seems to cause better exceptions driver.quit() - return True # "html_requests" is listed as the default fetcher in store.py! @@ -146,6 +145,8 @@ class html_requests(Fetcher): timeout=timeout, verify=False) + # https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8 + # Return bytes here html = r.text diff --git a/changedetectionio/store.py b/changedetectionio/store.py index fb7cede5..3cc049c0 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -367,6 +367,10 @@ class ChangeDetectionStore: import uuid output_path = "{}/{}".format(self.datastore_path, watch_uuid) + # In case the operator deleted it, check and create. 
+ if not os.path.isdir(output_path): + mkdir(output_path) + fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4()) with open(fname, 'wb') as f: f.write(contents) diff --git a/changedetectionio/tests/test_notification.py b/changedetectionio/tests/test_notification.py index 3cfeecf9..21083066 100644 --- a/changedetectionio/tests/test_notification.py +++ b/changedetectionio/tests/test_notification.py @@ -159,6 +159,9 @@ def test_check_notification(client, live_server): with open("test-datastore/notification.txt", "r") as f: notification_submission = f.read() + print ("Notification submission was:", notification_submission) + # Re #342 - check for accidental python byte encoding of non-utf8/string + assert "b'" not in notification_submission assert re.search('Watch UUID: [0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12}', notification_submission, re.IGNORECASE) assert "Watch title: my title" in notification_submission diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 4ab1d806..a8992d96 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -2,7 +2,12 @@ import threading import queue import time -# Requests for checking on the site use a pool of thread Workers managed by a Queue. +# A single update worker +# +# Requests for checking on a single site(watch) from a queue of watches +# (another process inserts watches into the queue that are time-ready for checking) + + class update_worker(threading.Thread): current_uuid = None @@ -39,6 +44,13 @@ class update_worker(threading.Thread): now = time.time() changed_detected, update_obj, contents = update_handler.run(uuid) + # Re #342 + # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. 
+ # We then convert/.decode('utf-8') for the notification etc + if not isinstance(contents, (bytes, bytearray)): + raise Exception("Error - returned data from the fetch handler SHOULD be bytes") + + # Always record that we atleast tried self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)}) @@ -111,7 +123,7 @@ class update_worker(threading.Thread): n_object.update({ 'watch_url': watch['url'], 'uuid': uuid, - 'current_snapshot': str(contents), + 'current_snapshot': contents.decode('utf-8'), 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), 'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) })