From 023951a10e7f783ecee9b3daf0260ad940b7586c Mon Sep 17 00:00:00 2001 From: Unpublished Date: Sun, 2 Jan 2022 22:35:33 +0100 Subject: [PATCH] Be sure that documents returned with an application/json header are not parsed with inscriptis (#337) * Auto-detect JSON by Content-Type header * Add test to not parse JSON responses with inscriptis --- changedetectionio/content_fetcher.py | 3 ++ changedetectionio/fetch_site_status.py | 13 ++++-- .../tests/test_jsonpath_selector.py | 46 +++++++++++++++++++ changedetectionio/tests/util.py | 10 ++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index ce83ebe0..d82775b9 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -15,6 +15,7 @@ class Fetcher(): error = None status_code = None content = None # Should always be bytes. + headers = None fetcher_description ="No description" @@ -113,6 +114,7 @@ class html_webdriver(Fetcher): # @todo - dom wait loaded? time.sleep(5) self.content = driver.page_source + self.headers = {} driver.quit() @@ -156,4 +158,5 @@ class html_requests(Fetcher): self.status_code = r.status_code self.content = html + self.headers = r.headers diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 82108306..98c0be1d 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -103,9 +103,16 @@ class perform_site_check(): # https://stackoverflow.com/questions/41817578/basic-method-chaining ? # return content().textfilter().jsonextract().checksumcompare() ? 
- is_html = True + is_json = fetcher.headers.get('Content-Type', '') == 'application/json' + is_html = not is_json css_filter_rule = watch['css_filter'] - if css_filter_rule and len(css_filter_rule.strip()): + + has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) + if is_json and not has_filter_rule: + css_filter_rule = "json:$" + has_filter_rule = True + + if has_filter_rule: if 'json:' in css_filter_rule: stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) is_html = False @@ -116,7 +123,7 @@ class perform_site_check(): if is_html: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = fetcher.content - if css_filter_rule and len(css_filter_rule.strip()): + if has_filter_rule: html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) # get_text() via inscriptis diff --git a/changedetectionio/tests/test_jsonpath_selector.py b/changedetectionio/tests/test_jsonpath_selector.py index 39529642..5a4b7959 100644 --- a/changedetectionio/tests/test_jsonpath_selector.py +++ b/changedetectionio/tests/test_jsonpath_selector.py @@ -111,6 +111,21 @@ def set_original_response(): f.write(test_return_data) return None + +def set_response_with_html(): + test_return_data = """ + { + "test": [ + { + "html": "" + } + ] + } + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + def set_modified_response(): test_return_data = """ { @@ -138,6 +153,37 @@ def set_modified_response(): return None +def test_check_json_without_filter(client, live_server): + # Request a JSON document from an application/json source containing HTML + # and be sure it doesn't get chewed up by inscriptis + set_response_with_html() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint_json', _external=True) + 
client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(3) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + assert b'"<b>' in res.data + assert res.data.count(b'{\n') >= 2 + + def test_check_json_filter(client, live_server): json_filter = 'json:boss.name' diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 80eb9820..2e30be25 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -44,6 +44,16 @@ def live_server_setup(live_server): with open("test-datastore/endpoint-content.txt", "r") as f: return f.read() + @live_server.app.route('/test-endpoint-json') + def test_endpoint_json(): + + from flask import make_response + + with open("test-datastore/endpoint-content.txt", "r") as f: + resp = make_response(f.read()) + resp.headers['Content-Type'] = 'application/json' + return resp + # Just return the headers in the request @live_server.app.route('/test-headers') def test_headers():