Be sure that documents returned with a application/json header are not parsed with inscriptis (#337)

* Auto-detect JSON by Content-Type header
* Add test to not parse JSON responses with inscriptis
pull/350/head
Unpublished 3 years ago committed by GitHub
parent fbd9ecab62
commit 023951a10e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -15,6 +15,7 @@ class Fetcher():
error = None
status_code = None
content = None # Should always be bytes.
headers = None
fetcher_description ="No description"
@ -113,6 +114,7 @@ class html_webdriver(Fetcher):
# @todo - dom wait loaded?
time.sleep(5)
self.content = driver.page_source
self.headers = {}
driver.quit()
@ -156,4 +158,5 @@ class html_requests(Fetcher):
self.status_code = r.status_code
self.content = html
self.headers = r.headers

@ -103,9 +103,16 @@ class perform_site_check():
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_html = True
is_json = fetcher.headers.get('Content-Type', '') == 'application/json'
is_html = not is_json
css_filter_rule = watch['css_filter']
if css_filter_rule and len(css_filter_rule.strip()):
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
if is_json and not has_filter_rule:
css_filter_rule = "json:$"
has_filter_rule = True
if has_filter_rule:
if 'json:' in css_filter_rule:
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
is_html = False
@ -116,7 +123,7 @@ class perform_site_check():
if is_html:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = fetcher.content
if css_filter_rule and len(css_filter_rule.strip()):
if has_filter_rule:
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
# get_text() via inscriptis

@ -111,6 +111,21 @@ def set_original_response():
f.write(test_return_data)
return None
def set_response_with_html():
test_return_data = """
{
"test": [
{
"html": "<b>"
}
]
}
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
return None
def set_modified_response():
test_return_data = """
{
@ -138,6 +153,37 @@ def set_modified_response():
return None
def test_check_json_without_filter(client, live_server):
# Request a JSON document from a application/json source containing HTML
# and be sure it doesn't get chewed up by instriptis
set_response_with_html()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint_json', _external=True)
client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(3)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'&#34;&lt;b&gt;' in res.data
assert res.data.count(b'{\n') >= 2
def test_check_json_filter(client, live_server):
json_filter = 'json:boss.name'

@ -44,6 +44,16 @@ def live_server_setup(live_server):
with open("test-datastore/endpoint-content.txt", "r") as f:
return f.read()
@live_server.app.route('/test-endpoint-json')
def test_endpoint_json():
from flask import make_response
with open("test-datastore/endpoint-content.txt", "r") as f:
resp = make_response(f.read())
resp.headers['Content-Type'] = 'application/json'
return resp
# Just return the headers in the request
@live_server.app.route('/test-headers')
def test_headers():

Loading…
Cancel
Save