You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
91 lines
2.8 KiB
91 lines
2.8 KiB
4 years ago
|
import json
|
||
4 years ago
|
from bs4 import BeautifulSoup
|
||
4 years ago
|
from jsonpath_ng import parse
|
||
4 years ago
|
|
||
|
|
||
4 years ago
|
class JSONNotFound(ValueError):
    """Raised when no JSON matching the requested rule can be located."""

    def __init__(self, msg):
        # Hand the message to ValueError so str(exc) and exc.args carry it.
        super().__init__(msg)
|
||
|
|
||
4 years ago
|
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
||
|
def css_filter(css_filter, html_content):
    """Return the markup of every element in ``html_content`` matching a CSS rule.

    Each matching element is converted with ``str()`` in the order ``select``
    yields it, the pieces are concatenated with nothing in between, and a
    single newline is appended (so no match at all yields just a newline).
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    matched_elements = parsed.select(css_filter, separator="")
    return "".join(str(element) for element in matched_elements) + "\n"
|
||
|
|
||
|
|
||
|
# Extract/find element
|
||
|
def extract_element(find='title', html_content=''):
    """Return the stripped string of the first ``find`` element, or ``None``.

    ``None`` is returned when the element is not present or when its
    ``.string`` is empty/unset.
    """
    node = BeautifulSoup(html_content, 'html.parser').find(find)

    # Re #106, be sure to handle when its not found
    if not node or not node.string:
        return None

    return node.string.strip()
|
||
4 years ago
|
|
||
4 years ago
|
#
|
||
|
def _parse_json(json_data, jsonpath_filter):
    """Run a JSONPath rule over decoded JSON and return the result as JSON text.

    Args:
        json_data: Already-decoded JSON data to search.
        jsonpath_filter: JSONPath rule; any ``json:`` marker in it is removed
            before the rule is parsed.

    Returns:
        ``json.dumps(..., indent=4)`` of the matched value when the rule
        matches exactly once, or of the list of all matched values when it
        matches several times.

    Raises:
        JSONNotFound: If the rule matches nothing in ``json_data``.
    """
    rule = jsonpath_filter.replace('json:', '')
    match = parse(rule).find(json_data)

    # Decide on the matches themselves, not on the matched value: a genuine
    # hit such as 0, false, "" or [] is falsy but is still a result.
    if not match:
        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(rule))

    if len(match) == 1:
        # Single value, use just the value, as it could be later used in a token in notifications.
        s = match[0].value
    else:
        # More than one result, we will return it as a JSON list.
        s = [i.value for i in match]

    return json.dumps(s, indent=4)
|
||
|
|
||
|
def extract_json_as_string(content, jsonpath_filter):
    """Apply a JSONPath rule to a document and return the match as JSON text.

    ``content`` is first decoded as a JSON document. If that fails it is
    parsed as HTML and each ``<script>`` element whose text decodes as JSON is
    tried in document order; the first one that yields a match wins.

    Args:
        content: Document text, either JSON or HTML with embedded JSON.
        jsonpath_filter: JSONPath rule, handed unchanged to ``_parse_json``.

    Returns:
        The string returned by ``_parse_json`` for the first matching JSON.

    Raises:
        JSONNotFound: If the HTML has no ``<script>`` elements at all, or if
            nothing matched the rule.
    """
    stripped_text_from_html = False

    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
    try:
        stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)
    except json.JSONDecodeError:

        # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
        soup = BeautifulSoup(content, 'html.parser')
        bs_result = soup.find_all('script')

        if not bs_result:
            raise JSONNotFound("No parsable JSON found in this document")

        for result in bs_result:
            # Skip empty tags, and things that dont even look like JSON
            if not result.string or '{' not in result.string:
                continue

            try:
                json_data = json.loads(result.string)
            except json.JSONDecodeError:
                # Just skip it
                continue

            try:
                stripped_text_from_html = _parse_json(json_data, jsonpath_filter)
            except JSONNotFound:
                # Valid JSON, but the rule does not match this blob: keep
                # scanning instead of aborting on the first non-matching one.
                continue

            if stripped_text_from_html:
                break

    if not stripped_text_from_html:
        raise JSONNotFound("No JSON matching the rule '%s' found" % jsonpath_filter.replace('json:',''))

    return stripped_text_from_html
|