changedetection.io/changedetectionio/html_tools.py

import json
from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse
import re

class JSONNotFound(ValueError):
    """Raised when a document holds no JSON that could be parsed."""

    def __init__(self, msg):
        super().__init__(msg)

def css_filter(css_filter, html_content):
    """Return the HTML of every element matched by a CSS selector.

    The matched elements are serialised with str() and concatenated in the
    order select() yields them, with nothing between them. A single newline
    is always appended, so a selector that matches nothing yields just a
    newline.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    matched = [str(element) for element in soup.select(css_filter, separator="")]
    return "".join(matched) + "\n"


def xpath_filter(xpath_filter, html_content):
    """Return the results of an XPath query as a single str.

    The query is evaluated with the EXSLT regular-expression functions
    available under the ``re:`` prefix. Every result is followed by
    "<br/>". Element results are pretty-printed as markup; any other
    result (text nodes, attribute values, numbers, booleans) is emitted as
    its plain string form.
    """
    from lxml import etree
    from lxml import html

    tree = html.fromstring(html_content)
    results = tree.xpath(
        xpath_filter.strip(),
        namespaces={'re': 'http://exslt.org/regular-expressions'},
    )

    # Expressions such as string(...) or count(...) evaluate to one bare
    # value instead of a list; wrap it so it is emitted as one result rather
    # than iterated character by character (str) or raising (float/bool).
    if not isinstance(results, list):
        results = [results]

    parts = []
    for item in results:
        if etree.iselement(item):
            parts.append(etree.tostring(item, pretty_print=True).decode('utf-8'))
        else:
            # text() / @attribute results are strings, which
            # etree.tostring() refuses to serialise (TypeError).
            parts.append(str(item))
        parts.append("<br/>")

    return "".join(parts)


def extract_element(find='title', html_content=''):
    """Return the stripped text of the first ``find`` element, or None.

    None is returned when the element is not found, or when its ``.string``
    is empty or None (Re #106).
    """
    match = BeautifulSoup(html_content, 'html.parser').find(find)
    if not match or not match.string:
        return None

    return match.string.strip()

def _parse_json(json_data, jsonpath_filter):
    """Apply a JSONPath filter to parsed JSON and return the result as text.

    Every 'json:' occurrence is removed from ``jsonpath_filter`` before it is
    parsed as a JSONPath expression.

    Returns '' when the expression matches nothing (Re #265). Otherwise the
    result is dumped as JSON with an indent of 4: a single match is dumped as
    the bare value (so it can later be used as a token in notifications),
    several matches are dumped as a JSON list of their values.
    """
    expression = parse(jsonpath_filter.replace('json:', ''))
    found = expression.find(json_data)

    # Re #257 - decide on the list of matches, not on the matched value,
    # because a legitimately matched value may itself be falsy (e.g. False).
    if not found:
        return ''

    if len(found) == 1:
        value = found[0].value
    else:
        value = [hit.value for hit in found]

    return json.dumps(value, indent=4)

def extract_json_as_string(content, jsonpath_filter):
    """Filter JSON, or JSON embedded in <script> tags, with a JSONPath rule.

    ``content`` is first parsed as a JSON document. If that fails with a
    JSON decode error it is parsed as HTML instead and each <script> tag is
    tried in turn; the first one whose JSON yields a non-empty filter result
    wins.

    Returns the filtered JSON text, or '' when nothing matched (Re #265).

    Raises:
        JSONNotFound: ``content`` is not JSON and has no <script> tags.
    """
    filtered = False

    try:
        filtered = _parse_json(json.loads(content), jsonpath_filter)
    except json.JSONDecodeError:
        # Not a plain JSON document - maybe it's embedded <script type=ldjson>
        script_tags = BeautifulSoup(content, 'html.parser').findAll('script')

        if not script_tags:
            raise JSONNotFound("No parsable JSON found in this document")

        for tag in script_tags:
            text = tag.string

            # Skip empty tags, and things that dont even look like JSON
            if not text or '{' not in text:
                continue

            try:
                embedded = json.loads(text)
            except json.JSONDecodeError:
                # Not JSON after all, try the next tag
                continue

            filtered = _parse_json(embedded, jsonpath_filter)
            if filtered:
                break

    # Re 265 - Just return an empty string when filter not found
    return filtered or ''

def strip_ignore_text(content, wordlist, mode="content"):
    """Remove, or report, the lines of ``content`` that match an ignore rule.

    Args:
        content: Text (str) to filter; it is split with str.splitlines().
        wordlist: Rules (str). A rule starting with '/' is a regular
            expression (surrounding spaces and slashes are stripped); any
            other rule is a plain substring. Both kinds are matched
            case-insensitively. Empty rules and regexes that fail to compile
            are skipped.
        mode: "content" (default) returns the text without the matching
            lines; "line numbers" returns the numbers of the matching lines.

    Returns:
        In "content" mode, bytes: the UTF-8 encoded non-matching lines joined
        with newlines. In "line numbers" mode, a list of 1-based int line
        numbers, used for finding out what to highlight. Blank lines are
        always dropped from the content and never reported as matches.
    """
    plain_rules = []
    regex_rules = []

    for rule in wordlist:
        if not rule:
            # An empty rule has no rule[0] to inspect (IndexError) and
            # nothing meaningful to match, so skip it.
            continue

        # Is it a regex?
        if rule[0] == '/':
            try:
                # Compile once here instead of re-parsing it for every line.
                regex_rules.append(re.compile(rule.strip(" /"), re.IGNORECASE))
            except (re.error, OverflowError, RecursionError):
                # Best effort: a regex that cannot be compiled never matches.
                continue
        else:
            # Lowercase once; lines are lowercased before comparison.
            plain_rules.append(rule.lower())

    output = []
    ignored_line_numbers = []
    for line_number, line in enumerate(content.splitlines(), start=1):
        # Always ignore blank lines (they still count towards numbering).
        if not line.strip():
            continue

        lowered = line.lower()
        ignored = any(rx.search(line) for rx in regex_rules) or any(
            text in lowered for text in plain_rules
        )

        if ignored:
            ignored_line_numbers.append(line_number)
        else:
            output.append(line.encode('utf8'))

    # Used for finding out what to highlight
    if mode == "line numbers":
        return ignored_line_numbers

    return b"\n".join(output)
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`import json`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`from bs4 import BeautifulSoup`
Re #265 - extended jsonpath support (#266) * Re #265 - Use extended JSONpath support, Allow a JSONPath selector to not match anything (yet) Adding test Correctly capture invalid JSONPath query error 3 years ago			`from jsonpath_ng.ext import parse`
Ability to visualise trigger and filter rules against the current snapshot on the preview page 3 years ago			`import re`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`class JSONNotFound(ValueError):`
			`def __init__(self, msg):`
			`ValueError.__init__(self, msg)`

Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches`
			`def css_filter(css_filter, html_content):`
			`soup = BeautifulSoup(html_content, "html.parser")`
			`html_block = ""`
			`for item in soup.select(css_filter, separator=""):`
			`html_block += str(item)`

			`return html_block + "\n"`

XPath support (#355) * XPath support and minor improvements to form validation 3 years ago
			`# Return str Utf-8 of matched rules`
			`def xpath_filter(xpath_filter, html_content):`
			`from lxml import html`
			`from lxml import etree`

			`tree = html.fromstring(html_content)`
			`html_block = ""`

XPath RegularExpression support 3 years ago			`for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):`
XPath support (#355) * XPath support and minor improvements to form validation 3 years ago			`html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"`

			`return html_block`


Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`# Extract/find element`
			`def extract_element(find='title', html_content=''):`
Re #106 - handling empty title with gettr cleanup (#107) 4 years ago
			`#Re #106, be sure to handle when its not found`
			`element_text = None`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
			`soup = BeautifulSoup(html_content, 'html.parser')`
Re #106 - handling empty title with gettr cleanup (#107) 4 years ago			`result = soup.find(find)`
			`if result and result.string:`
			`element_text = result.string.strip()`

			`return element_text`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`#`
			`def _parse_json(json_data, jsonpath_filter):`
			`s=[]`
			`jsonpath_expression = parse(jsonpath_filter.replace('json:', ''))`
			`match = jsonpath_expression.find(json_data)`

			`# More than one result, we will return it as a JSON list.`
			`if len(match) > 1:`
			`for i in match:`
			`s.append(i.value)`

			`# Single value, use just the value, as it could be later used in a token in notifications.`
			`if len(match) == 1:`
			`s = match[0].value`

Re #257 - Handle bool val of json path better (#263) * Re #257 - Handle bool val of json path better, with test 3 years ago			`# Re #257 - Better handling where it does not exist, in the case the original 's' value was False..`
			`if not match:`
Re #265 - extended jsonpath support (#266) * Re #265 - Use extended JSONpath support, Allow a JSONPath selector to not match anything (yet) Adding test Correctly capture invalid JSONPath query error 3 years ago			`# Re 265 - Just return an empty string when filter not found`
			`return ''`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
			`stripped_text_from_html = json.dumps(s, indent=4)`

			`return stripped_text_from_html`

			`def extract_json_as_string(content, jsonpath_filter):`

			`stripped_text_from_html = False`

			`# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>`
			`try:`
			`stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)`
			`except json.JSONDecodeError:`

			`# Foreach <script json></script> blob.. just return the first that matches jsonpath_filter`
			`s = []`
			`soup = BeautifulSoup(content, 'html.parser')`
			`bs_result = soup.findAll('script')`

			`if not bs_result:`
			`raise JSONNotFound("No parsable JSON found in this document")`

			`for result in bs_result:`
Re #154 - Handle missing JSON better 3 years ago			`# Skip empty tags, and things that dont even look like JSON`
			`if not result.string or not '{' in result.string:`
			`continue`

Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`try:`
			`json_data = json.loads(result.string)`
			`except json.JSONDecodeError:`
			`# Just skip it`
			`continue`
			`else:`
			`stripped_text_from_html = _parse_json(json_data, jsonpath_filter)`
			`if stripped_text_from_html:`
			`break`

Re #154 - Handle missing JSON better 3 years ago			`if not stripped_text_from_html:`
Re #265 - extended jsonpath support (#266) * Re #265 - Use extended JSONpath support, Allow a JSONPath selector to not match anything (yet) Adding test Correctly capture invalid JSONPath query error 3 years ago			`# Re 265 - Just return an empty string when filter not found`
			`return ''`
Re #154 - Handle missing JSON better 3 years ago
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`return stripped_text_from_html`
Ability to visualise trigger and filter rules against the current snapshot on the preview page 3 years ago
			`# Mode - "content" return the content without the matches (default)`
			`# - "line numbers" return a list of line numbers that match (int list)`
			`#`
			`# wordlist - list of regex's (str) or words (str)`
			`def strip_ignore_text(content, wordlist, mode="content"):`
			`ignore = []`
			`ignore_regex = []`

			`# @todo check this runs case insensitive`
			`for k in wordlist:`

			`# Is it a regex?`
			`if k[0] == '/':`
			`ignore_regex.append(k.strip(" /"))`
			`else:`
			`ignore.append(k)`

			`i = 0`
			`output = []`
			`ignored_line_numbers = []`
			`for line in content.splitlines():`
			`i += 1`
			`# Always ignore blank lines in this mode. (when this function gets called)`
			`if len(line.strip()):`
			`regex_matches = False`

			`# if any of these match, skip`
			`for regex in ignore_regex:`
			`try:`
			`if re.search(regex, line, re.IGNORECASE):`
			`regex_matches = True`
			`except Exception as e:`
			`continue`

Ensure string matching on the ignore filter is always case-INsensitive 3 years ago			`if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):`
Ability to visualise trigger and filter rules against the current snapshot on the preview page 3 years ago			`output.append(line.encode('utf8'))`
			`else:`
			`ignored_line_numbers.append(i)`



			`# Used for finding out what to highlight`
			`if mode == "line numbers":`
			`return ignored_line_numbers`

			`return "\n".encode('utf8').join(output)`