PDF File change detection - Initial PDF fetcher support with basic text extraction (#1244)

2 years ago · 13c4121f52
parent e8e176f3bd
commit 13c4121f52
10 changed files with 143 additions and 15 deletions
--- a/3
+++ b/3
@ -38,6 +38,8 @@ ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
 RUN apt-get update && apt-get install -y --no-install-recommends \
    g++ \
    gcc \
+    # For pdftohtml
+    poppler-utils \
    libc-dev \
    libffi-dev \
    libjpeg-dev \
@ -65,7 +67,6 @@ COPY changedetectionio /app/changedetectionio
 # The eventlet server wrapper
 COPY changedetection.py /app/changedetection.py

-
 WORKDIR /app

 CMD [ "python", "./changedetection.py" , "-d", "/datastore"]
--- a/README.md
+++ b/README.md
@ -43,6 +43,7 @@ Requires Playwright to be enabled.

 - Products and services have a change in pricing
 - _Out of stock notification_ and _Back In stock notification_
+- Monitor and track PDF file changes; know when the text of a PDF file changes.
 - Governmental department updates (changes are often only on their websites)
 - New software releases, security advisories when you're not on their mailing list.
 - Festivals with changes
@ -68,6 +69,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
 - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
 - Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq
 - Switch between fast non-JS and Chrome JS based "fetchers"
+- Track changes in PDF files (monitor text changes in the PDF; also monitor PDF file size and checksums)
 - Easily specify how often a site should be checked
 - Execute JS before extracting text (Good for logging in, see examples in the UI!)
 - Override Request Headers, Specify `POST` or `GET` and other methods
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -1,3 +1,4 @@
+import hashlib
 from abc import abstractmethod
 import chardet
 import json
@ -116,7 +117,8 @@ class Fetcher():
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):
        # Should set self.error, self.status_code and self.content
        pass

@ -267,7 +269,8 @@ class base_html_playwright(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        from playwright.sync_api import sync_playwright
        import playwright._impl._api_types
@ -453,7 +456,8 @@ class base_html_webdriver(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
@ -528,7 +532,8 @@ class html_requests(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        # Make requests use a more modern looking user-agent
        if not 'User-Agent' in request_headers:
@ -558,10 +563,12 @@ class html_requests(Fetcher):
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
-        if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
-            encoding = chardet.detect(r.content)['encoding']
-            if encoding:
-                r.encoding = encoding
+        if not is_binary:
+            # Don't run this for PDFs (or other requests identified as binary); it takes a _long_ time
+            if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
+                encoding = chardet.detect(r.content)['encoding']
+                if encoding:
+                    r.encoding = encoding

        if not r.content or not len(r.content):
            raise EmptyReply(url=url, status_code=r.status_code)
@ -573,8 +580,14 @@ class html_requests(Fetcher):
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
-        self.content = r.text
+        if is_binary:
+            # Binary files just return their checksum until we add something smarter
+            self.content = hashlib.md5(r.content).hexdigest()
+        else:
+            self.content = r.text
+
        self.headers = r.headers
+        self.raw_content = r.content


 # Decide which is the 'real' HTML webdriver, this is more a system wide config
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -16,6 +16,10 @@ class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)

+# Raised by the site check when the external PDF-to-HTML command-line tool
+# (`pdftohtml` by default, overridable via the PDF_TO_HTML_TOOL environment variable)
+# cannot be found on the system PATH.
+class PDFToHTMLToolNotFound(ValueError):
+    def __init__(self, msg):
+        # msg: human-readable description naming the tool that was not found
+        ValueError.__init__(self, msg)
+

 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
@ -87,7 +91,7 @@ class perform_site_check():
            is_source = True

        # Pluggable content fetcher
-        prefer_backend = watch.get('fetch_backend')
+        prefer_backend = watch.get_fetch_backend
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
@ -117,12 +121,18 @@ class perform_site_check():
        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')

-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
+        # Requests for PDFs, images etc. should be passed the is_binary flag
+        is_binary = watch.is_pdf
+
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
        fetcher.quit()

        self.screenshot = fetcher.screenshot
        self.xpath_data = fetcher.xpath_data

+        # Track the content type
+        update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
+
        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
        update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
@ -149,6 +159,31 @@ class perform_site_check():
            is_html = False
            is_json = False

+        if watch.is_pdf or 'application/pdf' in fetcher.headers.get('Content-Type', '').lower():
+            from shutil import which
+            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
+            if not which(tool):
+                raise PDFToHTMLToolNotFound("Command-line `{}` tool was not found in system PATH, was it installed?".format(tool))
+
+            import subprocess
+            proc = subprocess.Popen(
+                [tool, '-stdout', '-', '-s', 'out.pdf', '-i'],
+                stdout=subprocess.PIPE,
+                stdin=subprocess.PIPE)
+            proc.stdin.write(fetcher.raw_content)
+            proc.stdin.close()
+            fetcher.content = proc.stdout.read().decode('utf-8')
+            proc.wait(timeout=60)
+
+            # Add a little metadata so we know if the file changes (like if an image changes, but the text is the same)
+            # @todo may cause problems with non-UTF8?
+            metadata = "<p>Added by changedetection.io: Document checksum - {} Filesize - {} bytes</p>".format(
+                hashlib.md5(fetcher.raw_content).hexdigest().upper(),
+                len(fetcher.content))
+
+            fetcher.content = fetcher.content.replace('</body>', metadata + '</body>')
+
+
        include_filters_rule = deepcopy(watch.get('include_filters', []))
        # include_filters_rule = watch['include_filters']
        subtractive_selectors = watch.get(
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -114,6 +114,24 @@ class model(dict):

        return ready_url

+    # Name of the content fetcher class to use for this watch.
+    # Returns 'html_requests' for PDF watches (see `is_pdf`), otherwise whatever is stored under the `fetch_backend` key.
+    @property
+    def get_fetch_backend(self):
+        """
+        Like just using the `fetch_backend` key but there could be some logic
+        :return:
+        """
+        # Maybe also if is_image etc?
+        # This is because Chrome/Playwright won't render the PDF in the browser, so we just fetch it with plain requests and use pdftohtml to see the text.
+        if self.is_pdf:
+            return 'html_requests'
+
+        return self.get('fetch_backend')
+
+    # True when this watch looks like a PDF document: either the URL contains '.pdf'
+    # or the stored `content_type` contains 'pdf' (both compared case-insensitively).
+    # NOTE: these are substring tests, so '.pdf' anywhere in the URL (not only as the file extension) matches.
+    @property
+    def is_pdf(self):
+        # The content_type field is not known until a later fetch records it; until then only the URL test can match
+        return '.pdf' in self.get('url', '').lower() or 'pdf' in self.get('content_type', '').lower()
+
    @property
    def label(self):
        # Used for sorting
--- a/changedetectionio/static/images/pdf-icon.svg
+++ b/changedetectionio/static/images/pdf-icon.svg
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="75.320129mm" height="92.604164mm" viewBox="0 0 75.320129 92.604164">
+  <g transform="translate(53.548057 -183.975276) scale(1.4843)">
+    <path fill="#ff2116" d="M-29.632812 123.94727c-3.551967 0-6.44336 2.89347-6.44336 6.44531v49.49804c0 3.55185 2.891393 6.44532 6.44336 6.44532H8.2167969c3.5519661 0 6.4433591-2.89335 6.4433591-6.44532v-40.70117s.101353-1.19181-.416015-2.35156c-.484969-1.08711-1.275391-1.84375-1.275391-1.84375a1.0584391 1.0584391 0 0 0-.0059-.008l-9.3906254-9.21094a1.0584391 1.0584391 0 0 0-.015625-.0156s-.8017392-.76344-1.9902344-1.27344c-1.39939552-.6005-2.8417968-.53711-2.8417968-.53711l.021484-.002z" color="#000" font-family="sans-serif" overflow="visible" paint-order="markers fill stroke" style="line-height:normal;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;text-orientation:mixed;white-space:normal;shape-padding:0;isolation:auto;mix-blend-mode:normal;solid-color:#000000;solid-opacity:1"/>
+    <path fill="#f5f5f5" d="M-29.632812 126.06445h28.3789058a1.0584391 1.0584391 0 0 0 .021484 0s1.13480448.011 1.96484378.36719c.79889772.34282 1.36536982.86176 1.36914062.86524.0000125.00001.00391.004.00391.004l9.3671868 9.18945s.564354.59582.837891 1.20899c.220779.49491.234375 1.40039.234375 1.40039a1.0584391 1.0584391 0 0 0-.002.0449v40.74609c0 2.41592-1.910258 4.32813-4.3261717 4.32813H-29.632812c-2.415914 0-4.326172-1.91209-4.326172-4.32813v-49.49804c0-2.41603 1.910258-4.32813 4.326172-4.32813z" color="#000" font-family="sans-serif" overflow="visible" paint-order="markers fill stroke" style="line-height:normal;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;text-orientation:mixed;white-space:normal;shape-padding:0;isolation:auto;mix-blend-mode:normal;solid-color:#000000;solid-opacity:1"/>
+    <path fill="#ff2116" d="M-23.40766 161.09299c-1.45669-1.45669.11934-3.45839 4.39648-5.58397l2.69124-1.33743 1.04845-2.29399c.57665-1.26169 1.43729-3.32036 1.91254-4.5748l.8641-2.28082-.59546-1.68793c-.73217-2.07547-.99326-5.19438-.52872-6.31588.62923-1.51909 2.69029-1.36323 3.50626.26515.63727 1.27176.57212 3.57488-.18329 6.47946l-.6193 2.38125.5455.92604c.30003.50932 1.1764 1.71867 1.9475 2.68743l1.44924 1.80272 1.8033728-.23533c5.72900399-.74758 7.6912472.523 7.6912472 2.34476 0 2.29921-4.4984914 2.48899-8.2760865-.16423-.8499666-.59698-1.4336605-1.19001-1.4336605-1.19001s-2.3665326.48178-3.531704.79583c-1.202707.32417-1.80274.52719-3.564509 1.12186 0 0-.61814.89767-1.02094 1.55026-1.49858 2.4279-3.24833 4.43998-4.49793 5.1723-1.3991.81993-2.86584.87582-3.60433.13733zm2.28605-.81668c.81883-.50607 2.47616-2.46625 3.62341-4.28553l.46449-.73658-2.11497 1.06339c-3.26655 1.64239-4.76093 3.19033-3.98386 4.12664.43653.52598.95874.48237 2.01093-.16792zm21.21809-5.95578c.80089-.56097.68463-1.69142-.22082-2.1472-.70466-.35471-1.2726074-.42759-3.1031574-.40057-1.1249.0767-2.9337647.3034-3.2403347.37237 0 0 .993716.68678 1.434896.93922.58731.33544 2.0145161.95811 3.0565161 1.27706 1.02785.31461 1.6224.28144 2.0729-.0409zm-8.53152-3.54594c-.4847-.50952-1.30889-1.57296-1.83152-2.3632-.68353-.89643-1.02629-1.52887-1.02629-1.52887s-.4996 1.60694-.90948 2.57394l-1.27876 3.16076-.37075.71695s1.971043-.64627 2.97389-.90822c1.0621668-.27744 3.21787-.70134 3.21787-.70134zm-2.74938-11.02573c.12363-1.0375.1761-2.07346-.15724-2.59587-.9246-1.01077-2.04057-.16787-1.85154 2.23517.0636.8084.26443 2.19033.53292 3.04209l.48817 1.54863.34358-1.16638c.18897-.64151.47882-2.02015.64411-3.06364z"/>
+    <path fill="#2c2c2c" d="M-20.930423 167.83862h2.364986q1.133514 0 1.840213.2169.706698.20991 1.189489.9446.482795.72769.482795 1.75625 0 .94459-.391832 1.6233-.391833.67871-1.056548.97958-.65772.30087-2.02913.30087h-.818651v3.72941h-1.581322zm1.581322 1.22447v3.33058h.783664q1.049552 0 1.44838-.39184.405826-.39183.405826-1.27345 0-.65772-.265887-1.06355-.265884-.41282-.587747-.50378-.314866-.098-1.000572-.098zm5.50664-1.22447h2.148082q1.560333 0 2.4909318.55276.9375993.55276 1.4133973 1.6443.482791 1.09153.482791 2.42096 0 1.3994-.4338151 2.49793-.4268149 1.09153-1.3154348 1.76324-.8816233.67172-2.5189212.67172h-2.267031zm1.581326 1.26645v7.018h.657715q1.378411 0 2.001144-.9516.6227329-.95858.6227329-2.5539 0-3.5125-2.6238769-3.5125zm6.4722254-1.26645h5.30372941v1.26645H-4.2075842v2.85478h2.9807225v1.26646h-2.9807225v4.16322h-1.5813254z" font-family="Franklin Gothic Medium Cond" letter-spacing="0" style="line-height:125%;-inkscape-font-specification:'Franklin Gothic Medium Cond'" word-spacing="4.26000023"/>
+  </g>
+</svg>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@ -89,10 +89,10 @@
                </td>
                <td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
                    <a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
-                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" title="Create a link to share watch config with others" /></a>
-
-                    {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
+                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;  vertical-align: middle;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" title="Create a link to share watch config with others" /></a>

+                    {%if watch.get_fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
+                    {%if watch.is_pdf  %}<img style="height: 1.2em;  vertical-align: middle; display:inline-block;" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" title="Converting PDF to text" />{% endif %}
                    {% if watch.last_error is defined and watch.last_error != False %}
                    <div class="fetch-error">{{ watch.last_error }}</div>
                    {% endif %}
--- a/changedetectionio/tests/test.pdf
+++ b/changedetectionio/tests/test.pdf
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@ -0,0 +1,40 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from .util import set_original_response, set_modified_response, live_server_setup
+
+sleep_time_for_fetch_thread = 3
+
+# A PDF URL should be fetched, converted to text, and previewed with the document checksum appended
+def test_fetch_pdf(client, live_server):
+    # End-to-end check: import a watch for the test PDF endpoint, wait for the fetch,
+    # then confirm the preview shows the extracted text plus the appended checksum metadata.
+    import shutil
+    # Stage the sample PDF where the `/endpoint-test.pdf` test route reads it from
+    shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf")
+
+    live_server_setup(live_server)
+    test_url = url_for('test_pdf_endpoint', _external=True)
+    # Add our URL to the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    # Give the background fetch thread time to retrieve and process the PDF
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    # Raw PDF bytes (e.g. the `PDF-1.5` header) must not appear; only the converted text should
+    assert b'PDF-1.5' not in res.data
+    assert b'hello world' in res.data
+
+    # So we know if the file changes in other ways
+    import hashlib
+    # NOTE(review): the file handle opened here is never explicitly closed; a `with` block would be tidier
+    md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
+    # We should have one
+    assert len(md5) >0
+    # And it's going to be in the document
+    assert b'Document checksum - '+bytes(str(md5).encode('utf-8')) in res.data
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@ -168,5 +168,15 @@ def live_server_setup(live_server):
    def test_return_query():
        return request.query_string

+
+    # Test route that serves the staged sample PDF with an `application/pdf` Content-Type
+    @live_server.app.route('/endpoint-test.pdf')
+    def test_pdf_endpoint():
+
+        # Tried using a global var here but didn't seem to work, so reading from a file instead.
+        with open("test-datastore/endpoint-test.pdf", "rb") as f:
+            # Return the file's raw bytes as the response body with HTTP status 200
+            resp = make_response(f.read(), 200)
+            resp.headers['Content-Type'] = 'application/pdf'
+            return resp
+
    live_server.start()