From 12aa77ee35428875222e2e2332094c764f8056ad Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 30 Apr 2022 17:32:56 +0200 Subject: [PATCH] just re-use the existing page fetch --- changedetectionio/__init__.py | 152 --- changedetectionio/content_fetcher.py | 111 ++ changedetectionio/fetch_site_status.py | 3 +- changedetectionio/store.py | 8 + changedetectionio/templates/edit.html | 3 +- changedetectionio/update_worker.py | 6 +- f | 1532 ++++++++++++++++++++++++ 7 files changed, 1659 insertions(+), 156 deletions(-) create mode 100644 f diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1525677e..07c8a819 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -1101,158 +1101,6 @@ def changedetection_app(config=None, datastore_o=None): flash("{} watches are queued for rechecking.".format(i)) return redirect(url_for('index', tag=tag)) - @app.route("/api/request-visual-selector-data/", methods=['GET']) - @login_required - def visualselector_request_current_screenshot_and_metadata(uuid): - import json - - watch = deepcopy(datastore.data['watching'][uuid]) - - path_to_datafile = os.path.join(datastore_o.datastore_path, uuid, "elements.json") - try: - os.unlink(path_to_datafile) - except FileNotFoundError: - pass - - # docker run -p 3000:3000 browserless/chrome - # @todo this needs abstracting out? 
- from playwright.sync_api import sync_playwright - with sync_playwright() as p: - browser = p.chromium.connect_over_cdp("ws://127.0.0.1:3000") - page = browser.new_page() - - - # @todo handle timeouts for long pages >30sec - try: - page.goto(watch['url']) - except Exception as e: - pass - - #time.sleep(3) - # https://github.com/microsoft/playwright/issues/620 - # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it - screenshot = page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}) - screenshot = page.screenshot(type='jpeg', full_page=True, quality=90) - - # Could be made a lot faster - # https://toruskit.com/blog/how-to-get-element-bounds-without-reflow/ - - # lazy quoting, probably going to be bad later. - css_filter = watch['css_filter'].replace('"', '\\"') - css_filter = css_filter.replace('\'', '\\\'') - - page.evaluate("var css_filter='{}';".format(css_filter)) - - info = page.evaluate("""async () => { - // Include the getXpath script directly, easier than fetching - !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}}); - //# sourceMappingURL=index.umd.js.map - - -const findUpTag = (el) => { - let r = el - chained_css = []; - - while (r.parentNode) { - - if(r.classList.length >0) { - // limit to just using 2 class names of each, stops from getting really huge selector strings - 
current_css='.'+Array.from(r.classList).slice(0, 2).join('.'); - chained_css.unshift(current_css); - - var f=chained_css.join(' '); - var q=document.querySelectorAll(f); - if(q.length==1) return current_css; - if(f.length >120) return null; - } - r = r.parentNode; - } - return null; -} - - - var elements = document.getElementsByTagName("*"); - var size_pos=[]; - // after page fetch, inject this JS - // build a map of all elements and their positions (maybe that only include text?) - var bbox; - for (var i = 0; i < elements.length; i++) { - bbox = elements[i].getBoundingClientRect(); - - // forget reallysmall ones - if (bbox['width'] <10 && bbox['height'] <10 ) { - continue; - } - - // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes - // it should not traverse when we know we can anchor off just an ID one level up etc.. - // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match - - // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. - xpath_result=false; - try { - var d= findUpTag(elements[i]); - if (d) { - xpath_result =d; - } - } catch (e) { - var x=1; - } - - // default back to the less intelligent one - if (!xpath_result) { - xpath_result = getXPath(elements[i]); - } - - size_pos.push({ - xpath: xpath_result, - width: bbox['width'], - height: bbox['height'], - left: bbox['left'], - top: bbox['top'], - childCount: elements[i].childElementCount - }); - } - - - // inject the current one set in the css_filter, which may be a CSS rule - // used for displaying the current one in VisualSelector, where its not one we generated. - if (css_filter.length) { - // is it xpath? 
- if (css_filter.startsWith('/') ) { - q=document.evaluate(css_filter, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; - } else { - q=document.querySelector(css_filter); - } - if (q) { - bbox = q.getBoundingClientRect(); - size_pos.push({ - xpath: css_filter, - width: bbox['width'], - height: bbox['height'], - left: bbox['left'], - top: bbox['top'], - childCount: q.childElementCount - }); - } - } - - return size_pos; -}""") - - browser.close() - - with open(path_to_datafile ,'w') as f: - f.write(json.dumps(info, indent=4)) - - - response = make_response(screenshot) - response.headers['Content-type'] = 'image/jpeg' - response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate' - response.headers['Pragma'] = 'no-cache' - response.headers['Expires'] = 0 - return response - # @todo handle ctrl break ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 1f40911e..36b96960 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -7,6 +7,7 @@ from selenium.webdriver.common.proxy import Proxy as SeleniumProxy from selenium.common.exceptions import WebDriverException import requests import time +import json import urllib3.exceptions @@ -26,6 +27,102 @@ class Fetcher(): headers = None fetcher_description ="No description" + xpath_element_js=""" + // Include the getXpath script directly, easier than fetching + !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var 
i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}}); + //# sourceMappingURL=index.umd.js.map + + + const findUpTag = (el) => { + let r = el + chained_css = []; + + while (r.parentNode) { + + if(r.classList.length >0) { + // limit to just using 2 class names of each, stops from getting really huge selector strings + current_css='.'+Array.from(r.classList).slice(0, 2).join('.'); + chained_css.unshift(current_css); + + var f=chained_css.join(' '); + var q=document.querySelectorAll(f); + if(q.length==1) return current_css; + if(f.length >120) return null; + } + r = r.parentNode; + } + return null; + } + + + var elements = document.getElementsByTagName("*"); + var size_pos=[]; + // after page fetch, inject this JS + // build a map of all elements and their positions (maybe that only include text?) + var bbox; + for (var i = 0; i < elements.length; i++) { + bbox = elements[i].getBoundingClientRect(); + + // forget reallysmall ones + if (bbox['width'] <10 && bbox['height'] <10 ) { + continue; + } + + // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes + // it should not traverse when we know we can anchor off just an ID one level up etc.. + // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match + + // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. 
+ xpath_result=false; + try { + var d= findUpTag(elements[i]); + if (d) { + xpath_result =d; + } + } catch (e) { + var x=1; + } + + // default back to the less intelligent one + if (!xpath_result) { + xpath_result = getXPath(elements[i]); + } + + size_pos.push({ + xpath: xpath_result, + width: bbox['width'], + height: bbox['height'], + left: bbox['left'], + top: bbox['top'], + childCount: elements[i].childElementCount + }); + } + + + // inject the current one set in the css_filter, which may be a CSS rule + // used for displaying the current one in VisualSelector, where its not one we generated. + if (css_filter.length) { + // is it xpath? + if (css_filter.startsWith('/') ) { + q=document.evaluate(css_filter, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + } else { + q=document.querySelector(css_filter); + } + if (q) { + bbox = q.getBoundingClientRect(); + size_pos.push({ + xpath: css_filter, + width: bbox['width'], + height: bbox['height'], + left: bbox['left'], + top: bbox['top'], + childCount: q.childElementCount + }); + } + } + + return size_pos; + """ @abstractmethod def get_error(self): @@ -59,6 +156,11 @@ class Fetcher(): def is_ready(self): return True + @abstractmethod + def get_xpath_data(self, current_css_xpath_filter): + return None + + # Maybe for the future, each fetcher provides its own diff output, could be used for text, image # the current one would return javascript output (as we use JS to generate the diff) # @@ -163,6 +265,15 @@ class html_webdriver(Fetcher): self.quit() return True + def get_xpath_data(self, current_css_xpath_filter): + + # lazy quoting, probably going to be bad later. 
+ css_filter = current_css_xpath_filter.replace('"', '\\"') + css_filter = css_filter.replace('\'', '\\\'') + info = self.driver.execute_script("var css_filter='{}';".format(css_filter)+self.xpath_element_js) + return info + + def quit(self): if self.driver: try: diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 55bf49dc..e3ec410f 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -195,6 +195,7 @@ class perform_site_check(): if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True): screenshot = fetcher.screenshot() + xpath_elements = fetcher.get_xpath_data(watch['css_filter']) fetcher.quit() - return changed_detected, update_obj, text_content_before_ignored_filter, screenshot \ No newline at end of file + return changed_detected, update_obj, text_content_before_ignored_filter, screenshot, xpath_elements \ No newline at end of file diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 351f2b4c..8ca3fde2 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -357,6 +357,14 @@ class ChangeDetectionStore: f.write(screenshot) f.close() + def save_xpath_data(self, watch_uuid, data): + output_path = "{}/{}".format(self.datastore_path, watch_uuid) + fname = "{}/elements.json".format(output_path) + with open(fname, 'w') as f: + f.write(json.dumps(data)) + f.close() + + def sync_to_json(self): logging.info("Saving JSON..") print("Saving JSON..") diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index f16faef5..ce633bfa 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -189,7 +189,7 @@ nav + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + No parsable JSON found in this document · Issue #572 · dgtlmoon/changedetection.io · GitHub + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content + + + + + + + + + + +
+ +
+ + + + + + + +
+ + + +
+ + + + + + + + + + +
+
+
+ + + + + + + + + + + + + + +
+ + + + + + +
+ + + +
+ + + +
+ +
+
+
+ + + + + +
+ + + New issue + + +
+
+ +
+ +
+

+ Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community. +

+ + +

By clicking “Sign up for GitHub”, you agree to our terms of service and + privacy statement. We’ll occasionally send you account related emails.

+ +

+ Already on GitHub? + Sign in + to your account +

+
+ +
+
+
+ +
+ +

+ No parsable JSON found in this document + #572 +

+
+
+ +
+
+ + Open + +
+ +
+
+ +
+ +
+ +
+ +
+ +
+ maglevize opened this issue +Apr 30, 2022 +· 0 comments + + + + + +
+
+ + + +
+
+
+
+
+ + Open + +
+ +
+
+ +
+ +
+ +
+ +
+ +
+

+ No parsable JSON found in this document + #572 +

+ +
+ maglevize opened this issue +Apr 30, 2022 +· 0 comments + + + + + +
+
+
+
+
+
+
+
+ + + + +
+
+

Comments

+
+ +
+
+ +
+ @maglevize + +
+ +
+
+ + +
+
+ + + + + +
+ + + + + + + + Copy link + + +
+ +
+ +
+ + + + + +
+ +

+ + + @maglevize + + + + maglevize + + + + + + commented + + + Apr 30, 2022 + + + + +

+
+ + +
+ + + + + + + + + + + + +
+

Describe the bug
+I'm using the filter json:$..File_Path, but it shows "No parsable JSON found in this document".

+

Version
+v0.39.12

+

To Reproduce

+
    +
  1. I'm using a WebDriver instance on another server, with port 4444 open.
  2. Set the filter to json:$..File_Path
  3. Fetch this URL: https://www.idx.co.id/umbraco/Surface/ListedCompany/GetFinancialReport?indexFrom=0&pageSize=10&year=2022&reportType=rdf&periode=tw1&kodeEmiten=BIRD;ICBP;BBCA;ISSP;SIDO;LPPF
  4. Click the recheck button.
     With no filter, it produces the result correctly, without a warning.
+

Expected behavior
+It should return the same result as the test tool at jsonpath.com.

+

Screenshots
+image

+

Desktop (please complete the following information):

+
    +
  • OS: macOS 11.6
  • Browser: Safari
  • Version: 15.1
+
+
+ + +
+ + + +
+
+ +
+
+ +
+ + +
+
+ + + + + + + +
+
+ +
+
+ + +
+ + + + +
+
+ + + Sign up for free + to join this conversation on GitHub. + Already have an account? + Sign in to comment + + + +
+ +
+
+ +
+
+
+ + + + + + +
+ + + +
+ Labels +
+ +
+ None yet +
+ +
+ + + +
+
+
+ Projects +
+ + + + None yet + + + +
+ + + + + + + +
+
+
+ +
+ Development +
+ + + +

No branches or pull requests

+ + + + +
+
+ + +
+ + + +
+
+
+ 1 participant +
+
+ + @maglevize +
+
+
+ + + + + + + + + + + +
+ +
+ +
+
+ + +
+ + +
+ +
+
+ +
+ + + + + + + + + + + + + + + + + + + + +