dgtlmoon 2 weeks ago committed by GitHub
commit cedc95bb31
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,8 +1,12 @@
from . import difference_detection_processor
from ..html_tools import xpath1_filter as xpath_filter
# xpath1 is a lot faster and is sufficient here
from ..html_tools import extract_json_as_string, has_ldjson_product_info
from copy import deepcopy
from loguru import logger
import hashlib
import re
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -10,16 +14,76 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Short label for this processor (where it is displayed is outside this view).
name = "Re-stock detection for single product pages"
# One-sentence summary of what this processor reports.
description = "Detects if the product goes back to in-stock"
class UnableToExtractRestockData(Exception):
    """Raised when no stock information could be extracted for a watch.

    Attributes:
        status_code: The value handed in by the raiser; kept on the instance
            so handlers can include it in their error message.
    """

    def __init__(self, status_code):
        # Give the base class a message, otherwise str(e) is empty when the
        # exception is raised with a keyword argument and logs show nothing.
        super().__init__(f"Unable to extract restock data (status code {status_code})")
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
def _normalise_availability(raw_value):
    """Return *raw_value* lower-cased, without surrounding quotes/spaces or a leading http(s)://schema.org/."""
    return re.sub(r'(?i)^(https|http)://schema.org/', '', raw_value.strip(' "\'').lower())


def get_itemprop_availability(html_content):
    """
    Extract the schema.org availability value embedded in a product page.

    `itemprop` is a global attribute
    https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop
    https://schema.org/ItemAvailability

    <div class="product-offer" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
      ...
      <link itemprop="availability" href="https://schema.org/OutOfStock" />

    Three sources are tried in order and the first one yielding a truthy value wins:
    LD-JSON, then Microdata, then RDFa.

    :param html_content: The page markup as a string.
    :return: The availability token, lower-cased and without the schema.org URL
             prefix (for example 'instock'), or a falsy value when nothing was found.
    """
    # Try/prefer the structured data first if it exists
    # https://schema.org/ItemAvailability Which strings mean we should consider it in stock?
    # Chewing on random content could throw any kind of exception, best to catch it and move on if possible.

    # LD-JSON type
    value = None
    try:
        if has_ldjson_product_info(html_content):
            # The whole document is lower-cased before extraction, so the value comes back lower-case too
            value = extract_json_as_string(html_content.lower(), "json:$..offers.availability", ensure_is_ldjson_info_type=True)
            if value:
                value = _normalise_availability(value)
                logger.debug(f"Has 'LD-JSON' - '{value}'")
    except Exception as e:
        # This should be OK, we will attempt the scraped version instead
        logger.warning(f"Exception getting get_itemprop_availability 'LD-JSON' - {str(e)}")

    # Microdata style
    if not value:
        try:
            # Accept both URL schemes for the Offer itemtype; pages in the wild use either one
            value = xpath_filter(
                "//*[@itemtype='https://schema.org/Offer' or @itemtype='http://schema.org/Offer']"
                "//*[@itemprop='availability']/@href",
                html_content,
            )
            if value:
                value = _normalise_availability(value)
                logger.debug(f"Has 'Microdata' - '{value}'")
        except Exception as e:
            # This should be OK, we will attempt the scraped version instead
            logger.warning(f"Exception getting get_itemprop_availability 'Microdata' - {str(e)}")

    # RDFa style
    if not value:
        try:
            value = xpath_filter("//*[@property='schema:availability']/@content", html_content)
            if value:
                value = _normalise_availability(value)
                logger.debug(f"Has 'RDFa' - '{value}'")
        except Exception as e:
            # This should be OK, we will attempt the scraped version instead
            logger.warning(f"Exception getting get_itemprop_availability 'RDFa' - {str(e)}")

    # @todo this should return dict/tuple of instock + price
    return value
# Restock processor: derives an in-stock / out-of-stock flag for a watch from the fetched page.
# NOTE(review): this view is a diff rendering - hunk headers ("@ -a,b +c,d @") mark elided lines, and
# some statements appear twice (old and new side of the same diff line).
class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
# Returns (changed_detected, update_obj, stock text as stripped UTF-8 bytes).
# Raises UnableToExtractRestockData when the fetcher produced no stock text.
def run_changedetection(self, uuid, skip_when_checksum_same=True):
# DeepCopy so we can be sure we don't accidentally change anything by reference
@ -29,7 +93,7 @@ class perform_site_check(difference_detection_processor):
raise Exception("Watch no longer exists.")
# Unset any existing notification error
# NOTE(review): the next two assignments look like the old/new sides of one diff line; the second one
# adds 'in_stock': None, which the later "== None" fallback check relies on.
update_obj = {'last_notification_error': False, 'last_error': False}
update_obj = {'last_notification_error': False, 'last_error': False, 'in_stock': None}
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
@ -38,23 +102,44 @@ class perform_site_check(difference_detection_processor):
update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
update_obj["last_check_status"] = self.fetcher.get_last_status_code()
# Main detection method
fetched_md5 = None
if self.fetcher.instock_data:
fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
# Prefer availability embedded in the page markup over the scraped text
availability = get_itemprop_availability(html_content=self.fetcher.content)
if availability:
self.fetcher.instock_data = availability # Stored as the text snapshot
# @todo: Configurable?
# NOTE(review): "availability in s" is a substring test of the extracted value against each entry, so a
# short value would match by accident; and get_itemprop_availability lower-cases its result, so the
# mixed-case 'Instoreonly' entry can never match 'instoreonly'. Consider an exact test against a
# lower-case set.
if any(availability in s for s in
[
'instock',
'Instoreonly',
'limitedavailability',
'onlineonly',
'presale' # Debatable?
]):
update_obj['in_stock'] = True
else:
update_obj['in_stock'] = False
# Fallback to scraping the content for keywords (done in JS)
# in_stock is still None here only when no embedded availability value was found above
if update_obj['in_stock'] == None and self.fetcher.instock_data:
# 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
update_obj['in_stock'] = True if self.fetcher.instock_data == 'Possibly in stock' else False
logger.debug(f"Watch UUID {uuid} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
else:
# Neither embedded data nor scraped text produced anything to work with
if not self.fetcher.instock_data:
raise UnableToExtractRestockData(status_code=self.fetcher.status_code)
# Main detection method
fetched_md5 = None
# Checksum of the stock text; stored below as 'previous_md5'
fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
# The main thing that all this at the moment comes down to :)
changed_detected = False
logger.debug(f"Watch UUID {uuid} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
# NOTE(review): the next two conditions look like the old (checksum based) and new (stock flag based)
# sides of one diff line.
if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5:
if watch.get('in_stock') != update_obj.get('in_stock'):
# Yes if we only care about it going to instock, AND we are in stock
if watch.get('in_stock_only') and update_obj["in_stock"]:
if watch.get('in_stock_only') and update_obj['in_stock']:
changed_detected = True
if not watch.get('in_stock_only'):
@ -64,3 +149,4 @@ class perform_site_check(difference_detection_processor):
# Always record the new checksum
update_obj["previous_md5"] = fetched_md5
return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8').strip()

@ -0,0 +1,73 @@
#!/usr/bin/python3
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
# Markup snippets that each declare an in-stock product; every entry is tested on its own.
# Each entry must end with a comma: adjacent string literals are silently concatenated by Python,
# which would merge two fixtures into one and leave the second untested.
instock_props = [
    # LD JSON
    '<script type=\'application/ld+json\'>[{"@context":"http://schema.org","@type":"WebSite","name":"Velkoobchod České Díly.cz","description":"Velkoobchodní a maloobchodní prodej originálních a náhradních dílů pro širokou škálu osobních a užitkových vozidel. Jsme největší obchod s náhradními díly v Čechách. Kamenná prodejna v Praze. Široký výběr značek za nejnižší ceny na trhu. MANN-FILTER, Bosch, LUK, VALEO, KYB, NGK, TRW, Brembo, SACHS, FEBI BILSTENI, ATE, INA, CONTIT.VlastnímeECH, PIERBURG, CASTROL , MOTUL, MOBIL, SHELL ,TOTAL ,elf ,LIQUI MOLY , wynn`s a další. Autodoplňky. Autokosmetika. Vybavení pro dílny. Nabídka olejů všech druhů a značek. Nejlevnější autodlíly.","url":"https://ceskedily.cz/autodily/dodge/challenger-kupe/5.7-280kw/filtr?productId=3038915","potentialAction":{"@type":"SearchAction","target":"https://ceskedily.cz/vyhledavani?search={query}","query-input":{"@type":"PropertyValueSpecification","valueRequired":"http://schema.org/True","valueName":"query"}},"publisher":{"@context":"http://schema.org","@type":"Organization","name":"Velkoobchod České Díly.cz","url":"https://ceskedily.cz/","logo":"https://data.kvikymart.space/ceskedily.cz/images/0m/77k/77026/77026_3195959275.png","sameAs":["https://twitter.com/CeskeD","https://www.instagram.com/ceskedily/?hl=cs"]},"sameAs":["https://twitter.com/CeskeD","https://www.instagram.com/ceskedily/?hl=cs"]},{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":0,"item":{"@id":"/autodily","name":"Autodíly pro osobní vozy"}},{"@type":"ListItem","position":1,"item":{"@id":"/autodily/dodge","name":"DODGE"}},{"@type":"ListItem","position":2,"item":{"@id":"https://ceskedily.cz/autodily/dodge/challenger-kupe/5.7-280kw","name":"CHALLENGER kupé • 5.7 • 280 kW"}}]},{"@context":"http://schema.org","@type":"Product","name":"Olejový filtr K&N Filters HP-2010","description":"","mpn":"HP-2010","brand":"K&N Filters","image":"https://digital-assets.tecalliance.services/images/1600/c8fe1f1428021f4fe17a39297686178b04cba885.jpg","offers":{"@context":"http://schema.org","@type":"Offer","price":294.0,"priceCurrency":"CZK","url":"https://ceskedily.cz/olejovy-filtr-k-n-filters-hp-2010","availability":"http://schema.org/InStock"}}]</script>',
    '<script id="product-jsonld" type="application/ld+json">{"@context":"https://schema.org","@type":"Product","brand":{"@type":"Brand","name":"Ubiquiti"},"name":"UniFi Express","sku":"UX","description":"Impressively compact UniFi Cloud Gateway and WiFi 6 access point that runs UniFi Network. Powers an entire network or simply meshes as an access point.","url":"https://store.ui.com/us/en/products/ux","image":{"@type":"ImageObject","url":"https://cdn.ecomm.ui.com/products/4ed25b4c-db92-4b98-bbf3-b0989f007c0e/123417a2-895e-49c7-ba04-b6cd8f6acc03.png","width":"1500","height":"1500"},"offers":{"@type":"Offer","availability":"https://schema.org/InStock","priceSpecification":{"@type":"PriceSpecification","price":149,"priceCurrency":"USD","valueAddedTaxIncluded":false}}}</script>',
    '<script id="product-schema" type="application/ld+json">{"@context": "https://schema.org","@type": "Product","itemCondition": "https://schema.org/NewCondition","image": "//1.com/hmgo","name": "Polo MuscleFit","color": "Beige","description": "Polo","sku": "0957102010","brand": {"@type": "Brand","name": "H&M"},"category": {"@type": "Thing","name": "Polo"},"offers": [{"@type": "Offer","url": "https:/www2.xxxxxx.com/fr_fr/productpage.0957102010.html","priceCurrency": "EUR","price": "25.99","availability": "http://schema.org/InStock","seller": { "@type": "Organization", "name": "H&amp;M"}}]}</script>',
    # Microdata
    '<div itemscope itemtype="https://schema.org/Product"><h1 itemprop="name">Example Product</h1><p itemprop="description">This is a sample product description.</p><div itemprop="offers" itemscope itemtype="https://schema.org/Offer"><p>Price: <span itemprop="price">$19.99</span></p><link itemprop="availability" href="https://schema.org/InStock" /></div></div>',
]
# Markup snippets that each declare a product which is NOT in stock.
out_of_stock_props = [
    # out of stock AND contains multiples:
    # one entry made of two consecutive LD-JSON <script> elements (a WebSite block, then the Product block)
    (
        '<script type="application/ld+json">{"@context":"http://schema.org","@type":"WebSite","url":"https://www.medimops.de/","potentialAction":{"@type":"SearchAction","target":"https://www.medimops.de/produkte-C0/?fcIsSearch=1&searchparam={searchparam}","query-input":"required name=searchparam"}}</script>'
        '<script type="application/ld+json">{"@context":"http://schema.org","@type":"Product","name":"Horsetrader: Robert Sangster and the Rise and Fall of the Sport of Kings","image":"https://images2.medimops.eu/product/43a982/M00002551322-large.jpg","productID":"isbn:9780002551328","gtin13":"9780002551328","category":"Livres en langue étrangère","offers":{"@type":"Offer","priceCurrency":"EUR","price":null,"itemCondition":"UsedCondition","availability":"OutOfStock"},"brand":{"@type":"Thing","name":"Patrick Robinson","url":"https://www.momox-shop.fr/,patrick-robinson/"}}</script>'
    ),
]
def set_original_response(props_markup):
    """Write a small HTML page containing *props_markup* to the test content file.

    The page is written to ``test-datastore/endpoint-content.txt`` (presumably the
    file served by the ``test_endpoint`` route used in the test below - confirm).

    :param props_markup: Markup fragment inserted at the end of the page body.
    :return: None
    """
    test_return_data = f"""<html>
<body>
Some initial text<br>
<p>Which is across multiple lines</p>
<br>
So let's see what happens. <br>
<div>price: $10.99</div>
{props_markup}
</body>
</html>
"""
    # The fixtures contain non-ASCII text, so state the encoding instead of
    # depending on whatever the locale default happens to be.
    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
        f.write(test_return_data)
    return None
def test_restock_itemprop_basic(client, live_server):
    """Check that embedded availability markup drives the stock state shown on the index page.

    Every fixture is served in turn, added as a 'restock_diff' watch, checked,
    asserted against the index page and then removed again.
    """
    live_server_setup(live_server)
    endpoint_url = url_for('test_endpoint', _external=True)

    # (fixtures to serve, tag submitted with the watch, marker expected on the index page)
    scenarios = (
        (instock_props, 'restock tests', b' in-stock'),
        (out_of_stock_props, '', b'not-in-stock'),
    )

    for fixtures, tag_value, expected_marker in scenarios:
        for markup in fixtures:
            set_original_response(props_markup=markup)
            client.post(
                url_for("form_quick_watch_add"),
                data={"url": endpoint_url, "tags": tag_value, 'processor': 'restock_diff'},
                follow_redirects=True
            )
            wait_for_all_checks(client)

            listing = client.get(url_for("index"))
            assert expected_marker in listing.data

            # Clean up so the next fixture starts from an empty watch list
            removal = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
            assert b'Deleted' in removal.data

@ -455,9 +455,12 @@ class update_worker(threading.Thread):
except UnableToExtractRestockData as e:
# Usually when fetcher.instock_data returns empty
logger.error(f"Exception (UnableToExtractRestockData) reached processing watch UUID: {uuid}")
logger.error(str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Unable to extract restock data for this page unfortunately. (Got code {e.status_code} from server)"})
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid,
update_obj={
'last_error': f"Unable to extract restock data for this page unfortunately. (Got code {e.status_code} from server), no embedded stock information was found and nothing interesting in the text, try using this watch with Chrome.",
}
)
process_changedetection_results = False
except Exception as e:
logger.error(f"Exception reached processing watch UUID: {uuid}")

Loading…
Cancel
Save