From 0d114f2adc48ac92f354eefb71ad0fc6851bda05 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 22 Mar 2022 00:17:12 +0100 Subject: [PATCH] Add new fetch method: Playwright Chromium (Selenium/WebDriver alternative) Co-authored-by: Wee Sritippho --- changedetectionio/content_fetcher.py | 86 +++++++++++++++++- .../static/images/Playwright-icon.png | Bin 0 -> 6392 bytes .../templates/watch-overview.html | 1 + docker-compose.yml | 24 +++++ playwright/Dockerfile | 13 +++ playwright/seccomp_profile.json | 12 +++ playwright/server.js | 10 ++ requirements.txt | 2 + 8 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 changedetectionio/static/images/Playwright-icon.png create mode 100644 playwright/Dockerfile create mode 100644 playwright/seccomp_profile.json create mode 100644 playwright/server.js diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 3d036774..518b8b09 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -1,10 +1,6 @@ from abc import ABC, abstractmethod import chardet import os -from selenium import webdriver -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities -from selenium.webdriver.common.proxy import Proxy as SeleniumProxy -from selenium.common.exceptions import WebDriverException import requests import time import urllib3.exceptions @@ -26,6 +22,7 @@ class Fetcher(): headers = None fetcher_description ="No description" + fetcher_list_order = 0 @abstractmethod def get_error(self): @@ -68,16 +65,88 @@ def available_fetchers(): # @todo html_ is maybe better as fetcher_ or something # In this case, make sure to edit the default one in store.py and fetch_site_status.py if "html_" in name: - t=tuple([name,obj.fetcher_description]) + t=tuple([name,obj.fetcher_description,obj.fetcher_list_order]) p.append(t) + # sort by obj.fetcher_list_order + p.sort(key=lambda x: x[2]) + # strip obj.fetcher_list_order from each member in the 
tuple + p = list(map(lambda x: x[:2], p)) return p +class html_playwright(Fetcher): + fetcher_description = "Playwright {}/Javascript".format( + os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"').capitalize() + ) + if os.getenv("PLAYWRIGHT_DRIVER_URL"): + fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + fetcher_list_order = 3 + + browser_type = '' + command_executor = '' + + # Configs for Proxy setup + # Each ENV var is prefixed with "playwright_proxy_", for example "playwright_proxy_server" + playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password'] + + proxy=None + + def __init__(self): + # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') + self.command_executor = os.getenv( + "PLAYWRIGHT_DRIVER_URL", + 'ws://playwright-server:4444/playwright' + ).strip('"') + + # If any proxy settings are enabled, then we should setup the proxy object + proxy_args = {} + for k in self.playwright_proxy_settings_mappings: + v = os.getenv('playwright_proxy_' + k, False) + if v: + proxy_args[k] = v.strip('"') + + if proxy_args: + self.proxy = proxy_args + + def run(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes=False): + """Load url in a remote Playwright browser and set self.status_code, self.content and self.headers; timeout is in seconds; raises EmptyReply when goto() returns no response (request_* and ignore_status_codes are unused here).""" + from playwright.sync_api import sync_playwright + + with sync_playwright() as p: + browser_type = getattr(p, self.browser_type) + browser = browser_type.connect(self.command_executor, timeout=timeout*1000) + # Set user agent to prevent Cloudflare from blocking the browser + context = browser.new_context( + user_agent="Mozilla/5.0", + proxy=self.proxy + ) + page = context.new_page() + response = page.goto(url, timeout=timeout*1000) + if response is None: + raise EmptyReply(url=url, status_code=None) + + page.wait_for_timeout(5000) # fixed 5s pause before the DOM snapshot + + self.status_code = response.status + self.content = page.content() + self.headers = 
response.all_headers() + + context.close() + browser.close() + class html_webdriver(Fetcher): if os.getenv("WEBDRIVER_URL"): fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) else: fetcher_description = "WebDriver Chrome/Javascript" + fetcher_list_order = 2 command_executor = '' @@ -92,9 +161,12 @@ class html_webdriver(Fetcher): proxy=None def __init__(self): + from selenium.webdriver.common.proxy import Proxy as SeleniumProxy + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} for k in self.selenium_proxy_settings_mappings: @@ -113,6 +185,9 @@ class html_webdriver(Fetcher): request_method, ignore_status_codes=False): + from selenium import webdriver + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + from selenium.common.exceptions import WebDriverException # request_body, request_method unused for now, until some magic in the future happens. # check env for WEBDRIVER_URL @@ -158,6 +233,7 @@ class html_webdriver(Fetcher): # "html_requests" is listed as the default fetcher in store.py! 
class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" + fetcher_list_order = 1 def run(self, url, diff --git a/changedetectionio/static/images/Playwright-icon.png b/changedetectionio/static/images/Playwright-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..75db893b6a8d93e7225759f894e2a3ae3de090f8 GIT binary patch literal 6392 zcmb7J^;6W3(|_M_^Z}BBASFn5mz;E`ba#i+eYAvtAX0LKl%#Zc0v;jV(kb0YBXM6p z|H1R@%HI_Ze~g3mPj;OMMKW-;?7cUztPdjdpHx4;Rl1~7DGFn;g{=hofp)&nhxdStqJf1c;OsVl7L5MO+~di9ld z%YQKa-P=DDmYn(Z@1H7+WjZ|h$RkM^>Ny=%H;EsLY;i?y4k%QMW3q;{{U&^7X-<8g zL(Z2F5fZ6xC(VhT_3uxwG8Mx6dS3kvnf|L$vk!hTrS!9yt#Qn9qi2GS>n=@g@Z<15 z3(Kemma4n4`xxRZF=MKxOw|34pWAnq5A|6exN=(AKXt~ZH>NXP-)mj0*)Z5yHsy+! zo4Cjq_}c6r3+K|h+|*@Nbb(p#>~K8gh}4=*dZ1MGb`IBabCa7cKXn>hx8s|E1dkey z_whbcl{?~myj4|#S;F2m4@oBMzivBP!swSy?IaA4m|tI=odM^4l}C5B3vVHegV(uT81(SWD+*L^MN!Jad0>Pjt8>(hi>6j@O1K#UAy=QT{FMyxxO5 zX&ghNiG4xIi^0K2c?lw1nIDN@UT3IDpBbh23`-iX7Xsj9%g#(2(mXMkXSU!Ji4|+ogOCavOiQ@rxT0SBrEUyxWhcz>M1$w*;+^MV>B+y+oe;zms zSs{e2e#-j^fTvTyNMW{bjLq2+TZf@5a>XJA$p z+p>}(R+Hhk9xVZ_W?A(LAYJso!_gTDl<`u#vO7e0jm45oWzS<;gjw>REv z0s#O%4&T9f9k}fNbZ{({9uOG!TE}YBC;Gx-c)sC_6cs3WYFR9qyfvm6DNn*>(|exZ zAFlurO4oN%CVmWMi4UzSZt{ZluuO_Wd9^w@%jGu&brAsAMGgToa>Lt-^R z3F~!G;yK^9f^iZ4mJ<5zASEmO}xD%j$&%yJkxwgcD_S z?OmFo?cj+Aj(Dchu4wt7B1q90=%4h02>&0`FP3SCv9%eNE&b8CgM77kicc zWcu+SG9o%sU}~%26}WdU%}QAa_ju&7Mh8g*Z3Nog_*=g@-o4( zke<_im5}%2dTL8~li#VNg~5n&*ygwsr`VBg*4D`LGor6@A&i^r=BtH!*`(1j8vgtu zAI8`66}9u_dN9Nz7$BVZI+(N%jt5k*f_SA76Z{9U-_5krNFNup<0gCg^hZ)FDcC-@ zls)&6BUR=E9}nV_SlXL{pn3&du)KWx-HdzCa8m}&iZ9>ZsL~oH!|PEFOzuN0#y2}2 z9TE?4IUGC%eOk70TE`sX3e!oB93sJX)_w81NDtH z3|}%BH0uMeoJXomPt;pmNZ~$|gQYO6c>@zO#WGX!d^Jew4|!*z;mun3$ukTyjCTSZ zy5geSahM+IFZ3RwjnJ(qDV>`bw;Tk7ZB(-LRRZqG={0vU;ZIpgrw?`0Vt|y z3Gf`#Q4lJQgF!zsjnLR!ocPIBlkSfMR>uBaLij2_j=tO&i(&&h-|ED;Yg+RGPD(Yz zZx_ifV>@_8KS4(x%#9Rx(>6y(0=MR=3kQYV7tXOV(W{I`K~ki8ddE~%hX+p{Kumv( 
z3w?gi*QCtMe!lW9JjEPSBhn2+}Di)*vvh^15a8)8~?4k_2pykAE$LG^#K~7O4eaf8ykq_{CZ$euhaNCHx^KQlilCa=oOWq#`@s#1~!*Kv~)jJ6T#(J z*q0*7_B(pC5=bG${uYXE>z)ywZv!(1=4NGJ(zi*bq$%=Xua9C$KCDwHgTG^(_%>~D zs7oNE`Hi10Z3dnsAb#vVlV9Wf2z1Loz%8ghmo?b(%Wl0Q-I_Y7i`06>TxndbQB_4R z7vE>6YCdIF&P^^jP-#a75>Ks9SYdA?1l2-Yc&P#(*acl44gOt$6^FeDPf&F2!GeTJ zWd$~o!DQMs@}(D?SRc~ocknu)hdHfl^@=E!9y1=~Bfoavo_KQ1Xg^lKv(9yyw|J#jPZbpu_L1&DFEim_N*nT&G~a@E=xIrMP#;&snfh22G#F*= z%MctK`2?9~N}_ zeFnKBh9^|zSw_AHD93fr>I{@yp4Ixc-45YiL+u_=FufebfTgju9ZuIWjq(@WdFjTx z<#*OTIzwl3qUVpguz`LO{V9A>X;F#GGFqQ=1I0eBQ+&EmhlYPOpTI;U*Euj?(jtq3 z9mjvaFfvsz)P2tuxYR``xz?Ip>byRrNHjc^2ey(U{+7MyH0*3)b@MDb5?${;e^|Qn zvC2M7r_N|MXx(kA39((j5Sh8JbMJmX8U8dU343Z)1>R)tZ3)XlRW19u#gkIlKCc3B zziX!J2}xvnE`)7j2E96YYqMUPFF_-g$9kG>GCy9B?D057M2Ke`x(eboxB2}TXI_<{ z@f)|XqA2?4TS%Z*2z4jwOnR%2J8;Jq1Q%U??%`CnO%kE0}Ty;&^Z{2M-kK!1gZJ-9f}zn{&!ncU%59EWJZrfg=} ziQ?lV%MMCtT#VEL=n<+%Dltlqj1py7VTAN2X(__7{7&yaw*h&4?Ow-DIA{|1KJcJw zkDOcBKQ=|a-sS1p*5oVz+jPk@n|%Ls2Xr`Lij*>{$&@b!U3>cLINbzeeOxT-pO&l> zd7_jK5BPywnfFBIRx}B`7*M?<1mEq%uw)27dKujjQN1rMuPghGS6<|yA4g?;9id^r`EDgg)#H5Z1vg3ke zTQfhSm~@w`nAWK0P6kT+fP+2TjoX4*b9f3K6PY_bQM>ELXEv5vgkl-0fw-=5fr5lld;}&pSLMH*T$V*8~6ptdTHo4VtX(dQv+$N z_3qNm0!O6Sp)S$x39W%3^Da5_T;nyyin^{uoUZM;E3noF(j~*DPc^20f)B2&iZMo8 zc5K%e-%iN`*|EuSaCd2DN;n%T&9ZjvQl0(mq0P1AI9i@|1yM(<`Sql%Wbb>?tS0cQ z&fx(KC8BhiI}v9K@;#X46LUIh#DFYLT92>d4L^C!&%*Zh4o&GPed#e+eQUZssd*sg zQ$ELuFfqrsqy;?NU-zEe97^OtNz#V&H*FtotSTs{kb8e3WbVi0lJ}Kri+_X|lJ%KS z7v&Vw_`J_+?!zZpr;BGg@uQ~oM;CiHq&KL0U8wmB>rCCx443eKdpr=wXvlYVd_WH$ z$%{cY-v#gYQn-dZpg@I9Yo7TNYZune)`u~8xAuF^ak#pszTcUY^-S;vr#{NGfIMGr zsBO$w)7F=Dbbffx_>mKU^&`$bWZ~#%X788}#~Yew27garE&;!gq}snE%u3?P6d)12 zV{SVF(if<+3?H_h`WiDF8FGNWD2TDRlr)fp)9C%#t=}G<&-&#i!Y9G^Iqdpr;bYYevNV|ZdvfSL?b>-b8tg0)w7bEp0fm>`~ z-&>BTHW1`?YpUc0?XHa{@t7LbtKlFyfW>Mm(F+r|2fyHucDd2dtSU?fUg^RiwY#?G z#|h=m+pv7RM0Va9tRLnJm|Mz2$I=R35$xMp}#hOS&tA=!g3Y!QYq=aNt7{5t6FD4 zZh?Fjn%1xDI3S`GUoe0FCbC>rO0VcysT(+yGxHRhC zbXt8NEb9{(e5vG=hxuG`*-#g~`D&gruHl^%d%1IHEY82Btv;ZD`{|p)+`zWy4*s0Y 
ze(*$|)D0nuNg6DRt(#FGd{to2rw7;C5d29Q?6S6m{K4qggBBXikEW4VGwmxGa!aCx#qS69=2&;w zm%ZoNF?#Azzo3iZWyRYWx%(dxB3gW&9H}kaYIoT0go4`h5)=O@Re(QK8?B#aB1Ax( zCM8C07Jn2HJ?^Jly9kphNM7wKmJMnNw@y zDJy_Q2rSa%a$^3HVk1YS+r?6j@|`?s@C%N9;x0_iXP4Vq97^FFvDM5Z|h;Y-(-E161MYi3flCA01xu&Kfp zuR{-ymc9%KJh35}sYHIh#Ij!|q~9i`{k-Wu{PmEm7v+(;eT55wPscrf@YOaE&wfEf zoZr$wsMDix$%f9{@x$amSO}XLPRjxzPdXPr*n+}rEDjMhB@+sFUWvr%zpkom`_Q0O z<9+_*Qh;ZL(ZfIdKkls%oM}LgeM!;oBw9e4R&{7^QRd9YH^#S)`>TvbQE*a z9>;KyiVotc5<4QP*Y>PnFC+~%1cwE8>i02N^)iE^{#3+dORbDd?eg>2|N1q*DAaoX z*8#8r!;8@ltZBX`Dx(DD2d{})&U-XbV=~0g%BflLSRJz*3w1SDxP_T;5V(j}OCRN& z{zOd)xg{)+s01oUC#t(4UQAee(@LFKzuvv``_a;#+>`j2(fQ$p?rxiT+IRQgrn3!7 z<%}cASU6EBTCc@_^AVC?6(9b)a$y;0AiobzsyyZ_Uq0(RO589a6yVv`orj~xwy!fu@ci$n-GbWDUXL$ zngTtCFFt*K1BGF>aH_wx9ru%X&7neLO?yGBGXmi#v4)T$kekbh!TPmJ=kWC?57Y2-eT@+ap z+O*$GtVj7U)z~BHUU-tT!b0wKV;BOSb`Fns=o$|92h>VxvUs4r$l6c=zu|S0^S;BS zm(rJeCVy0ur3``}^QzsHXS3~%Da6@j&NdzX58@vMpu%3Z$3Cd+WK#QxoR1iU!W6$`*TWGpxB7M=|c7&fy_EisI zf?OB7@)_@63Sl=f$3 zVo0&5TLAStOv~Jo!^7?F%$#)o=*I0y#6&u~K`t2jU!!t(d45c}sVibivxCDWMxMl4 zusfauqfi^gX8Y2rKvNKTZoX#Ze$HjR>auw=AR_Gg^@?v>&FdFC0K8g!J?i~qubI+R zz?h1{)d)Y(_?bYEnT|+&un% z(+wep^LDLslEL~Ru!|!nPd-%uk+mTr1B&@lR{foJRS~y^F2j8+lM#yK$ZQ|?94s6* z?08|5H>aAu!iq_0-=__&#QG!kXQFl86B@N`^@oSL*<(Zrtk`z*?iF11OUwOoD^oWC z=e4o5w!-Qcs2cD{ie^Xh>X;rwJ>mn}WwxonXM^G?&Ht77`_LF~Gz(W$sp6RbeJucG Mc@4R08Ow literal 0 HcmV?d00001 diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 313c1bf5..506cb950 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -52,6 +52,7 @@ {{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}} {%if watch.fetch_backend == "html_webdriver" %}{% endif %} + {%if watch.fetch_backend == "html_playwright" %}{% endif %} {% if watch.last_error is defined and watch.last_error != False %}
{{ watch.last_error }}
diff --git a/docker-compose.yml b/docker-compose.yml index 2761031f..0914ed6f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,17 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # + # Alternative Playwright URL, do not use "'s or 's! + # - PLAYWRIGHT_DRIVER_URL=ws://playwright-server:4444/playwright + # + # Alternative Playwright Browser Type, must match with PLAYWRIGHT_BROWSER_TYPE in the playwright-server service + # See https://playwright.dev/docs/browsers + # - PLAYWRIGHT_BROWSER_TYPE=chromium + # + # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password + # + # https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-proxy + # # Plain requsts - proxy support example. # - HTTP_PROXY=socks5h://10.10.1.10:1080 # - HTTPS_PROXY=socks5h://10.10.1.10:1080 @@ -60,6 +71,19 @@ services: # - /dev/shm:/dev/shm # restart: unless-stopped +# playwright-server: +# hostname: playwright-server +# build: ./playwright +# environment: +# - PLAYWRIGHT_PORT=4444 +# # Must match with PLAYWRIGHT_BROWSER_TYPE in the changedetection service +# - PLAYWRIGHT_BROWSER_TYPE=chromium +# ipc: host +# user: pwuser +# security_opt: +# - seccomp:./playwright/seccomp_profile.json +# restart: unless-stopped + volumes: changedetection-data: diff --git a/playwright/Dockerfile b/playwright/Dockerfile new file mode 100644 index 00000000..8dcd659e --- /dev/null +++ b/playwright/Dockerfile @@ -0,0 +1,13 @@ +FROM mcr.microsoft.com/playwright:v1.20.0-focal + +WORKDIR /server +RUN npm install playwright +COPY server.js . 
+ +ENV PLAYWRIGHT_PORT=4444 +ENV PLAYWRIGHT_BROWSER_TYPE=chromium +ENV PLAYWRIGHT_HEADLESS=true + +EXPOSE ${PLAYWRIGHT_PORT} + +CMD [ "node", "server.js" ] diff --git a/playwright/seccomp_profile.json b/playwright/seccomp_profile.json new file mode 100644 index 00000000..bfeea36c --- /dev/null +++ b/playwright/seccomp_profile.json @@ -0,0 +1,12 @@ +{ + "comment": "Allow create user namespaces", + "names": [ + "clone", + "setns", + "unshare" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "includes": {}, + "excludes": {} +} diff --git a/playwright/server.js b/playwright/server.js new file mode 100644 index 00000000..9a730305 --- /dev/null +++ b/playwright/server.js @@ -0,0 +1,10 @@ +const playwright = require('playwright'); + +const port = parseInt(process.env.PLAYWRIGHT_PORT, 10) || 4444; +const browserType = process.env.PLAYWRIGHT_BROWSER_TYPE?.toLowerCase() || 'chromium'; +const headless = process.env.PLAYWRIGHT_HEADLESS?.toLowerCase() !== 'false'; // headless unless explicitly set to "false" +const wsPath = 'playwright'; +console.log('using port:', port, 'browser:', browserType, 'headless:', headless, 'wspath:', wsPath); + +const serverPromise = playwright[browserType].launchServer({ headless: headless, port: port, wsPath: wsPath }); +serverPromise.then(bs => console.log(bs.wsEndpoint())); diff --git a/requirements.txt b/requirements.txt index feef375b..2a8be8e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,5 @@ lxml # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 selenium ~= 4.1.0 +# An alternative to Selenium +playwright ~= 1.20