From 13e1b9f94ccb9b34a310ad21abeff7ef4a47cbb0 Mon Sep 17 00:00:00 2001 From: meisnate12 Date: Mon, 5 Apr 2021 11:12:57 -0400 Subject: [PATCH] allows keyword IMDB searches #173 --- modules/imdb.py | 54 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/modules/imdb.py b/modules/imdb.py index ad171288..b57b8fad 100644 --- a/modules/imdb.py +++ b/modules/imdb.py @@ -13,13 +13,14 @@ class IMDbAPI: self.config = config self.urls = { "list": "https://www.imdb.com/list/ls", - "search": "https://www.imdb.com/search/title/?" + "search": "https://www.imdb.com/search/title/?", + "keyword": "https://www.imdb.com/search/keyword/?" } def validate_imdb_url(self, imdb_url): imdb_url = imdb_url.strip() - if not imdb_url.startswith(self.urls["list"]) and not imdb_url.startswith(self.urls["search"]): - raise Failed(f"IMDb Error: {imdb_url} must begin with either:\n{self.urls['list']} (For Lists)\n{self.urls['search']} (For Searches)") + if not imdb_url.startswith(self.urls["list"]) and not imdb_url.startswith(self.urls["search"]) and not imdb_url.startswith(self.urls["keyword"]): + raise Failed(f"IMDb Error: {imdb_url} must begin with either:\n{self.urls['list']} (For Lists)\n{self.urls['search']} (For Searches)\n{self.urls['keyword']} (For Keyword Searches)") return imdb_url def get_imdb_ids_from_url(self, imdb_url, language, limit): @@ -32,24 +33,47 @@ class IMDbAPI: header = {"Accept-Language": language} length = 0 imdb_ids = [] - try: results = self.send_request(current_url, header).xpath("//div[@class='desc']/span/text()")[0].replace(",", "") - except IndexError: raise Failed(f"IMDb Error: Failed to parse URL: {imdb_url}") - try: total = int(re.findall("(\\d+) title", results)[0]) - except IndexError: raise Failed(f"IMDb Error: No Results at URL: {imdb_url}") + if imdb_url.startswith(self.urls["keyword"]): + results = self.send_request(current_url, header).xpath("//div[@class='desc']/text()") + total = None + for result in results: + if "title" in result: + try: + total = int(re.findall("(\\d+) title", result)[0]) + break + except IndexError: + pass + if total is None: + raise Failed(f"IMDb Error: No Results at URL: {imdb_url}") + item_count = 50 + else: + try: results = self.send_request(current_url, header).xpath("//div[@class='desc']/span/text()")[0].replace(",", "") + except IndexError: raise Failed(f"IMDb Error: Failed to parse URL: {imdb_url}") + try: total = int(re.findall("(\\d+) title", results)[0]) + except IndexError: raise Failed(f"IMDb Error: No Results at URL: {imdb_url}") + item_count = 250 if "&start=" in current_url: current_url = re.sub("&start=\\d+", "", current_url) if "&count=" in current_url: current_url = re.sub("&count=\\d+", "", current_url) + if "&page=" in current_url: current_url = re.sub("&page=\\d+", "", current_url) if limit < 1 or total < limit: limit = total - remainder = limit % 250 - if remainder == 0: remainder = 250 - num_of_pages = math.ceil(int(limit) / 250) + + remainder = limit % item_count + if remainder == 0: remainder = item_count + num_of_pages = math.ceil(int(limit) / item_count) for i in range(1, num_of_pages + 1): - start_num = (i - 1) * 250 + 1 - length = util.print_return(length, f"Parsing Page {i}/{num_of_pages} {start_num}-{limit if i == num_of_pages else i * 250}") - response = self.send_request(f"{current_url}&count={remainder if i == num_of_pages else 250}&start={start_num}", header) - imdb_ids.extend(response.xpath("//div[contains(@class, 'lister-item-image')]//a/img//@data-tconst")) + start_num = (i - 1) * item_count + 1 + length = util.print_return(length, f"Parsing Page {i}/{num_of_pages} {start_num}-{limit if i == num_of_pages else i * item_count}") + if imdb_url.startswith(self.urls["keyword"]): + response = self.send_request(f"{current_url}&page={i}", header) + else: + response = self.send_request(f"{current_url}&count={remainder if i == num_of_pages else item_count}&start={start_num}", header) + if imdb_url.startswith(self.urls["keyword"]) and i == num_of_pages: + imdb_ids.extend(response.xpath("//div[contains(@class, 'lister-item-image')]//a/img//@data-tconst")[:remainder]) + else: + imdb_ids.extend(response.xpath("//div[contains(@class, 'lister-item-image')]//a/img//@data-tconst")) util.print_end(length) if imdb_ids: return imdb_ids - else: raise Failed(f"IMDb Error: No Movies Found at {imdb_url}") + else: raise Failed(f"IMDb Error: No IMDb IDs Found at {imdb_url}") @retry(stop_max_attempt_number=6, wait_fixed=10000) def send_request(self, url, header):