author | Alexandru Catalin Ene <[email protected]> | 2023-01-06 04:37:07 +0200
---|---|---
committer | GitHub <[email protected]> | 2023-01-05 21:37:07 -0500
commit | 9105c53d6a9ebd309710ee47651b5a422da1886f (patch) |
tree | bb521810aed21a7c393d9aa4d9df1e0d5e5e0ed6 |
parent | e57a99d39eb6f6095568a11642f80b42c4512095 (diff) |
download | bazarr-1.1.5-beta.0.tar.gz, bazarr-1.1.5-beta.0.zip |
Updated regielive provider to use built-in search and scraper API (tag: v1.1.5-beta.0)
-rw-r--r-- | libs/subliminal_patch/providers/regielive.py | 464
1 file changed, 434 insertions(+), 30 deletions(-)
diff --git a/libs/subliminal_patch/providers/regielive.py b/libs/subliminal_patch/providers/regielive.py
index 94fceef88..2d97d7adb 100644
--- a/libs/subliminal_patch/providers/regielive.py
+++ b/libs/subliminal_patch/providers/regielive.py
@@ -4,18 +4,29 @@
 import logging
 import io
 import os
 
-from requests import Session
+# imports needed for the search API
+import re
+import enum
+import numpy as np
+from time import sleep
+from hashlib import sha1
+from subliminal.cache import region
+from urllib import parse as urlparse
+# end imports for the search API
+
+import zipfile
+
 from guessit import guessit
+from subzero.language import Language
 from subliminal_patch.providers import Provider
 from subliminal_patch.subtitle import Subtitle, guess_matches
-from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending
 from subliminal.video import Episode, Movie
-from subzero.language import Language
-
-import zipfile
+from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending
+from requests import Session, exceptions as req_exceptions
 
 logger = logging.getLogger(__name__)
+BASE_URL = "https://subtitrari.regielive.ro"
 
 class RegieLiveSubtitle(Subtitle):
     """RegieLive Subtitle."""
@@ -63,6 +74,7 @@ class RegieLiveProvider(Provider):
     languages = {Language(l) for l in ['ron']}
     language = list(languages)[0]
     video_types = (Episode, Movie)
+    download_cookies = None
    SEARCH_THROTTLE = 8
 
     def __init__(self):
@@ -70,9 +82,7 @@ class RegieLiveProvider(Provider):
     def initialize(self):
         self.session = Session()
-        #self.url = 'http://api.regielive.ro/kodi/cauta.php'
-        # this is a proxy API/scraper for subtitrari.regielive.ro used for subtitles search only
-        self.url = 'http://subtitles.24-7.ro/index.php'
+        self.url = BASE_URL
         self.api = 'API-KODI-KINGUL'
         self.headers = {'RL-API': self.api}
@@ -80,28 +90,19 @@ class RegieLiveProvider(Provider):
     def terminate(self):
         self.session.close()
 
     def query(self, video, language):
-        payload = {}
-        if isinstance (video, Episode):
-            payload['nume'] = video.series
-            payload['sezon'] = video.season
-            payload['episod'] = video.episode
-        elif isinstance(video, Movie):
-            payload['nume'] = video.title
-            payload['an'] = video.year
-        response = self.session.post(self.url, data=payload, headers=self.headers)
-        logger.info(response.json())
+        search_api = RegieLiveSearchAPI(video)
+        results = search_api.search_video()
         subtitles = []
-        if response.json()['cod'] == 200:
-            results_subs = response.json()['rezultate']
-            for film in results_subs:
-                for sub in results_subs[film]['subtitrari']:
-                    logger.debug(sub)
-                    subtitles.append(
-                        RegieLiveSubtitle(sub['titlu'], video, sub['url'], sub['rating'], language)
-                    )
+        if results:
+            for result in results:
+                logger.debug(result)
+                subtitles.append(
+                    RegieLiveSubtitle(result.title, video, result.download_url, result.rating, language)
+                )
         # {'titlu': 'Chernobyl.S01E04.The.Happiness.of.All.Mankind.720p.AMZN.WEB-DL.DDP5.1.H.264-NTb', 'url': 'https://subtitrari.regielive.ro/descarca-33336-418567.zip', 'rating': {'nota': 4.89, 'voturi': 48}}
         # subtitle def __init__(self, language, filename, subtype, video, link):
+        self.download_cookies = search_api.get_req_cookies()
         return subtitles
 
     def list_subtitles(self, video, languages):
@@ -112,15 +113,20 @@ class RegieLiveProvider(Provider):
         _addheaders = {
             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Origin': 'https://subtitrari.regielive.ro',
+            'Origin': BASE_URL,
             'Accept-Language' : 'en-US,en;q=0.5',
-            'Referer': 'https://subtitrari.regielive.ro',
+            'Referer': BASE_URL,
             'Pragma': 'no-cache',
             'Cache-Control': 'no-cache'
         }
+
         session.headers.update(_addheaders)
-        res = session.get('https://subtitrari.regielive.ro')
-        cookies = res.cookies
+        if self.download_cookies is None:  # try to get the needed cookies through a request if none were kept from the API
+            res = session.get(BASE_URL)
+            cookies = res.cookies
+        else:
+            cookies = self.download_cookies
+
         _zipped = session.get(subtitle.page_link, cookies=cookies)
         if _zipped:
             if _zipped.text == '500':
@@ -150,3 +156,401 @@ class RegieLiveProvider(Provider):
                     return archive.read(name)
 
         raise APIThrottled('Can not find the subtitle in the compressed file')
+
+"""
+# Search helper for Regielive that uses scraping to find subtitles.
+# This utility API should return a list of RegieLiveAPIData objects when queried,
+# using a mix of JSON api search and page scraping in order to fetch data
+# from the Regielive website.
+#
+# This may break at any time, since the regex is very sensitive to website
+# structure changes; because of this I might make the regex load directly
+# from GitHub in the future.
+"""
+
+# data classes
+class RegieLiveAPIData():
+    'data returned class'
+    title = ''
+    rating = None
+    download_url = ''
+
+    def __init__(self, title, url, rating):
+        self.title = title
+        self.download_url = url
+        self.rating = rating
+
+    def __repr__(self):
+        return "<RegieLiveAPIData: title = " + str(self.title) + \
+            "; download url = " + str(self.download_url) + \
+            "; rating = " + str(self.rating.rating) + "/" + str(self.rating.count) + ">"
+
+
+class RegieLiveAPIRating():  # probably an extraneous class
+    'rating for the subtitle'
+    rating = 0
+    count = 0
+
+    def __init__(self, rating, count):
+        if rating:
+            self.rating = rating
+
+        if count and isinstance(count, str) and count.isnumeric():
+            self.count = int(count)
+        elif count == 'vot':  # Romanian singular, i.e. "dintr-un vot" = from a single vote
+            self.count = 1
+        else:
+            self.count = 0
+
+
+# constants
+CACHE_PREFIX = 'RL_API'
+
+DEFAULT_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
+    'Origin': BASE_URL,
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Referer': BASE_URL,
+    'Pragma': 'no-cache',
+    'Cache-Control': 'no-cache'
+}
+
+REQUEST_TIMEOUT = 15
+
+LITE_JSON_PATH = "/ajax/subtitrari/searchsuggest.php"
+PAGE_SEARCH_PATH = "/cauta.html"
+SEASON_URL = "sezonul-%i/"
+
+SUB_PAGE_EPISODE_PATTERN = r'(?ism)<h3>Episodul %s</h3>(.+?)</ul>'
+SUB_PAGE_MOVIE_MATCH = re.compile(
+    r'(?ism)<div class="subtitrari">.*?<ul class="mt-6">(.+?)</ul>')
+
+SUB_FILE_INFO_MATCH = re.compile(
+    r'(?ism)id="sub_\d+">([^<]+)</span>.*?Nota ([0-9.]+)\s+(?:dintr-un\s+?(\w+)|din\s+?([0-9]+)\s*?)[^>].*?<a href="([^"]+)".+?</li>')
+SEARCH_PAGE_MATCH = re.compile(
+    r'(?ism)class="detalii\s[^>]{1}.+?<a href="([^"]+)"[^>]+?>([^<]+)</a>\s*<span.+?>\((\d{4})\)</span>')
+
+# helpers
+def title_match(s, t, ratio_calc=False):
+    """ title_match:
+    Tries to calculate the Levenshtein distance between two strings.
+    If ratio_calc = True, the function computes the
+    Levenshtein distance ratio of similarity between two strings.
+    This function is mainly copied from the Levenshtein package.
+    """
+    # Initialize matrix of zeros
+    rows = len(s) + 1
+    cols = len(t) + 1
+    distance = np.zeros((rows, cols), dtype=int)
+
+    # Fill the first row and column with the indices of each character of both strings
+    for i in range(1, rows):
+        for k in range(1, cols):
+            distance[i][0] = i
+            distance[0][k] = k
+
+    # Iterate over the matrix to compute the cost of deletions, insertions and/or substitutions
+    for col in range(1, cols):
+        for row in range(1, rows):
+            if s[row-1] == t[col-1]:
+                cost = 0
+            else:
+                # for the ratio a substitution costs 2; for the plain distance it costs 1
+                if ratio_calc:
+                    cost = 2
+                else:
+                    cost = 1
+            distance[row][col] = min(distance[row-1][col] + 1,       # cost of deletions
+                                     distance[row][col-1] + 1,       # cost of insertions
+                                     distance[row-1][col-1] + cost)  # cost of substitutions
+    if ratio_calc:
+        ratio = ((len(s) + len(t)) - distance[row][col]) / (len(s) + len(t))
+        return ratio
+
+    # This is the minimum number of edits needed to convert string a to string b
+    return distance[row][col]
+
+
+class SearchTypes(enum.Enum):
+    'Search type based on the video object received'
+    Movie = 1
+    Episode = 2
+
+
+class RegieLiveSearchAPI():
+    'Main class that interfaces with the regielive sub provider'
+    video = None
+    title = None
+    session = None
+    search_type = SearchTypes.Movie
+
+    def __init__(self, video):
+        'Constructor that needs a [Movie, Episode] object'
+        self.video = video
+        self.initialize()
+
+    def initialize(self):
+        'Instance initialization goes here'
+        if isinstance(self.video, Episode):
+            self.search_type = SearchTypes.Episode
+            self.title = self.video.series
+        else:
+            self.title = self.video.title
+
+        self.session = Session()
+        self.session.headers.update(DEFAULT_HEADERS)
+        logger.debug('Initialized new RegieLiveSearchAPI with search type %s of object %s',
+                     self.search_type, str(self.video))
+
+    def get_req_cookies(self):
+        'Get the cookies used for requests'
+        if self.session:
+            return self.session.cookies
+
+        return None
+
+    def search_video(self):
+        'Main function that should be called to get sub data back'
+        if self.video is None:
+            return None
+
+        results = self.search_lite_api()
+
+        if not results:
+            sleep(2.0)  # stagger requests in order not to flood the server
+            results = self.search_page()
+
+        if not results or results['data'] is None:
+            return None  # not logging since we can't get here without logging the reason elsewhere
+
+        logger.debug(results)
+        found_subs = self.parse_page(results)
+        logger.debug(found_subs)
+
+        return found_subs
+
+    def parse_page(self, results):
+        'Fetch and parse the episode/movie page'
+        if len(results['data']) > 1:
+            logger.warning("More than one page result for subtitle %s with data %s",
+                           self.title,
+                           str(results['data']))
+
+        sub_list = None
+        if self.search_type is SearchTypes.Movie:
+            sub_list = self.parse_movie_pages(results['data'])
+        else:
+            sub_list = self.parse_episode_pages(results['data'])
+
+        return sub_list
+
+    def parse_movie_pages(self, sub_page_data):
+        'Fetch and parse movie page data'
+        sub_list = []
+        for result in sub_page_data:
+            extracted_subs = self.extract_movie_sub_block(
+                self.get_page(result['url'], None))
+            sub_data = self.parse_sub_block(extracted_subs)
+            if sub_data:
+                sub_list.extend(sub_data)
+            else:
+                logger.debug(
+                    'Empty results from url %s with resulting block %s', result['url'], str(sub_data))
+
+        return sub_list
+
+    def parse_episode_pages(self, sub_page_data):
+        'Fetch and parse episode pages'
+        season = SEASON_URL % self.video.season
+        url = ''
+        sub_list = []
+        for result in sub_page_data:
+            url = urlparse.urljoin(result['url'], season)
+            extracted_subs = self.extract_episode_sub_block(
+                self.get_page(url, None))
+            sub_data = self.parse_sub_block(extracted_subs)
+            if sub_data:
+                sub_list.extend(sub_data)
+            else:
+                logger.debug(
+                    'Empty results from url %s with resulting block %s', url, str(sub_data))
+
+        return sub_list
+
+    def search_page(self):
+        """
+        Scrape the search page for the title.
+        This does not take pagination into consideration, since the titles
+        should be pretty unique and this api is not a search engine.
+        I will add pagination if this later turns out to be a problem.
+        Returns an object similar to the lite api one, in order to be consistent.
+        """
+        cache_key = sha1((CACHE_PREFIX + self.title).encode("utf-8"), usedforsecurity=False).digest()
+        cached_response = region.get(cache_key)
+        if cached_response:
+            logger.info("Found cached reply for search request %s", self.title)
+            return cached_response
+
+        response = self.get_api_page(PAGE_SEARCH_PATH, {'s': self.title})
+        data = {'error': True, 'data': []}
+
+        if response:
+            m_iter = SEARCH_PAGE_MATCH.finditer(response)
+            if m_iter:
+                for m in m_iter:
+                    data['data'].append({
+                        'id': RegieLiveSearchAPI.get_id_from_url(m.group(1)),
+                        'text': m.group(2),
+                        'url': m.group(1),
+                        'an': m.group(3)
+                    })
+
+        # could be more efficient doing this in the previous iteration
+        data['data'] = self.parse_json_results(data['data'])
+
+        if data['data'] and len(data['data']) > 0:
+            data['error'] = False
+            region.set(cache_key, data)
+
+        return data
+
+    def search_lite_api(self):
+        'Access the lite json api for info'
+        response = self.get_api_page(LITE_JSON_PATH, {'s': self.title}, True)
+
+        if response is None:
+            logger.warning(
+                "Regielive lite API failed to provide a proper reply")
+            return None
+
+        if response['error'] or not response['data']:
+            logger.warning("Regielive API responded with no results!")
+            logger.info(response)
+            return None
+
+        response['data'] = self.parse_json_results(response['data'])
+
+        return response
+
+    def parse_json_results(self, data_arr):
+        'Parses the results of our lite api request'
+        if not data_arr:
+            return None
+
+        result = list(filter(self.json_result_filter, data_arr))
+
+        if not result:
+            return None
+
+        return result
+
+    def json_result_filter(self, element):
+        'Filter function for json results'
+        if not element:
+            return False
+
+        match_ratio = title_match(element['text'], self.title, True)
+        element_year = RegieLiveSearchAPI.get_safe_int(element['an'])
+
+        # if neither has a valid year we'll let it match
+        # also RL sometimes has the year off by 1
+        if abs(element_year - RegieLiveSearchAPI.get_safe_int(self.video.year)) <= 1 and match_ratio > 0.9:
+            return True
+
+        logger.info("No match for title %s year %s and returned title %s year %i match ratio %f",
+                    self.title,
+                    self.video.year,
+                    element['text'],
+                    element_year,
+                    match_ratio)
+        return False
+
+    def get_api_page(self, url, url_params, return_json=False):
+        'Request a page from the RL API'
+        return self.get_page(urlparse.urljoin(BASE_URL, url), url_params, return_json)
+
+    def get_page(self, url, url_params, return_json=False):
+        'Request a page'
+        try:
+            req = self.session.get(url, params=url_params,
+                                   timeout=REQUEST_TIMEOUT,
+                                   allow_redirects=True)
+            req.raise_for_status()
+
+            if return_json:
+                return req.json()
+
+            return req.text
+        except req_exceptions.HTTPError as err:
+            logger.exception(
+                "Failed to request url %s\n Error %s", url, str(err))
+
+        return None
+
+    def extract_movie_sub_block(self, page_html):
+        'Extract the subtitles block from a movie page'
+        m = SUB_PAGE_MOVIE_MATCH.search(page_html)
+        if m:
+            return m.group(1)
+
+        logger.info("Could not find subtitle block for Movie %s", self.title)
+        return ''
+
+    def extract_episode_sub_block(self, page_html):
+        'Extract the subtitle block from a series page'
+        episode_zone_regex = SUB_PAGE_EPISODE_PATTERN % self.video.episode
+        m = None
+        try:
+            m = re.search(episode_zone_regex, page_html)
+        except Exception as err:
+            logger.debug(str(page_html))
+            logger.exception(err)
+
+        if m:
+            return m.group(1)
+
+        logger.info("Could not find episode %i for season %i of series %s",
+                    self.video.episode,
+                    self.video.season,
+                    self.title)
+        return ''
+
+    def parse_sub_block(self, subs_block):
+        'Parse a sub block into subtitle objects'
+        if subs_block is None:
+            return None
+
+        m_iter = SUB_FILE_INFO_MATCH.finditer(subs_block)
+        sub_list = []
+        if m_iter:
+            for match in m_iter:
+                # group(4) holds the numeric vote count, group(3) the singular 'vot' marker
+                sub_list.append(
+                    RegieLiveAPIData(match.group(1), match.group(5),
+                                     RegieLiveAPIRating(match.group(2), match.group(4) or match.group(3))))
+        else:
+            logger.debug('No subtitles matched for sub block %s of title %s',
+                         str(subs_block), self.title)
+
+        return sub_list
+
+    @classmethod
+    def get_id_from_url(cls, url):
+        'Get the movie rl id from the page url'
+        m = re.search(r'(?ms)(\d+)/', url)
+        if m:
+            return m.group(1)
+
+        return 0
+
+    @classmethod
+    def get_safe_int(cls, value, default_value=0):
+        'Returns an int from the supplied value or a default'
+        if value and (isinstance(value, int) or (isinstance(value, str) and value.isdigit())):
+            return int(value)
+
+        return default_value
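
A minimal sketch (not part of the commit) of how the new helper added above can be driven on its own; it assumes subliminal's `Episode` metadata class, whose positional constructor arguments are (name, series, season, episode):

```python
# Sketch only: exercises RegieLiveSearchAPI outside the provider.
# Assumes subliminal's Episode(name, series, season, episode) signature.
from subliminal.video import Episode

video = Episode('Chernobyl.S01E04.720p.AMZN.WEB-DL.mkv', 'Chernobyl', 1, 4)

api = RegieLiveSearchAPI(video)
results = api.search_video()  # list of RegieLiveAPIData, or None
if results:
    for sub in results:
        print(sub.title, sub.download_url, sub.rating.rating)
```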
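The 0.9 cutoff in `json_result_filter` is a Levenshtein ratio from `title_match`: insertions and deletions cost 1, substitutions cost 2 when `ratio_calc=True`. A quick worked example (values computed from the function above):

```python
print(title_match('Chernobyl', 'Chernobyl', ratio_calc=True))
# 1.0 -> identical titles pass the > 0.9 filter
print(title_match('Chernobyl', 'Chernobyl 2019', ratio_calc=True))
# (9 + 14 - 5) / (9 + 14) ~= 0.78 -> five inserted characters, filtered out
```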
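`SUB_FILE_INFO_MATCH` pulls five groups out of each subtitle list item: release title, rating, the singular 'vot' marker, the numeric vote count, and the download link. The fragment below is invented for illustration (the live markup may differ), though the download URL is the one quoted in the commit's own comment:

```python
block = ('<li><span id="sub_42">Chernobyl.S01E04.720p.AMZN.WEB-DL</span> '
         'Nota 4.89 din 48 voturi '
         '<a href="https://subtitrari.regielive.ro/descarca-33336-418567.zip">Descarca</a></li>')

m = SUB_FILE_INFO_MATCH.search(block)
print(m.group(1))              # Chernobyl.S01E04.720p.AMZN.WEB-DL
print(m.group(2), m.group(4))  # 4.89 48
print(m.group(5))              # https://subtitrari.regielive.ro/descarca-33336-418567.zip
```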
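For series, `parse_episode_pages` resolves a season page by joining `SEASON_URL` onto each search-result URL; a small sketch with a hypothetical result URL:

```python
from urllib import parse as urlparse

base = 'https://subtitrari.regielive.ro/serial-exemplu-12345/'  # hypothetical result['url']
print(urlparse.urljoin(base, SEASON_URL % 1))
# https://subtitrari.regielive.ro/serial-exemplu-12345/sezonul-1/
```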