| author | Samuel Bartík <[email protected]> | 2022-04-20 11:52:46 +0200 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-04-20 05:52:46 -0400 |
| commit | a4d9eb694d675c0b85ad7d1b88574241232d1a82 (patch) | |
| tree | cc662e33fa7be0573c19d814dc89c8091c7143a1 | |
| parent | baa5173a48bc074a3aa87ca9155e2398ab7edb25 (diff) | |
Improved search and fixed some issues with Titulky provider (tag: v1.0.4-beta.26)
| -rw-r--r-- | libs/subliminal_patch/providers/titulky.py | 715 |

1 file changed, 403 insertions, 312 deletions
diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py
index 409f3534d..f8f5e0e01 100644
--- a/libs/subliminal_patch/providers/titulky.py
+++ b/libs/subliminal_patch/providers/titulky.py
@@ -33,6 +33,25 @@ from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
 
 logger = logging.getLogger(__name__)
 
+# Check if any element from the source array is contained partially or exactly in any element from the target array
+# Returns on the first match
+
+
+def _contains_element(_from=None, _in=None, exactly=False):
+    source_array = _from
+    target_array = _in
+
+    for source in source_array:
+        for target in target_array:
+            if exactly:
+                if sanitize(source) == sanitize(target):
+                    return True
+            else:
+                if sanitize(source) in sanitize(target):
+                    return True
+
+    return False
+
 
 class TitulkySubtitle(Subtitle):
     """Titulky.com subtitle"""
@@ -78,8 +97,9 @@ class TitulkySubtitle(Subtitle):
         self.matches = None
 
         # Try to parse S00E00 string from the main subtitle name
-        season_episode_string = re.findall(r'S(\d+)E(\d+)', self.names[0],
-                                           re.IGNORECASE)
+        season_episode_string = None
+        if len(self.names) > 0:
+            season_episode_string = re.findall(r'S(\d+)E(\d+)', self.names[0], re.IGNORECASE)
 
         # If we did not search for subtitles with season and episode numbers in search query,
         # try to parse it from the main subtitle name that most likely contains it
@@ -103,11 +123,14 @@ class TitulkySubtitle(Subtitle):
         sub_names = self._remove_season_episode_string(self.names)
 
         if _type == 'episode':
-            ## EPISODE
+            # EPISODE
 
             # match imdb_id of a series
             if video.series_imdb_id and video.series_imdb_id == self.imdb_id:
-                matches.add('series_imdb_id')
+                # NOTE: Is matches.add('series_imdb_id') doing anything?
+                # For now, let's match with the 'series' to not reject
+                # subs with no name but a correct imdb id.
+                matches.add('series')
 
             # match season/episode
             if self.season and self.season == video.season:
@@ -116,27 +139,28 @@ class TitulkySubtitle(Subtitle):
                 matches.add('episode')
 
             # match series name
-            series_names = [video.series] + video.alternative_series
-            logger.debug(
-                f"Titulky.com: Finding exact match between subtitle names {sub_names} and series names {series_names}"
-            )
-            if _contains_element(_from=series_names,
-                                 _in=sub_names,
-                                 exactly=True):
-                matches.add('series')
-
-            # match episode title
-            episode_titles = [video.title]
-            logger.debug(
-                f"Titulky.com: Finding exact match between subtitle names {sub_names} and episode titles {episode_titles}"
-            )
-            if _contains_element(_from=episode_titles,
-                                 _in=sub_names,
-                                 exactly=True):
-                matches.add('episode_title')
+            if len(sub_names) > 0:
+                series_names = [video.series] + video.alternative_series
+                logger.debug(
+                    f"Titulky.com: Finding exact match between subtitle names {sub_names} and series names {series_names}"
+                )
+                if _contains_element(_from=series_names,
+                                     _in=sub_names,
+                                     exactly=True):
+                    matches.add('series')
+
+                # match episode title
+                episode_titles = [video.title]
+                logger.debug(
+                    f"Titulky.com: Finding exact match between subtitle names {sub_names} and episode titles {episode_titles}"
+                )
+                if _contains_element(_from=episode_titles,
+                                     _in=sub_names,
+                                     exactly=True):
+                    matches.add('episode_title')
 
         elif _type == 'movie':
-            ## MOVIE
+            # MOVIE
 
             # match imdb_id of a movie
             if video.imdb_id and video.imdb_id == self.imdb_id:
@@ -152,7 +176,7 @@ class TitulkySubtitle(Subtitle):
                              exactly=True):
                 matches.add('title')
 
-        ## MOVIE OR EPISODE
+        # MOVIE OR EPISODE
 
         # match year
         if video.year and video.year == self.year:
@@ -163,8 +187,7 @@ class TitulkySubtitle(Subtitle):
             matches |= guess_matches(video, guessit(release, {"type": _type}))
 
         # If turned on in settings, then do not match if video FPS is not equal to subtitle FPS
-        if self.skip_wrong_fps and video.fps and self.fps and not framerate_equal(
-                video.fps, self.fps):
+        if self.skip_wrong_fps and video.fps and self.fps and not framerate_equal(video.fps, self.fps):
             logger.info(f"Titulky.com: Skipping subtitle {self}: wrong FPS")
             matches.clear()
 
@@ -213,16 +236,13 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
             raise ConfigurationError("Username and password must be specified!")
 
         if type(skip_wrong_fps) is not bool:
-            raise ConfigurationError(
-                f"Skip_wrong_fps {skip_wrong_fps} must be a boolean!")
+            raise ConfigurationError(f"Skip_wrong_fps {skip_wrong_fps} must be a boolean!")
 
         if type(approved_only) is not bool:
-            raise ConfigurationError(
-                f"Approved_only {approved_only} must be a boolean!")
+            raise ConfigurationError(f"Approved_only {approved_only} must be a boolean!")
 
         if type(multithreading) is not bool:
-            raise ConfigurationError(
-                f"Multithreading {multithreading} must be a boolean!")
+            raise ConfigurationError(f"Multithreading {multithreading} must be a boolean!")
 
         self.username = username
         self.password = password
@@ -243,7 +263,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         # Set headers
         cached_user_agent = cache.get('titulky_user_agent')
         if cached_user_agent == NO_VALUE:
-            new_user_agent = AGENT_LIST[ randint(0, len(AGENT_LIST) - 1) ]
+            new_user_agent = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
             cache.set('titulky_user_agent', new_user_agent)
             self.session.headers['User-Agent'] = new_user_agent
         else:
@@ -271,7 +291,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
             return True
 
         logger.info("Titulky.com: Logging in...")
-        
+
         data = {'LoginName': self.username, 'LoginPassword': self.password}
         res = self.session.post(self.server_url,
                                 data,
@@ -312,22 +332,24 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         else:
             raise AuthenticationError("Logout failed.")
 
-    # GET request a page. This functions acts as a requests.session.get proxy handling expired cached cookies
+    # GET request a page. This function acts as a requests.session.get proxy handling expired cached cookies
     # and subsequent relogging and sending the original request again. If all went well, returns the response.
-    def get_request(self, url, ref=server_url, recursion=0):
-        # That's deep... recursion... Stop. We don't have infinite memmory. And don't want to
+    def get_request(self, url, ref=server_url, allow_redirects=False, _recursion=0):
+        # That's deep... recursion... Stop. We don't have infinite memory. And don't want to
         # spam titulky's server either. So we have to just accept the defeat. Let it throw!
-        if recursion >= 5:
-            logger.debug(f"Titulky.com: Got into a loop while trying to send a request after relogging.")
-            raise AuthenticationError("Got into a loop and couldn't get authenticated!")
+        if _recursion >= 5:
+            logger.debug(
+                f"Titulky.com: Got into a loop while trying to send a request after relogging.")
+            raise AuthenticationError(
+                "Got into a loop and couldn't get authenticated!")
 
         logger.debug(f"Titulky.com: Fetching url: {url}")
 
         res = self.session.get(
             url,
             timeout=self.timeout,
-            allow_redirects=False,
-            headers={'Referer': quote(ref) if ref else None})  # URL encode ref if it has value
+            allow_redirects=allow_redirects,
+            headers={'Referer': quote(ref) if ref else None})  # URL encode ref if it has value
 
         # Check if we got redirected because login cookies expired.
         # Note: microoptimization - don't bother parsing qs for non 302 responses.
@@ -336,15 +358,12 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
             if location_qs['msg_type'][0] == 'e' and "Přihlašte se" in location_qs['msg'][0]:
                 logger.debug(f"Titulky.com: Login cookies expired.")
                 self.login(True)
-                return self.get_request(url, ref=ref, recursion=(recursion + 1))
-
-        return res
-
+                return self.get_request(url, ref=ref, _recursion=(_recursion + 1))
 
-    def fetch_page(self, url, ref=None):
-        logger.debug(f"Titulky.com: Fetching url: {url}")
+        return res
 
-        res = self.get_request(url, ref=ref)
+    def fetch_page(self, url, ref=server_url, allow_redirects=False):
+        res = self.get_request(url, ref=ref, allow_redirects=allow_redirects)
 
         if res.status_code != 200:
             raise HTTPError(f"Fetch failed with status code {res.status_code}")
@@ -353,13 +372,9 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
 
         return res.text
 
-    def build_search_url(self, params):
+    def build_url(self, params):
         result = f"{self.server_url}/?"
 
-        params['action'] = 'search'
-        # Requires subtitle names to match full search keyword
-        params['fsf'] = 1
-
         for key, value in params.items():
             result += f'{key}={value}&'
@@ -371,9 +386,39 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
 
         return result
 
+    # Makes sure the function communicates with the caller as expected. For threads, do not return data, but
+    # pass them via the threads_data object. For synchronous calls, treat it normally, without any changes.
+    def capable_of_multithreading(func):
+        def outer_func(*args, **kwargs):
+            if 'threads_data' in kwargs and 'thread_id' in kwargs:
+                if type(kwargs['threads_data']) is list and type(kwargs['thread_id']) is int:
+                    try:
+                        func_kwargs = kwargs.copy()
+                        func_kwargs.pop('threads_data', None)
+                        func_kwargs.pop('thread_id', None)
+
+                        returnValue = func(*args, **func_kwargs)
+                        kwargs['threads_data'][kwargs['thread_id']] = {
+                            'return_value': returnValue,
+                            'exception': None
+                        }
+
+                    except BaseException as e:
+                        kwargs['threads_data'][kwargs['thread_id']] = {
+                            'return_value': None,
+                            'exception': e
+                        }
+                        raise e
+            else:
+                return func(*args, **kwargs)
+
+        return outer_func
+
+    # TODO: Parse name and alternative names of a series / movie
     # Parse details of an individual subtitle: imdb_id, release, language, uploader, fps and year
-    def parse_details(self, details_url, search_url):
-        html_src = self.fetch_page(details_url, ref=search_url)
+    @capable_of_multithreading
+    def parse_details(self, partial_info, ref_url=None):
+        html_src = self.fetch_page(partial_info['details_link'], ref=ref_url)
 
         details_page_soup = ParserBeautifulSoup(html_src,
                                                 ['lxml', 'html.parser'])
@@ -383,9 +428,9 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
             logger.info("Titulky.com: Could not find details div container. Skipping.")
             return False
 
-        ### IMDB ID
+        # IMDB ID
         imdb_id = None
-        imdb_tag = details_container.find('a', attrs={'target': 'imdb'})
+        imdb_tag = details_page_soup.find('a', attrs={'target': re.compile(r"imdb", re.IGNORECASE)})
         if imdb_tag:
             imdb_url = imdb_tag.get('href')
@@ -394,20 +439,19 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         if not imdb_id:
             logger.debug("Titulky.com: No IMDB ID supplied on details page.")
 
-        ### RELEASE
+        # RELEASE
         release = None
         release_tag = details_container.find('div', class_='releas')
 
         if not release_tag:
-            raise ParseResponseError(
-                "Could not find release tag. Did the HTML source change?")
+            raise ParseResponseError("Could not find release tag. Did the HTML source change?")
 
         release = release_tag.get_text(strip=True)
 
         if not release:
             logger.debug("Titulky.com: No release information supplied on details page.")
 
-        ### LANGUAGE
+        # LANGUAGE
         language = None
         czech_flag = details_container.select('img[src*=\'flag-CZ\']')
         slovak_flag = details_container.select('img[src*=\'flag-SK\']')
@@ -420,7 +464,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         if not language:
             logger.debug("Titulky.com: No language information supplied on details page.")
 
-        ### UPLOADER
+        # UPLOADER
         uploader = None
         uploader_tag = details_container.find('div', class_='ulozil')
@@ -441,7 +485,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         if not uploader:
             logger.debug("Titulky.com: No uploader name supplied on details page.")
 
-        ### FPS
+        # FPS
         fps = None
         fps_icon_tag_selection = details_container.select(
             'img[src*=\'Movieroll\']')
@@ -462,7 +506,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         if not fps:
             logger.debug("Titulky.com: No fps supplied on details page.")
 
-        ### YEAR
+        # YEAR
         year = None
         h1_tag = details_container.find('h1', id='titulky')
@@ -482,7 +526,8 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         details_page_soup = None
 
         # Return the subtitle details
-        return {
+
+        info = {
             'releases': [release],
             'language': language,
             'uploader': uploader,
@@ -491,88 +536,141 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
             'imdb_id': imdb_id
         }
 
+        info.update(partial_info)
+
+        return info
+
+    # Process a single row of subtitles from a query method
+    @capable_of_multithreading
     def process_row(self,
                     row,
                     video_names,
-                    search_url,
-                    thread_id=None,
-                    threads_data=None):
-        try:
-            # The first anchor tag is an image preview, the second is the name
-            anchor_tag = row.find_all('a')[1]
-            # The details link is relative, so we need to remove the dot at the beginning
-            details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}"
-            id_match = re.findall(r'id=(\d+)', details_link)
-            sub_id = id_match[0] if len(id_match) > 0 else None
-            download_link = f"{self.download_url}{sub_id}"
-
-            # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
-            approved = True if 'pbl1' in row.get('class') else False
-
-            # Subtitle name + its alternative names
-            table_columns = row.findAll('td')
-            main_sub_name = anchor_tag.get_text(strip=True)
-
-            alt_sub_names = [
-                alt_sub_name.strip()
-                for alt_sub_name in table_columns[2].string.split('/')
-            ] if table_columns[2].string else []
-            sub_names = [main_sub_name] + alt_sub_names
-
-            # Does at least one subtitle name contain one of the video names?
-            # Skip subtitles that do not match
-            # Video names -> the main title and alternative titles of a movie or an episode and so on...
-            # Subtitle names -> the main name and alternative names of a subtitle displayed in search results.
-            # Could be handled in TitulkySubtitle class, however we want to keep the number of requests
-            # as low as possible and this prevents the from requesting the details page unnecessarily
-            if not _contains_element(_from=video_names, _in=sub_names):
-                logger.info(
-                    f"Titulky.com: Skipping subtitle with names: {sub_names}, because there was no match with video names: {video_names}"
-                )
-                if type(threads_data) is list and type(thread_id) is int:
-                    threads_data[thread_id] = {
-                        'sub_info': None,
-                        'exception': None
-                    }
-
-                return None
-
-            details = self.parse_details(details_link, search_url)
-            if not details:
-                # Details parsing was NOT successful, skipping
-                if type(threads_data) is list and type(thread_id) is int:
-                    threads_data[thread_id] = {
-                        'sub_info': None,
-                        'exception': None
-                    }
-
-                return None
-
-            # Combine all subtitle data into one dict
-            result = {
-                'names': sub_names,
-                'id': sub_id,
-                'approved': approved,
-                'details_link': details_link,
-                'download_link': download_link
-            }
-
-            result.update(details)
-
-            if type(threads_data) is list and type(thread_id) is int:
-                threads_data[thread_id] = {
-                    'sub_info': result,
-                    'exception': None
-                }
+                    ref_url):
+        # The first anchor tag is an image preview, the second is the name
+        anchor_tag = row.find_all('a')[1]
+        # The details link is relative, so we need to remove the dot at the beginning
+        details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}"
+        id_match = re.findall(r'id=(\d+)', details_link)
+        sub_id = id_match[0] if len(id_match) > 0 else None
+        download_link = f"{self.download_url}{sub_id}"
+
+        # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
+        approved = True if 'pbl1' in row.get('class') else False
+
+        # Subtitle name + its alternative names
+        table_columns = row.findAll('td')
+        main_sub_name = anchor_tag.get_text(strip=True)
+
+        alt_sub_names = [
+            alt_sub_name.strip()
+            for alt_sub_name in table_columns[2].string.split('/')
+        ] if table_columns[2].string else []
+        sub_names = [main_sub_name] + alt_sub_names
+
+        # Does at least one subtitle name contain one of the video names?
+        # Skip subtitles that do not match
+        # Video names -> the main title and alternative titles of a movie or an episode and so on...
+        # Subtitle names -> the main name and alternative names of a subtitle displayed in search results.
+        # Could be handled in TitulkySubtitle class, however we want to keep the number of requests
+        # as low as possible and this prevents them from requesting the details page unnecessarily
+        if not _contains_element(_from=video_names, _in=sub_names):
+            logger.info(
+                f"Titulky.com: Skipping subtitle with names: {sub_names}, because there was no match with video names: {video_names}"
+            )
+            return None
+
+        partial_info = {
+            'names': sub_names,
+            'id': sub_id,
+            'approved': approved,
+            'details_link': details_link,
+            'download_link': download_link
+        }
+        details = self.parse_details(partial_info, ref_url)
+
+        return details
+
+    #########
+    # FIXME: After switching from the multithreaded to the single-threaded option, the provider does not return
+    # any data and requires bazarr to restart in order to work again with this setting. However,
+    # switching back to multithreaded does NOT require bazarr to be restarted.
+    ####
+    # Execute a func for each array member and return results. Handles the async/sync side of things.
+    def execute_foreach(self, array, func, args=[], kwargs={}):
+        if not self.multithreading:
+            logger.info("Titulky.com: processing in sequence")
+
+            result_array = []
+            for i, obj in enumerate(array):
+                passing_args = [obj] + args
+                return_value = func(*passing_args, **kwargs)
+
+                if return_value:
+                    result_array.append(return_value)
+                else:
+                    logger.debug(f"Titulky.com: No data returned, element number: {i}")
+
+            return result_array
+        else:
+            logger.info(f"Titulky.com: processing in parallel, {self.max_threads} elements at a time.")
+            array_length = len(array)
 
-            return details
-        except Exception as e:
-            if type(threads_data) is list and type(thread_id) is int:
-                threads_data[thread_id] = {'sub_info': None, 'exception': e}
+            threads = [None] * array_length
+            threads_data = [None] * array_length
 
-            raise e
+            # Process in parallel, self.max_threads at a time.
+            cycles = math.ceil(array_length / self.max_threads)
+            for i in range(cycles):
+                # Batch number i
+                starting_index = i * self.max_threads  # Inclusive
+                ending_index = starting_index + self.max_threads  # Non-inclusive
+
+                # Create threads for all elements in this batch
+                for j in range(starting_index, ending_index):
+                    # Check if j-th element exists
+                    if j < array_length:
+                        # Element number j
+                        logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
+                        # Merge supplied kwargs with our dict
+                        kwargs.update({
+                            'thread_id': j,
+                            'threads_data': threads_data
+                        })
+                        # Create a thread for element j and start it
+                        threads[j] = Thread(
+                            target=func,
+                            args=[array[j]] + args,
+                            kwargs=kwargs
+                        )
+                        threads[j].start()
+
+                # Wait for all created threads to finish before moving to another batch of data
+                for j in range(starting_index, ending_index):
+                    # Check if j-th data exists
+                    if j < array_length:
+                        threads[j].join()
+
+            result_array = []
+            # Process the resulting data from all threads
+            for i in range(len(threads_data)):
+                thread_data = threads_data[i]
+
+                # If the thread didn't communicate at all
+                if not thread_data:
+                    raise ProviderError(f"No communication from thread ID: {i}")
+
+                # If an exception was raised in a thread, raise it again here
+                if 'exception' in thread_data and thread_data['exception']:
+                    logger.debug(f"Titulky.com: An error occurred while processing in the thread ID {i}")
+                    raise thread_data['exception']
+
+                if 'return_value' in thread_data:
+                    result_array.append(thread_data['return_value'])
+
+            return result_array
 
     # There are multiple ways to find subs from this provider:
+    # \\ Using self.query function: "Universal search" //
     #  1. SEARCH by sub title
     #     - parameter: .................. Fulltext=<SUB NAME>
     #  2. SEARCH by imdb id
@@ -590,10 +688,120 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
     #     - parameter: .................. ASchvalene=<('1' for approved only | '-0' for subs awaiting approval | '' for all)>
     #     - redirects should NOT be allowed here
-    #  8. BROWSE subtitles by IMDB ID
+    # \\ Using self.browse function: "Episode search" //
+    #  8. BROWSE subtitles by IMDB ID (only episodes)
     #     - Subtitles are here categorised by seasons and episodes
     #     - URL: https://premium.titulky.com/?action=serial&step=<SEASON>&id=<IMDB ID>
     #     - it seems that the url redirects to a page with their own internal ID, redirects should be allowed here
+
+    # Special search only for episodes. Complements the query method of searching.
+    def browse_episodes(self,
+                        imdb_id=None,
+                        season=None,
+                        episode=None):
+
+        params = {
+            'action': 'serial',
+            'step': season,
+            # Remove the "tt" prefix
+            'id': imdb_id[2:]
+        }
+        browse_url = self.build_url(params)
+        html_src = self.fetch_page(browse_url, allow_redirects=True)
+
+        browse_page_soup = ParserBeautifulSoup(
+            html_src, ['lxml', 'html.parser'])
+        # Container element containing subtitle div rows, None if the series was not found or similar
+        container = browse_page_soup.find('form', class_='cloudForm')
+
+        # No container with subtitles
+        if not container:
+            logger.debug("Titulky.com: Could not find container element. No subtitles found.")
+            return []
+
+        # All rows: subtitle rows, episode number rows, useless rows... Gotta filter this out.
+        all_rows = container.find_all('div', class_='row')
+
+        # Filtering and parsing rows
+        episodes_dict = {}
+        last_ep_num = None
+        for row in all_rows:
+            # This element holds the episode number of the following row(s) of subtitles
+            # E.g.: 1., 2., 3., 4.
+            episode_num = row.find('h5')
+            # Link to the sub details
+            details_anchor = row.find('a') if 'pbl1' in row['class'] or 'pbl0' in row['class'] else None
+
+            if episode_num:
+                # The row is a div with the episode number as its text content
+                try:
+                    # Remove the period at the end and parse the string into a number
+                    number = int(episode_num.string.replace('.', ''))
+                    last_ep_num = number
+                except:
+                    logger.debug("Titulky.com: An error occurred while parsing the episode number!")
+                    raise ProviderError("Could not parse episode number!")
+            elif details_anchor:
+                # The row is a subtitles row. Contains a link to the details page.
+                if not last_ep_num:
+                    logger.debug("Titulky.com: No previous episode number!")
+                    raise ProviderError("Previous episode number missing, can't parse.")
+
+                # If this row contains the first subtitles to an episode number,
+                # add an empty array into the episodes dict at its place.
+                if not last_ep_num in episodes_dict:
+                    episodes_dict[last_ep_num] = []
+
+                details_link = f"{self.server_url}{details_anchor.get('href')[1:]}"
+                id_match = re.findall(r'id=(\d+)', details_link)
+                sub_id = id_match[0] if len(id_match) > 0 else None
+                download_link = f"{self.download_url}{sub_id}"
+                # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
+                approved = True if 'pbl1' in row.get('class') else False
+
+                result = {
+                    'names': [],
+                    'id': sub_id,
+                    'approved': approved,
+                    'details_link': details_link,
+                    'download_link': download_link
+                }
+
+                episodes_dict[last_ep_num].append(result)
+
+        # Rows parsed into episodes_dict, now let's read what we got.
+        if not episode in episodes_dict:
+            # Well, we got nothing, that happens!
+            logger.debug("Titulky.com: No subtitles found")
+            return []
+
+        # Let's parse more details about the subtitles that we actually care about
+        subtitle_details = self.execute_foreach(episodes_dict[episode], self.parse_details)
+
+        # After parsing, create new instances of the Subtitle class
+        subtitles = []
+        for sub_info in subtitle_details:
+            subtitle_instance = self.subtitle_class(
+                sub_info['id'],
+                imdb_id,
+                sub_info['language'],
+                sub_info['names'],
+                season,
+                episode,
+                sub_info['year'],
+                sub_info['releases'],
+                sub_info['fps'],
+                sub_info['uploader'],
+                sub_info['approved'],
+                sub_info['details_link'],
+                sub_info['download_link'],
+                skip_wrong_fps=self.skip_wrong_fps,
+                asked_for_episode=True)
+            subtitles.append(subtitle_instance)
+
+        return subtitles
+
+    # Universal search for subtitles. Searches both episodes and movies.
     def query(self,
               language,
              video_names,
@@ -603,8 +811,12 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
               season=None,
               episode=None,
              imdb_id=None):
-        ## Build the search URL
-        params = {}
+        # Build the search URL
+        params = {
+            'action': 'search',
+            # Requires subtitle names to match full search keyword
+            'fsf': 1
+        }
 
         # Keyword
         if keyword:
@@ -641,9 +853,9 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         else:
             params['ASchvalene'] = ''
 
-        search_url = self.build_search_url(params)
+        search_url = self.build_url(params)
 
-        ## Search results page parsing
+        # Search results page parsing
        html_src = self.fetch_page(search_url)
         search_page_soup = ParserBeautifulSoup(html_src,
                                                ['lxml', 'html.parser'])
@@ -669,138 +881,32 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         table_body = table.find('tbody')
         if not table_body:
             logger.debug("Titulky.com: Could not find table body")
-            raise ParseResponseError(
-                "Could not find table body. Did the HTML source change?")
+            raise ParseResponseError("Could not find table body. Did the HTML source change?")
 
-        ## Loop over all subtitles on the first page and put them in a list
+        # Loop over all subtitles on the first page and put them in a list
         subtitles = []
         rows = table_body.find_all('tr')
-
-        if not self.multithreading:
-            # Process the rows sequentially
-            logger.info("Titulky.com: processing results in sequence")
-            for i, row in enumerate(rows):
-                sub_info = self.process_row(row, video_names, search_url)
-
-                # If subtitle info was returned, then everything was okay
-                # and we can instationate it and add it to the list
-                if sub_info:
-                    logger.debug(
-                        f"Titulky.com: Sucessfully retrieved subtitle info, row: {i}"
-                    )
-
-                    # If we found the subtitle by IMDB ID, no need to get it from details page
-                    sub_imdb_id = imdb_id or sub_info['imdb_id']
-
-                    subtitle_instance = self.subtitle_class(
-                        sub_info['id'],
-                        sub_imdb_id,
-                        sub_info['language'],
-                        sub_info['names'],
-                        season,
-                        episode,
-                        sub_info['year'],
-                        sub_info['releases'],
-                        sub_info['fps'],
-                        sub_info['uploader'],
-                        sub_info['approved'],
-                        sub_info['details_link'],
-                        sub_info['download_link'],
-                        skip_wrong_fps=self.skip_wrong_fps,
-                        asked_for_episode=(type == 'episode'))
-                    subtitles.append(subtitle_instance)
-                else:
-                    # No subtitle info was returned, i. e. something unexpected
-                    # happend during subtitle details page fetching and processing.
-                    logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
-        else:
-            # Process the rows in paralell
-            logger.info(
-                f"Titulky.com: processing results in parelell, {self.max_threads} rows at a time."
-            )
-
-            threads = [None] * len(rows)
-            threads_data = [None] * len(rows)
-
-            # Process rows in parallel, self.max_threads at a time.
-            cycles = math.ceil(len(rows) / self.max_threads)
-            for i in range(cycles):
-                # Batch number i
-                starting_index = i * self.max_threads  # Inclusive
-                ending_index = starting_index + self.max_threads  # Non-inclusive
-
-                # Create threads for all rows in this batch
-                for j in range(starting_index, ending_index):
-                    # Check if j-th row exists
-                    if j < len(rows):
-                        # Row number j
-                        logger.debug(
-                            f"Titulky.com: Creating thread {j} (batch: {i})")
-                        # Create a thread for row j and start it
-                        threads[j] = Thread(
-                            target=self.process_row,
-                            args=[rows[j], video_names, search_url],
-                            kwargs={
-                                'thread_id': j,
-                                'threads_data': threads_data
-                            })
-                        threads[j].start()
-
-                # Wait for all created threads to finish before moving to another batch of rows
-                for j in range(starting_index, ending_index):
-                    # Check if j-th row exists
-                    if j < len(rows):
-                        threads[j].join()
-
-            # Process the resulting data from all threads
-            for i in range(len(threads_data)):
-                thread_data = threads_data[i]
-
-                # If the thread returned didn't return anything, but expected a dict object
-                if not thread_data:
-                    raise ProviderError(f"No data returned from thread ID: {i}")
-
-                # If an exception was raised in a thread, raise it again here
-                if 'exception' in thread_data and thread_data['exception']:
-                    logger.debug(
-                        f"Titulky.com: An error occured while processing a row in the thread ID {i}"
-                    )
-                    raise thread_data['exception']
-
-                # If the thread returned a subtitle info, great, instantiate it and add it to the list
-                if 'sub_info' in thread_data and thread_data['sub_info']:
-                    # Instantiate the subtitle object
-                    logger.debug(
-                        f"Titulky.com: Sucessfully retrieved subtitle info, thread ID: {i}"
-                    )
-                    sub_info = thread_data['sub_info']
-
-                    # If we found the subtitle by IMDB ID, no need to get it from details page
-                    sub_imdb_id = imdb_id or sub_info['imdb_id']
-
-                    subtitle_instance = self.subtitle_class(
-                        sub_info['id'],
-                        sub_imdb_id,
-                        sub_info['language'],
-                        sub_info['names'],
-                        season,
-                        episode,
-                        sub_info['year'],
-                        sub_info['releases'],
-                        sub_info['fps'],
-                        sub_info['uploader'],
-                        sub_info['approved'],
-                        sub_info['details_link'],
-                        sub_info['download_link'],
-                        skip_wrong_fps=self.skip_wrong_fps,
-                        asked_for_episode=(type == 'episode'))
-                    subtitles.append(subtitle_instance)
-                else:
-                    # The thread returned data, but it didn't contain a subtitle info, i. e. something unexpected
-                    # happend during subtitle details page fetching and processing.
-                    logger.debug(
-                        f"Titulky.com: No subtitle info retrieved, thread ID: {i}"
-                    )
+        for sub_info in self.execute_foreach(rows, self.process_row, args=[video_names, search_url]):
+            # If we found the subtitle by IMDB ID, no need to get it from details page
+            sub_imdb_id = imdb_id or sub_info['imdb_id']
+
+            subtitle_instance = self.subtitle_class(
+                sub_info['id'],
+                sub_imdb_id,
+                sub_info['language'],
+                sub_info['names'],
+                season,
+                episode,
+                sub_info['year'],
+                sub_info['releases'],
+                sub_info['fps'],
+                sub_info['uploader'],
+                sub_info['approved'],
+                sub_info['details_link'],
+                sub_info['download_link'],
+                skip_wrong_fps=self.skip_wrong_fps,
+                asked_for_episode=(type == 'episode'))
+            subtitles.append(subtitle_instance)
 
         # Clean up
         search_page_soup.decompose()
@@ -814,20 +920,28 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         subtitles = []
 
         # Possible paths:
+        # (0) Special for episodes: Browse TV Series page and search for subtitles
         # (1) Search by IMDB ID [and season/episode for tv series]
         # (2) Search by keyword: video (title|series) [and season/episode for tv series]
         # (3) Search by keyword: video series + S00E00 (tv series only)
 
         for language in languages:
             if isinstance(video, Episode):
-                video_names = [video.series, video.title
-                               ] + video.alternative_series
+                video_names = [video.series, video.title] + video.alternative_series
+
+                # (0)
+                if video.series_imdb_id:
+                    logger.info("Titulky.com: Finding subtitles by browsing TV Series page (0)")
+                    partial_subs = self.browse_episodes(imdb_id=video.series_imdb_id,
+                                                        season=video.season,
+                                                        episode=video.episode)
+                    if (len(partial_subs) > 0):
+                        subtitles += partial_subs
+                        continue
 
                 # (1)
-                logger.info(
-                    "Titulky.com: Finding subtitles by IMDB ID, Season and Episode (1)"
-                )
                 if video.series_imdb_id:
+                    logger.info("Titulky.com: Finding subtitles by IMDB ID, Season and Episode (1)")
                     partial_subs = self.query(language,
                                               video_names,
                                               'episode',
@@ -839,9 +953,7 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
                         continue
 
                 # (2)
-                logger.info(
-                    "Titulky.com: Finding subtitles by keyword, Season and Episode (2)"
-                )
+                logger.info("Titulky.com: Finding subtitles by keyword, Season and Episode (2)")
                 keyword = video.series
                 partial_subs = self.query(language,
                                           video_names,
@@ -865,8 +977,8 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
                 video_names = [video.title] + video.alternative_titles
 
                 # (1)
-                logger.info("Titulky.com: Finding subtitles by IMDB ID (1)")
                 if video.imdb_id:
+                    logger.info("Titulky.com: Finding subtitles by IMDB ID (1)")
                     partial_subs = self.query(language,
                                               video_names,
                                               'movie',
@@ -892,46 +1004,25 @@ class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
         try:
             res.raise_for_status()
         except:
-            raise HTTPError(
-                f"An error occured during the download request to {subtitle.download_link}"
-            )
+            raise HTTPError(f"An error occurred during the download request to {subtitle.download_link}")
 
         archive_stream = io.BytesIO(res.content)
         archive = None
         if rarfile.is_rarfile(archive_stream):
             logger.debug("Titulky.com: Identified rar archive")
             archive = rarfile.RarFile(archive_stream)
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
+            subtitle_content = self.get_subtitle_from_archive(
+                subtitle, archive)
         elif zipfile.is_zipfile(archive_stream):
             logger.debug("Titulky.com: Identified zip archive")
             archive = zipfile.ZipFile(archive_stream)
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
+            subtitle_content = self.get_subtitle_from_archive(
+                subtitle, archive)
         else:
            subtitle_content = fix_line_ending(res.content)
 
         if not subtitle_content:
-            logger.debug(
-                "Titulky.com: No subtitle content found. The downloading limit has been most likely exceeded."
-            )
-            raise DownloadLimitExceeded(
-                "Subtitles download limit has been exceeded")
+            logger.debug("Titulky.com: No subtitle content found. The download limit has most likely been exceeded.")
+            raise DownloadLimitExceeded("Subtitles download limit has been exceeded")
 
         subtitle.content = subtitle_content
-
-
-# Check if any element from source array is contained partially or exactly in any element from target array
-# Returns on the first match
-def _contains_element(_from=None, _in=None, exactly=False):
-    source_array = _from
-    target_array = _in
-
-    for source in source_array:
-        for target in target_array:
-            if exactly:
-                if sanitize(source) == sanitize(target):
-                    return True
-            else:
-                if sanitize(source) in sanitize(target):
-                    return True
-
-    return False
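The name-matching helper `_contains_element`, which this commit moves to module scope, short-circuits on the first source element that is found (exactly, or as a substring) in any target element after both sides are normalized. A minimal sketch of those semantics, with subliminal's `sanitize` replaced by a simplified stand-in (assumption: the real one normalizes punctuation and case more carefully):

```python
# Simplified stand-in for subliminal's sanitize(): lowercase, drop punctuation.
def sanitize(text):
    return ''.join(ch.lower() for ch in text if ch.isalnum() or ch.isspace()).strip()


def _contains_element(_from=None, _in=None, exactly=False):
    # Return True on the first source element found in any target element.
    for source in _from:
        for target in _in:
            if exactly:
                if sanitize(source) == sanitize(target):
                    return True
            elif sanitize(source) in sanitize(target):
                return True
    return False


# Partial match: the video title appears inside a longer subtitle name.
assert _contains_element(_from=['Dune'], _in=['Dune.2021.1080p.WEB-DL'])
# The exact mode is stricter and fails for the same pair.
assert not _contains_element(_from=['Dune'], _in=['Dune.2021.1080p.WEB-DL'], exactly=True)
```

This pre-filter is what lets `process_row` skip the details-page request for rows whose names share nothing with the video's titles.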
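The reworked `get_request` is a bounded retry loop: redirects stay disabled by default so that an authentication redirect (a 302 whose Location query string carries an error message) can be detected, the session logs in again, and the original request is replayed at most five times. A standalone sketch under stated assumptions — `SERVER_URL` and `login()` are placeholders, and the message-text check is reduced to `msg_type` (the real provider also looks for the Czech "Přihlašte se" text):

```python
from urllib.parse import parse_qs, quote, urlparse

import requests

SERVER_URL = 'https://premium.titulky.com'  # placeholder


def login(session):
    # Placeholder: the real provider POSTs LoginName/LoginPassword here
    # and caches the resulting session cookies.
    pass


def get_request(session, url, ref=SERVER_URL, allow_redirects=False, _recursion=0):
    if _recursion >= 5:
        raise RuntimeError("Got into a loop and couldn't get authenticated!")

    res = session.get(url,
                      timeout=30,
                      allow_redirects=allow_redirects,
                      headers={'Referer': quote(ref) if ref else None})

    # A 302 whose Location query string carries msg_type=e signals expired
    # login cookies; re-authenticate and replay the original request.
    if res.status_code == 302:
        location_qs = parse_qs(urlparse(res.headers.get('Location', '')).query)
        if location_qs.get('msg_type', [''])[0] == 'e':
            login(session)
            return get_request(session, url, ref=ref, _recursion=_recursion + 1)

    return res
```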
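The new `capable_of_multithreading` decorator is the heart of the refactor: the same function body can be called synchronously (plain return value) or as a `Thread` target, in which case the result or the raised exception is written into a slot of a shared `threads_data` list instead. A hedged sketch of the pattern outside the provider class, with a trivial `square` function standing in for `process_row`/`parse_details`:

```python
import threading


def capable_of_multithreading(func):
    # Route results through threads_data[thread_id] when called as a thread
    # target; behave like a normal function otherwise.
    def wrapper(*args, **kwargs):
        threads_data = kwargs.pop('threads_data', None)
        thread_id = kwargs.pop('thread_id', None)
        if isinstance(threads_data, list) and isinstance(thread_id, int):
            try:
                threads_data[thread_id] = {'return_value': func(*args, **kwargs),
                                           'exception': None}
            except BaseException as e:
                threads_data[thread_id] = {'return_value': None, 'exception': e}
                raise
        else:
            return func(*args, **kwargs)
    return wrapper


@capable_of_multithreading
def square(x):
    return x * x


assert square(3) == 9  # synchronous call: behaviour unchanged

data = [None]  # one slot per thread
t = threading.Thread(target=square, args=[4],
                     kwargs={'threads_data': data, 'thread_id': 0})
t.start()
t.join()
assert data[0] == {'return_value': 16, 'exception': None}
```

Threads cannot hand a return value back to the caller, so the decorator turns the shared list into the communication channel while leaving ordinary calls untouched.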
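Building on that decorator, `execute_foreach` runs one function over an array either sequentially or in batches of `max_threads` threads and then collects the per-slot results. A condensed sketch (reusing the decorated `square` from the previous snippet); note two deliberate divergences from the patch: the kwargs dict is copied per thread rather than mutated in place, and `None` results are filtered in both paths, which the patch's threaded branch does not do:

```python
import math
import threading


def execute_foreach(array, func, args=[], kwargs={}, max_threads=5):
    n = len(array)
    threads = [None] * n
    threads_data = [None] * n

    # Process in batches of max_threads, joining each batch before the next.
    for batch in range(math.ceil(n / max_threads)):
        start = batch * max_threads        # inclusive
        end = min(start + max_threads, n)  # non-inclusive
        for j in range(start, end):
            # Copy kwargs per thread so slots and IDs are never shared state.
            kw = dict(kwargs, thread_id=j, threads_data=threads_data)
            threads[j] = threading.Thread(target=func, args=[array[j]] + args, kwargs=kw)
            threads[j].start()
        for j in range(start, end):
            threads[j].join()

    results = []
    for i, slot in enumerate(threads_data):
        if not slot:
            raise RuntimeError(f"No communication from thread ID: {i}")
        if slot['exception']:
            raise slot['exception']
        if slot['return_value'] is not None:
            results.append(slot['return_value'])
    return results


assert execute_foreach([1, 2, 3, 4, 5], square, max_threads=2) == [1, 4, 9, 16, 25]
```

The per-thread copy also plausibly bears on the FIXME above: the patch's `kwargs={}` is a mutable default that `kwargs.update(...)` pollutes with `thread_id`/`threads_data`, so a later single-threaded run passes them straight through and the decorator routes every result away from the caller — matching the reported "no data until restart" symptom.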
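Search path (0) in `list_subtitles` hinges on the renamed `build_url` plus the serial-browse parameters: `action=serial`, the season as `step`, and the numeric part of the series IMDB ID with the `tt` prefix stripped. A small sketch of the URL that `browse_episodes` requests (the IMDB ID below is only an example):

```python
def build_url(server_url, params):
    # The provider appends key=value pairs and trims the trailing '&';
    # a join produces the same string.
    return f"{server_url}/?" + '&'.join(f'{key}={value}' for key, value in params.items())


params = {
    'action': 'serial',
    'step': 2,              # season number
    'id': 'tt0944947'[2:]   # strip the "tt" prefix -> '0944947'
}
assert build_url('https://premium.titulky.com', params) == \
    'https://premium.titulky.com/?action=serial&step=2&id=0944947'
```

Unlike the other search paths, this one is fetched with `allow_redirects=True`, since the site redirects the IMDB-based URL to its own internal series ID.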