author | Samuel Barták <[email protected]> | 2021-10-29 21:24:10 +0200 |
---|---|---|
committer | GitHub <[email protected]> | 2021-10-29 15:24:10 -0400 |
commit | 0e50335e2d26597cac20ee5264a78ea9c53e7ce0 (patch) | |
tree | fd2afdc62c312ce4e0450b3b4f2fcc883e46e4eb | |
parent | aaa8b48dad3a1d4de5a8f2f04fcc96cf1fb5fb3a (diff) | |
download | bazarr-0e50335e2d26597cac20ee5264a78ea9c53e7ce0.tar.gz bazarr-0e50335e2d26597cac20ee5264a78ea9c53e7ce0.zip |
Improved results with Titulky provider
-rw-r--r-- | libs/subliminal_patch/providers/titulky.py | 192 |
1 file changed, 120 insertions, 72 deletions
```diff
diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py
index e20bf0adc..272aee8b6 100644
--- a/libs/subliminal_patch/providers/titulky.py
+++ b/libs/subliminal_patch/providers/titulky.py
@@ -36,10 +36,10 @@ class TitulkySubtitle(Subtitle):
     hash_verifiable = False
     hearing_impaired_verifiable = False
 
-    def __init__(self, sub_id, language, title, year, release_info, fps, uploader, approved, page_link, download_link, season=None, episode=None, skip_wrong_fps=False):
+    def __init__(self, sub_id, language, names, season, episode, year, release_info, fps, uploader, approved, page_link, download_link, skip_wrong_fps=False):
         super().__init__(language, page_link=page_link)
 
-        self.title = title
+        self.names = names
         self.year = year
         self.sub_id = sub_id
         self.fps = fps
@@ -73,21 +73,17 @@ class TitulkySubtitle(Subtitle):
             if self.episode and self.episode == video.episode:
                 matches.add('episode')
 
-            if self.season is None and self.episode is None:
-                matches.add('episode')
-
-            if sanitize(f"S{video.season:02d}E{video.episode:02d}") in sanitize(self.title):
-                matches.add('season')
-                matches.add('episode')
-
-            if video.series and sanitize(video.series) in sanitize(self.title):
+            name_matches = [video.series and sanitize(name) in sanitize(video.series) for name in self.names]
+            if any(name_matches):
                 matches.add('series')
-
+
         elif _type == 'movie':
             ## MOVIE
-            if video.title and sanitize(video.title) in sanitize(self.title):
-                matches.add('title')
+            name_matches = [video.title and sanitize(name) in sanitize(video.title) for name in self.names]
+            if any(name_matches):
+                matches.add('title')
+
         ## MOVIE OR EPISODE
         if video.year and video.year == self.year:
             matches.add('year')
@@ -223,68 +219,90 @@ class TitulkyProvider(Provider):
 
         return result
 
-    # Parse details of an individual subtitle: title, year, language, etc.
+    # Parse details of an individual subtitle: release, language, uploader, fps and year
     def parse_details(self, url):
         html_src = self.fetch_page(url)
         details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])
 
         details_container = details_page_soup.find('div', class_='detail')
         if not details_container:
+            # The subtitles were removed and got redirected to a different page. Better treat this silently.
            logger.debug("Titulky.com: Could not find details div container. Skipping.")
            return False
 
-        ### TITLE AND YEAR
-        h1_tag = details_container.find('h1', id='titulky')
-        if not h1_tag:
-            logger.debug("Titulky.com: Could not find h1 tag. Skipping.")
-            return False
-        # The h1 tag contains the title of the subtitle and year
-        h1_texts = [text.strip() for text in h1_tag.stripped_strings]
-
-        if len(h1_texts) < 1:
-            logger.debug("Titulky.com: The header tag did not include sufficient data. Skipping.")
-            return False
-        title = h1_texts[0]
-        year = int(h1_texts[1]) if len(h1_texts) > 1 else None
-
-        ### UPLOADER
-        uploader_tag = details_container.find('div', class_='ulozil')
-        if not uploader_tag:
-            logger.debug("Titulky.com: Could not find uploader tag. Skipping.")
-            return False
-        uploader_anchor_tag = uploader_tag.find('a')
-        if not uploader_anchor_tag:
-            logger.debug("Titulky.com: Could not find uploader anchor tag. Skipping.")
-            return False
-        uploader = uploader_anchor_tag.string.strip()
-
         ### RELEASE
+        release = None
         release_tag = details_container.find('div', class_='releas')
+
         if not release_tag:
-            logger.debug("Titulky.com: Could not find releas tag. Skipping.")
-            return False
+            raise Error("Could not find release tag. Did the HTML source change?")
+
+        release = release_tag.get_text(strip=True)
+        if not release:
+            logger.info("Titulky.com: No release information supplied on details page.")
+
         ### LANGUAGE
         language = None
         czech_flag = details_container.select('img[src*=\'flag-CZ\']')
         slovak_flag = details_container.select('img[src*=\'flag-SK\']')
+
         if czech_flag and not slovak_flag:
             language = Language('ces')
         elif slovak_flag and not czech_flag:
             language = Language('slk')
+        if not language:
+            logger.debug("Titulky.com: No language information supplied on details page.")
+
+        ### UPLOADER
+        uploader = None
+        uploader_tag = details_container.find('div', class_='ulozil')
+
+        if not uploader_tag:
+            raise Error("Could not find uploader tag. Did the HTML source change?")
+
+        uploader_anchor_tag = uploader_tag.find('a')
+
+        if not uploader_anchor_tag:
+            raise Error("Could not find uploader anchor tag. Did the HTML source change?")
+
+        uploader = uploader_anchor_tag.string.strip() if uploader_anchor_tag else None
+
+        if not uploader:
+            logger.debug("Titulky.com: No uploader name supplied on details page.")
+
         ### FPS
         fps = None
         fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')
-        if len(fps_icon_tag_selection) > 0 and hasattr(fps_icon_tag_selection[0], 'parent'):
-            fps_icon_tag = fps_icon_tag_selection[0]
-            parent_text = fps_icon_tag.parent.get_text(strip=True)
-            match = re.findall('(\d+,\d+) fps', parent_text)
+        if not fps_icon_tag_selection and not hasattr(fps_icon_tag_selection[0], 'parent'):
+            raise Error("Could not find parent of the fps icon tag. Did the HTML source change?")
+
+        fps_icon_tag = fps_icon_tag_selection[0]
+        parent_text = fps_icon_tag.parent.get_text(strip=True)
+        match = re.findall('(\d+,\d+) fps', parent_text)
 
-            # If the match is found, change the decimal separator to a dot and convert to float
-            fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
+        # If the match is found, change the decimal separator to a dot and convert to float
+        fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
+
+        if not fps:
+            logger.debug("Titulky.com: No fps supplied on details page.")
+
+        ### YEAR
+        year = None
+        h1_tag = details_container.find('h1', id='titulky')
+
+        if not h1_tag:
+            raise Error("Could not find h1 tag. Did the HTML source change?")
+
+        # The h1 tag contains the name of the subtitle and a year
+        h1_texts = [text for text in h1_tag.stripped_strings]
+        year = int(h1_texts[1]) if len(h1_texts) > 1 else None
+
+        if not year:
+            logger.debug("Titulky.com: No year supplied on details page.")
+
         # Clean up
         details_page_soup.decompose()
@@ -292,19 +310,17 @@ class TitulkyProvider(Provider):
 
         # Return the subtitle details
         return {
-            'title': title,
-            'year': year,
-            'uploader': uploader,
             'release': release,
             'language': language,
-            'fps': fps
+            'uploader': uploader,
+            'fps': fps,
+            'year': year
         }
 
     def process_row(self, row, keyword, thread_id=None, threads_data=None):
         try:
-            # The first anchor tag is an image preview, the second is the title
+            # The first anchor tag is an image preview, the second is the name
             anchor_tag = row.find_all('a')[1]
-            title = anchor_tag.string.strip()
 
             # The details link is relative, so we need to remove the dot at the beginning
             details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}"
             id_match = re.findall('id=(\d+)', details_link)
@@ -314,9 +330,19 @@ class TitulkyProvider(Provider):
             # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
             approved = True if 'pbl1' in row.get('class') else False
 
-            # Skip subtitles that do not contain keyword in their title
-            if keyword and sanitize(keyword) not in sanitize(title):
-                logger.debug(f"Titulky.com: Skipping subtitle '{title}' because it does not contain the keyword '{keyword}'")
+            # Name + alternative names
+            table_columns = row.findAll("td")
+            main_name = anchor_tag.get_text(strip=True)
+            alt_names = [alt_name.strip() for alt_name in table_columns[2].get_text(strip=True).split("/")]
+            names = [main_name] + alt_names
+
+
+            # Loop over all subtitle names and check if the keyword contains them
+            name_matches = [keyword and sanitize(keyword) not in sanitize(name) for name in names]
+
+            # Skip subtitles that do not contain the keyword in their name(s)
+            if keyword and all(name_matches) is False:
+                logger.debug(f"Titulky.com: Skipping subtitle with names: '{names}', because it does not not contain the keyword: '{keyword}'")
                 if type(threads_data) is list and type(thread_id) is int:
                     threads_data[thread_id] = {
                         'sub_info': None,
@@ -336,22 +362,25 @@ class TitulkyProvider(Provider):
                 return None
 
-            # Return additional data besides the subtitle details
-            details['id'] = sub_id
-            details['approved'] = approved
-            details['details_link'] = details_link
-            details['download_link'] = download_link
+            # Combine all subtitle data into one dict
+            result = {
+                'names': names,
+                'id': sub_id,
+                'approved': approved,
+                'details_link': details_link,
+                'download_link': download_link
+            }
+
+            result.update(details)
 
             if type(threads_data) is list and type(thread_id) is int:
                 threads_data[thread_id] = {
-                    'sub_info': details,
+                    'sub_info': result,
                     'exception': None
                 }
 
             return details
-        except:
-            e = Error("Whoops, something unexpected happend while fetching or parsing details page.")
-
+        except Exception as e:
             if type(threads_data) is list and type(thread_id) is int:
                 threads_data[thread_id] = {
                     'sub_info': None,
@@ -362,7 +391,7 @@ class TitulkyProvider(Provider):
 
     # There are multiple ways to find subs from this provider:
     # 1. SEARCH by sub title
-    #    - parameter: .................. Fulltext=<SUB TITLE>
+    #    - parameter: .................. Fulltext=<SUB NAME>
     # 2. SEARCH by imdb id
     #    - parameter: .................. IMDB=<IMDB ID>
     # 3. SEARCH by season/episode
@@ -460,8 +489,18 @@ class TitulkyProvider(Provider):
             # and we can instationate it and add it to the list
             if sub_info:
                 logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, row: {i}")
-                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
-                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+
+                # Try to parse S00E00 string from the main subtitle name
+                sub_season = None
+                sub_episode = None
+                season_episode_string = re.findall('S(\d+)E(\d+)', sub_info['names'][0], re.IGNORECASE)
+                if season_episode_string:
+                    sub_season = season_episode_string[0][0]
+                    sub_episode = season_episode_string[0][1]
+
+
+                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], sub_season, sub_episode, sub_info['year'], sub_info['release'], sub_info['fps'],
+                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps)
                 subtitles.append(subtitle_instance)
             else:
                 # No subtitle info was returned, i. e. something unexpected
@@ -506,8 +545,8 @@ class TitulkyProvider(Provider):
                 raise Error(f"No data returned from thread ID: {i}")
 
             # If an exception was raised in a thread, raise it again here
-            if "exception" in thread_data and thread_data["exception"]:
-                logger.debug(f"Titulky.com: An error occured in a thread ID: {i}")
+            if 'exception' in thread_data and thread_data['exception']:
+                logger.debug(f"Titulky.com: An error occured while processing a row in the thread ID {i}")
                 raise thread_data['exception']
 
             # If the thread returned a subtitle info, great, instantiate it and add it to the list
@@ -515,8 +554,17 @@ class TitulkyProvider(Provider):
                 # Instantiate the subtitle object
                 logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, thread ID: {i}")
                 sub_info = thread_data['sub_info']
-                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
-                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+
+                # Try to parse S00E00 string from the main subtitle name
+                sub_season = None
+                sub_episode = None
+                season_episode_string = re.findall('S(\d+)E(\d+)', sub_info['names'][0], re.IGNORECASE)
+                if season_episode_string:
+                    sub_season = season_episode_string[0][0]
+                    sub_episode = season_episode_string[0][1]
+
+                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], sub_season, sub_episode, sub_info['year'], sub_info['release'], sub_info['fps'],
+                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps)
                 subtitles.append(subtitle_instance)
             else:
                 # The thread returned data, but it didn't contain a subtitle info, i. e. something unexpected
```
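The heart of the change is that a subtitle row now carries a list of names (the main name plus the alternative names scraped from the third table column) instead of a single title, and the season/episode are pulled out of the main name with an `S(\d+)E(\d+)` regex. The sketch below distills those two steps into standalone Python; `sanitize()` here is a simplified stand-in for subliminal's helper, and the function names (`parse_season_episode`, `keyword_matches_any_name`) and the sample data are illustrative only, not part of the provider.

```python
import re


def sanitize(string):
    # Simplified stand-in for subliminal's sanitize(): lowercase and collapse punctuation.
    return re.sub(r"[^a-z0-9]+", " ", string.lower()).strip()


def parse_season_episode(main_name):
    # Pull an S00E00 marker out of the main subtitle name, as the commit does.
    found = re.findall(r"S(\d+)E(\d+)", main_name, re.IGNORECASE)
    if found:
        # The provider keeps the raw regex groups (strings), so the sketch does too.
        return found[0][0], found[0][1]
    return None, None


def keyword_matches_any_name(keyword, names):
    # A row is kept when the keyword appears in at least one of its names.
    return any(sanitize(keyword) in sanitize(name) for name in names)


# Illustrative data only
names = ["Some Show S02E05", "Nejaky Serial"]
print(parse_season_episode(names[0]))                # ('02', '05')
print(keyword_matches_any_name("some show", names))  # True
```

The same idea drives both the keyword filter in `process_row` and the series/movie name matching in `get_matches`: compare the query against every known name rather than only the primary title.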