author | Samuel Barták <[email protected]> | 2021-10-29 21:24:10 +0200 |
---|---|---|
committer | GitHub <[email protected]> | 2021-10-29 15:24:10 -0400 |
commit | 0e50335e2d26597cac20ee5264a78ea9c53e7ce0 (patch) | |
tree | fd2afdc62c312ce4e0450b3b4f2fcc883e46e4eb | |
parent | aaa8b48dad3a1d4de5a8f2f04fcc96cf1fb5fb3a (diff) | |
download | bazarr-0e50335e2d26597cac20ee5264a78ea9c53e7ce0.tar.gz bazarr-0e50335e2d26597cac20ee5264a78ea9c53e7ce0.zip |
Improved results with Titulky provider
-rw-r--r-- | libs/subliminal_patch/providers/titulky.py | 192 |
1 file changed, 120 insertions, 72 deletions
```diff
diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py
index e20bf0adc..272aee8b6 100644
--- a/libs/subliminal_patch/providers/titulky.py
+++ b/libs/subliminal_patch/providers/titulky.py
@@ -36,10 +36,10 @@ class TitulkySubtitle(Subtitle):
     hash_verifiable = False
     hearing_impaired_verifiable = False
 
-    def __init__(self, sub_id, language, title, year, release_info, fps, uploader, approved, page_link, download_link, season=None, episode=None, skip_wrong_fps=False):
+    def __init__(self, sub_id, language, names, season, episode, year, release_info, fps, uploader, approved, page_link, download_link, skip_wrong_fps=False):
         super().__init__(language, page_link=page_link)
 
-        self.title = title
+        self.names = names
         self.year = year
         self.sub_id = sub_id
         self.fps = fps
@@ -73,21 +73,17 @@ class TitulkySubtitle(Subtitle):
             if self.episode and self.episode == video.episode:
                 matches.add('episode')
 
-            if self.season is None and self.episode is None:
-                matches.add('episode')
-
-            if sanitize(f"S{video.season:02d}E{video.episode:02d}") in sanitize(self.title):
-                matches.add('season')
-                matches.add('episode')
-
-            if video.series and sanitize(video.series) in sanitize(self.title):
+            name_matches = [video.series and sanitize(name) in sanitize(video.series) for name in self.names]
+            if any(name_matches):
                 matches.add('series')
-
+
         elif _type == 'movie':
             ## MOVIE
-            if video.title and sanitize(video.title) in sanitize(self.title):
-                matches.add('title')
+            name_matches = [video.title and sanitize(name) in sanitize(video.title) for name in self.names]
+            if any(name_matches):
+                matches.add('title')
+
         ## MOVIE OR EPISODE
         if video.year and video.year == self.year:
             matches.add('year')
@@ -223,68 +219,90 @@ class TitulkyProvider(Provider):
 
         return result
 
-    # Parse details of an individual subtitle: title, year, language, etc.
+    # Parse details of an individual subtitle: release, language, uploader, fps and year
     def parse_details(self, url):
         html_src = self.fetch_page(url)
         details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])
 
         details_container = details_page_soup.find('div', class_='detail')
         if not details_container:
+            # The subtitles were removed and got redirected to a different page. Better treat this silently.
            logger.debug("Titulky.com: Could not find details div container. Skipping.")
            return False
 
-        ### TITLE AND YEAR
-        h1_tag = details_container.find('h1', id='titulky')
-        if not h1_tag:
-            logger.debug("Titulky.com: Could not find h1 tag. Skipping.")
-            return False
-        # The h1 tag contains the title of the subtitle and year
-        h1_texts = [text.strip() for text in h1_tag.stripped_strings]
-
-        if len(h1_texts) < 1:
-            logger.debug("Titulky.com: The header tag did not include sufficient data. Skipping.")
-            return False
-        title = h1_texts[0]
-        year = int(h1_texts[1]) if len(h1_texts) > 1 else None
-
-        ### UPLOADER
-        uploader_tag = details_container.find('div', class_='ulozil')
-        if not uploader_tag:
-            logger.debug("Titulky.com: Could not find uploader tag. Skipping.")
-            return False
-        uploader_anchor_tag = uploader_tag.find('a')
-        if not uploader_anchor_tag:
-            logger.debug("Titulky.com: Could not find uploader anchor tag. Skipping.")
-            return False
-        uploader = uploader_anchor_tag.string.strip()
-
         ### RELEASE
+        release = None
         release_tag = details_container.find('div', class_='releas')
+
         if not release_tag:
-            logger.debug("Titulky.com: Could not find releas tag. Skipping.")
-            return False
+            raise Error("Could not find release tag. Did the HTML source change?")
+
+        release = release_tag.get_text(strip=True)
+        if not release:
+            logger.info("Titulky.com: No release information supplied on details page.")
+
         ### LANGUAGE
         language = None
         czech_flag = details_container.select('img[src*=\'flag-CZ\']')
         slovak_flag = details_container.select('img[src*=\'flag-SK\']')
+
         if czech_flag and not slovak_flag:
             language = Language('ces')
         elif slovak_flag and not czech_flag:
             language = Language('slk')
+        if not language:
+            logger.debug("Titulky.com: No language information supplied on details page.")
+
+        ### UPLOADER
+        uploader = None
+        uploader_tag = details_container.find('div', class_='ulozil')
+
+        if not uploader_tag:
+            raise Error("Could not find uploader tag. Did the HTML source change?")
+
+        uploader_anchor_tag = uploader_tag.find('a')
+
+        if not uploader_anchor_tag:
+            raise Error("Could not find uploader anchor tag. Did the HTML source change?")
+
+        uploader = uploader_anchor_tag.string.strip() if uploader_anchor_tag else None
+
+        if not uploader:
+            logger.debug("Titulky.com: No uploader name supplied on details page.")
+
         ### FPS
         fps = None
         fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')
-        if len(fps_icon_tag_selection) > 0 and hasattr(fps_icon_tag_selection[0], 'parent'):
-            fps_icon_tag = fps_icon_tag_selection[0]
-            parent_text = fps_icon_tag.parent.get_text(strip=True)
-            match = re.findall('(\d+,\d+) fps', parent_text)
+        if not fps_icon_tag_selection and not hasattr(fps_icon_tag_selection[0], 'parent'):
+            raise Error("Could not find parent of the fps icon tag. Did the HTML source change?")
+
+        fps_icon_tag = fps_icon_tag_selection[0]
+        parent_text = fps_icon_tag.parent.get_text(strip=True)
+        match = re.findall('(\d+,\d+) fps', parent_text)
 
-            # If the match is found, change the decimal separator to a dot and convert to float
-            fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
+        # If the match is found, change the decimal separator to a dot and convert to float
+        fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
+
+        if not fps:
+            logger.debug("Titulky.com: No fps supplied on details page.")
+
+        ### YEAR
+        year = None
+        h1_tag = details_container.find('h1', id='titulky')
+
+        if not h1_tag:
+            raise Error("Could not find h1 tag. Did the HTML source change?")
+
+        # The h1 tag contains the name of the subtitle and a year
+        h1_texts = [text for text in h1_tag.stripped_strings]
+        year = int(h1_texts[1]) if len(h1_texts) > 1 else None
+
+        if not year:
+            logger.debug("Titulky.com: No year supplied on details page.")
+
         # Clean up
         details_page_soup.decompose()
@@ -292,19 +310,17 @@ class TitulkyProvider(Provider):
 
         # Return the subtitle details
         return {
-            'title': title,
-            'year': year,
-            'uploader': uploader,
             'release': release,
             'language': language,
-            'fps': fps
+            'uploader': uploader,
+            'fps': fps,
+            'year': year
         }
 
     def process_row(self, row, keyword, thread_id=None, threads_data=None):
         try:
-            # The first anchor tag is an image preview, the second is the title
+            # The first anchor tag is an image preview, the second is the name
             anchor_tag = row.find_all('a')[1]
-            title = anchor_tag.string.strip()
 
             # The details link is relative, so we need to remove the dot at the beginning
             details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}"
             id_match = re.findall('id=(\d+)', details_link)
@@ -314,9 +330,19 @@ class TitulkyProvider(Provider):
             # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
             approved = True if 'pbl1' in row.get('class') else False
 
-            # Skip subtitles that do not contain keyword in their title
-            if keyword and sanitize(keyword) not in sanitize(title):
-                logger.debug(f"Titulky.com: Skipping subtitle '{title}' because it does not contain the keyword '{keyword}'")
+            # Name + alternative names
+            table_columns = row.findAll("td")
+            main_name = anchor_tag.get_text(strip=True)
+            alt_names = [alt_name.strip() for alt_name in table_columns[2].get_text(strip=True).split("/")]
+            names = [main_name] + alt_names
+
+
+            # Loop over all subtitle names and check if the keyword contains them
+            name_matches = [keyword and sanitize(keyword) not in sanitize(name) for name in names]
+
+            # Skip subtitles that do not contain the keyword in their name(s)
+            if keyword and all(name_matches) is False:
+                logger.debug(f"Titulky.com: Skipping subtitle with names: '{names}', because it does not not contain the keyword: '{keyword}'")
                 if type(threads_data) is list and type(thread_id) is int:
                     threads_data[thread_id] = {
                         'sub_info': None,
@@ -336,22 +362,25 @@ class TitulkyProvider(Provider):
                 return None
 
-            # Return additional data besides the subtitle details
-            details['id'] = sub_id
-            details['approved'] = approved
-            details['details_link'] = details_link
-            details['download_link'] = download_link
+            # Combine all subtitle data into one dict
+            result = {
+                'names': names,
+                'id': sub_id,
+                'approved': approved,
+                'details_link': details_link,
+                'download_link': download_link
+            }
+
+            result.update(details)
 
             if type(threads_data) is list and type(thread_id) is int:
                 threads_data[thread_id] = {
-                    'sub_info': details,
+                    'sub_info': result,
                     'exception': None
                 }
 
             return details
-        except:
-            e = Error("Whoops, something unexpected happend while fetching or parsing details page.")
-
+        except Exception as e:
             if type(threads_data) is list and type(thread_id) is int:
                 threads_data[thread_id] = {
                     'sub_info': None,
@@ -362,7 +391,7 @@ class TitulkyProvider(Provider):
 
     # There are multiple ways to find subs from this provider:
     # 1. SEARCH by sub title
-    #    - parameter: .................. Fulltext=<SUB TITLE>
+    #    - parameter: .................. Fulltext=<SUB NAME>
     # 2. SEARCH by imdb id
     #    - parameter: .................. IMDB=<IMDB ID>
     # 3. SEARCH by season/episode
@@ -460,8 +489,18 @@ class TitulkyProvider(Provider):
             # and we can instationate it and add it to the list
             if sub_info:
                 logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, row: {i}")
-                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
-                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+
+                # Try to parse S00E00 string from the main subtitle name
+                sub_season = None
+                sub_episode = None
+                season_episode_string = re.findall('S(\d+)E(\d+)', sub_info['names'][0], re.IGNORECASE)
+                if season_episode_string:
+                    sub_season = season_episode_string[0][0]
+                    sub_episode = season_episode_string[0][1]
+
+
+                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], sub_season, sub_episode, sub_info['year'], sub_info['release'], sub_info['fps'],
+                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps)
                 subtitles.append(subtitle_instance)
             else:
                 # No subtitle info was returned, i. e. something unexpected
@@ -506,8 +545,8 @@ class TitulkyProvider(Provider):
                 raise Error(f"No data returned from thread ID: {i}")
 
             # If an exception was raised in a thread, raise it again here
-            if "exception" in thread_data and thread_data["exception"]:
-                logger.debug(f"Titulky.com: An error occured in a thread ID: {i}")
+            if 'exception' in thread_data and thread_data['exception']:
+                logger.debug(f"Titulky.com: An error occured while processing a row in the thread ID {i}")
                 raise thread_data['exception']
 
             # If the thread returned a subtitle info, great, instantiate it and add it to the list
@@ -515,8 +554,17 @@ class TitulkyProvider(Provider):
                 # Instantiate the subtitle object
                 logger.debug(f"Titulky.com: Sucessfully retrieved subtitle info, thread ID: {i}")
                 sub_info = thread_data['sub_info']
-                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
-                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+
+                # Try to parse S00E00 string from the main subtitle name
+                sub_season = None
+                sub_episode = None
+                season_episode_string = re.findall('S(\d+)E(\d+)', sub_info['names'][0], re.IGNORECASE)
+                if season_episode_string:
+                    sub_season = season_episode_string[0][0]
+                    sub_episode = season_episode_string[0][1]
+
+                subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['names'], sub_season, sub_episode, sub_info['year'], sub_info['release'], sub_info['fps'],
+                                                        sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps)
                 subtitles.append(subtitle_instance)
             else:
                 # The thread returned data, but it didn't contain a subtitle info, i. e. something unexpected
```
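The heart of the change is that a subtitle row now carries a list of names (the main name plus the alternative names scraped from the third table column) instead of a single title, and the season/episode are pulled out of the main name with an `S(\d+)E(\d+)` regex. The sketch below distills those two steps into standalone Python; `sanitize()` here is a simplified stand-in for subliminal's helper, and the function names (`parse_season_episode`, `keyword_matches_any_name`) and the sample data are illustrative only, not part of the provider.

```python
import re


def sanitize(string):
    # Simplified stand-in for subliminal's sanitize(): lowercase and collapse punctuation.
    return re.sub(r"[^a-z0-9]+", " ", string.lower()).strip()


def parse_season_episode(main_name):
    # Pull an S00E00 marker out of the main subtitle name, as the commit does.
    found = re.findall(r"S(\d+)E(\d+)", main_name, re.IGNORECASE)
    if found:
        # The provider keeps the raw regex groups (strings), so the sketch does too.
        return found[0][0], found[0][1]
    return None, None


def keyword_matches_any_name(keyword, names):
    # A row is kept when the keyword appears in at least one of its names.
    return any(sanitize(keyword) in sanitize(name) for name in names)


# Illustrative data only
names = ["Some Show S02E05", "Nejaky Serial"]
print(parse_season_episode(names[0]))                # ('02', '05')
print(keyword_matches_any_name("some show", names))  # True
```

The same idea drives both the keyword filter in `process_row` and the series/movie name matching in `get_matches`: compare the query against every known name rather than only the primary title.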