author     Samuel Bartík <[email protected]>  2021-10-27 20:23:58 +0200
committer  GitHub <[email protected]>          2021-10-27 14:23:58 -0400
commit     801ff0d47895a3fb00a9aa190a2ca1eaac55100a (patch)
tree       d5416ff5668d33dbf49b96521e14dcb34afae337
parent     3d936ffccf767d0a91cbc3b7b3c05306e957e561 (diff)
Refactored the Titulky provider (v1.0.1-beta.4)
-rw-r--r--  bazarr/config.py                              5
-rw-r--r--  bazarr/get_providers.py                       3
-rw-r--r--  frontend/src/Settings/Providers/list.ts       8
-rw-r--r--  libs/subliminal_patch/providers/titulky.py  794
4 files changed, 544 insertions, 266 deletions
diff --git a/bazarr/config.py b/bazarr/config.py
index 57e0d3ef0..af581c352 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -185,7 +185,10 @@ defaults = {
},
'titulky': {
'username': '',
- 'password': ''
+ 'password': '',
+ 'skip_wrong_fps': 'False',
+ 'approved_only': 'False',
+ 'multithreading': 'True'
},
'subsync': {
'use_subsync': 'False',
diff --git a/bazarr/get_providers.py b/bazarr/get_providers.py
index fa23b40cc..6445942c0 100644
--- a/bazarr/get_providers.py
+++ b/bazarr/get_providers.py
@@ -183,6 +183,9 @@ def get_providers_auth():
'titulky' : {
'username': settings.titulky.username,
'password': settings.titulky.password,
+ 'skip_wrong_fps': settings.titulky.getboolean('skip_wrong_fps'),
+ 'approved_only': settings.titulky.getboolean('approved_only'),
+ 'multithreading': settings.titulky.getboolean('multithreading'),
},
'titlovi' : {
'username': settings.titlovi.username,
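Note: the three new keys are stored as plain 'True'/'False' strings in the config defaults above and converted to real booleans here with getboolean() before being handed to the provider. A minimal sketch of that conversion, using the standard-library configparser rather than Bazarr's own settings wrapper (section layout and values are illustrative only, not Bazarr's actual config):

    from configparser import ConfigParser

    config = ConfigParser()
    config.read_string("""
    [titulky]
    username = someuser
    password = somepass
    skip_wrong_fps = False
    approved_only = False
    multithreading = True
    """)

    # getboolean() turns the stored 'True'/'False' strings into real booleans,
    # which is what the refactored provider constructor expects.
    titulky_auth = {
        'username': config.get('titulky', 'username'),
        'password': config.get('titulky', 'password'),
        'skip_wrong_fps': config.getboolean('titulky', 'skip_wrong_fps'),
        'approved_only': config.getboolean('titulky', 'approved_only'),
        'multithreading': config.getboolean('titulky', 'multithreading'),
    }
    print(type(titulky_auth['multithreading']))  # <class 'bool'>
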
diff --git a/frontend/src/Settings/Providers/list.ts b/frontend/src/Settings/Providers/list.ts
index df2f309a8..dc076eb1c 100644
--- a/frontend/src/Settings/Providers/list.ts
+++ b/frontend/src/Settings/Providers/list.ts
@@ -230,6 +230,14 @@ export const ProviderList: Readonly<ProviderInfo[]> = [
defaultKey: {
username: "",
password: "",
+ skip_wrong_fps: false,
+ approved_only: false,
+ multithreading: true,
+ },
+ keyNameOverride: {
+ skip_wrong_fps: "Skip mismatching FPS",
+ approved_only: "Skip unapproved subtitles",
+ multithreading: "Enable multithreading",
},
},
{ key: "tvsubtitles", name: "TVSubtitles" },
diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py
index 0639a042f..e20bf0adc 100644
--- a/libs/subliminal_patch/providers/titulky.py
+++ b/libs/subliminal_patch/providers/titulky.py
@@ -1,352 +1,616 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
+
import io
import logging
+import math
import os
+import re
import zipfile
-import time
+from random import randint
+from threading import Thread
+import chardet
import rarfile
-from subzero.language import Language
from guessit import guessit
from requests import Session
-from six import text_type
-
+from requests.adapters import HTTPAdapter
from subliminal import __short_version__
-from subliminal.exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
+from subliminal.exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, Error, ProviderError
from subliminal.providers import ParserBeautifulSoup, Provider
-from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending, Subtitle
-from subliminal_patch.subtitle import guess_matches
+from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending
from subliminal.video import Episode, Movie
-from subliminal.utils import sanitize_release_group
-from subliminal.score import get_equivalent_release_groups
-from subliminal_patch.utils import sanitize
+from subliminal_patch.score import framerate_equal
+from subliminal_patch.subtitle import guess_matches, sanitize
+from subzero.language import Language
-logger = logging.getLogger(__name__)
+from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
+logger = logging.getLogger(__name__)
-# class TitulkySubtitle(Subtitle):
-# """Titulky Subtitle."""
-# provider_name = 'Titulky'
-#
-# def __init__(self, language, page_link, year, version, download_link):
-# super(TitulkySubtitle, self).__init__(language, page_link=page_link)
-# self.year = year
-# self.version = version
-# self.download_link = download_link
-# self.hearing_impaired = None
-# self.encoding = 'UTF-8'
-#
-# @property
-# def id(self):
-# return self.download_link
-#
-# def get_matches(self, video):
-# matches = set()
-#
-# # episode
-# if isinstance(video, Episode):
-# # other properties
-# matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True)
-# # movie
-# elif isinstance(video, Movie):
-# # other properties
-# matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True)
-#
-# return matches
class TitulkySubtitle(Subtitle):
+ """Titulky.com subtitle"""
provider_name = 'titulky'
- def __init__(self, language, page_link, season, episode, version, download_link, year, title, asked_for_release_group=None,
- asked_for_episode=None):
- super(TitulkySubtitle, self).__init__(language, page_link=page_link)
+ hash_verifiable = False
+ hearing_impaired_verifiable = False
+
+ def __init__(self, sub_id, language, title, year, release_info, fps, uploader, approved, page_link, download_link, season=None, episode=None, skip_wrong_fps=False):
+ super().__init__(language, page_link=page_link)
+
+ self.title = title
+ self.year = year
+ self.sub_id = sub_id
+ self.fps = fps
self.season = season
self.episode = episode
- self.version = version
- self.year = year
- self.download_link = download_link
- for t in title:
- self.title = t
- if year:
- self.year = int(year)
-
+ self.release_info = release_info
+ self.language = language
+ self.approved = approved
self.page_link = page_link
- self.asked_for_release_group = asked_for_release_group
- self.asked_for_episode = asked_for_episode
-
+ self.uploader = uploader
+ self.download_link = download_link
+ self.skip_wrong_fps = skip_wrong_fps
+ self.matches = None
+
@property
def id(self):
- return self.download_link
+ return self.sub_id
+
+ def get_fps(self):
+ return self.fps
+
def get_matches(self, video):
- """
- patch: set guessit to single_value
- :param video:
- :return:
- """
matches = set()
-
- # episode
- if isinstance(video, Episode):
- # series
- if video.series:
- matches.add('series')
- # year
- if video.original_series and self.year is None or video.year and video.year == self.year:
- matches.add('year')
- # season
- if video.season and self.season == video.season:
+ _type = 'movie' if isinstance(video, Movie) else 'episode'
+
+ if _type == 'episode':
+ ## EPISODE
+ if self.season and self.season == video.season:
matches.add('season')
- # episode
- if video.episode and self.episode == video.episode:
+ if self.episode and self.episode == video.episode:
+ matches.add('episode')
+
+ if self.season is None and self.episode is None:
matches.add('episode')
- # guess
- matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}))
- pass
- # movie
- elif isinstance(video, Movie):
- # title
- if video.title and (sanitize(self.title) in (
- sanitize(name) for name in [video.title] + video.alternative_titles)):
- matches.add('title')
- # year
- if video.year and self.year == video.year:
- matches.add('year')
- # guess
- matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}))
+
+ if sanitize(f"S{video.season:02d}E{video.episode:02d}") in sanitize(self.title):
+ matches.add('season')
+ matches.add('episode')
+
+ if video.series and sanitize(video.series) in sanitize(self.title):
+ matches.add('series')
+
+ elif _type == 'movie':
+ ## MOVIE
+ if video.title and sanitize(video.title) in sanitize(self.title):
+ matches.add('title')
+
+ if video.year and video.year == self.year:
+ matches.add('year')
- self.matches = matches
+ matches |= guess_matches(video, guessit(self.release_info, {"type": _type}))
+
+
+ if self.skip_wrong_fps and video.fps and self.fps and not framerate_equal(video.fps, self.fps):
+ logger.info(f"Titulky.com: Skipping subtitle {self}: wrong FPS")
+ matches.clear()
+
+ self.matches = matches
return matches
-
- # def get_matches(self, video):
- # matches = set()
- #
- # # episode
- # if isinstance(video, Episode):
- # # series
- # if video.series and (sanitize(self.series_name) in (
- # sanitize(name) for name in [video.series] + video.alternative_series)):
- # matches.add('series')
- # # movie
- # elif isinstance(video, Movie):
- # # title
- # if video.title and (sanitize(self.movie_name) in (
- # sanitize(name) for name in [video.title] + video.alternative_titles)):
- # matches.add('title')
- #
- # # # episode
- # # if isinstance(video, Episode):
- # # # other properties
- # # matches |= guess_matches(video, guessit(self.version, {'type': 'episode'}), partial=True)
- # # # movie
- # # elif isinstance(video, Movie):
- # # # other properties
- # # matches |= guess_matches(video, guessit(self.version, {'type': 'movie'}), partial=True)
- #
- # return matches
class TitulkyProvider(Provider):
- """Titulky Provider."""
- languages = {Language(l) for l in ['ces', 'slk']}
+ """Titulky.com provider"""
- server_url = 'https://oldpremium.titulky.com'
- sign_out_url = '?Logoff=true'
- search_url_series = '?Fulltext={}'
- search_url_movies = '?Searching=AdvancedResult&ARelease={}'
- dn_url = 'https://oldpremium.titulky.com'
- download_url = 'https://oldpremium.titulky.com/idown.php?titulky='
+ languages = {Language(l) for l in ['ces', 'slk']}
+ hash_verifiable = False
+ hearing_impaired_verifiable = False
+
- UserAgent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
+ server_url = 'https://premium.titulky.com'
+ login_url = server_url
+ logout_url = f"{server_url}?action=logout"
+ download_url = f"{server_url}/download.php?id="
+
+ timeout = 30
+ max_threads = 5
subtitle_class = TitulkySubtitle
- def __init__(self, username=None, password=None):
- if any((username, password)) and not all((username, password)):
- raise ConfigurationError('Username and password must be specified')
+ def __init__(self, username=None, password=None, skip_wrong_fps=None, approved_only=None, multithreading=None):
+ if not all([username, password]):
+ raise ConfigurationError("Username and password must be specified!")
+
+ if type(skip_wrong_fps) is not bool:
+ raise ConfigurationError(f"Skip_wrong_fps {skip_wrong_fps} must be a boolean!")
+
+ if type(approved_only) is not bool:
+ raise ConfigurationError(f"Approved_only {approved_only} must be a boolean!")
+
+ if type(multithreading) is not bool:
+ raise ConfigurationError(f"Multithreading {multithreading} must be a boolean!")
+
self.username = username
self.password = password
- self.logged_in = False
+ self.skip_wrong_fps = skip_wrong_fps
+ self.approved_only = approved_only
+ self.multithreading = multithreading
+
self.session = None
def initialize(self):
self.session = Session()
- self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)
-
- # login
- if self.username and self.password:
- logger.info('Logging in')
- self.session.get(self.server_url)
- data = {'Login': self.username,
- 'Password': self.password}
- r = self.session.post(self.server_url, data, allow_redirects=False, timeout=10)
-
- if 'BadLogin' in r.text:
- raise AuthenticationError(self.username)
-
- logger.debug('Logged in')
- self.logged_in = True
+        # Set the connection pool size to the number of threads we will use plus a small buffer
+        # (i.e. one slot per search result row processed in parallel), or keep the default of 10 if that is larger.
+ pool_maxsize = self.max_threads + 3 if self.max_threads > 10 else 10
+ self.session.mount('https://', HTTPAdapter(pool_maxsize=pool_maxsize))
+ self.session.mount('http://', HTTPAdapter(pool_maxsize=pool_maxsize))
+
+ # Set headers
+ self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
+ self.session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+ self.session.headers['Accept-Language'] = 'sk,cz,en;q=0.5'
+ self.session.headers['Accept-Encoding'] = 'gzip, deflate'
+ self.session.headers['DNT'] = '1'
+ self.session.headers['Connection'] = 'keep-alive'
+ self.session.headers['Upgrade-Insecure-Requests'] = '1'
+ self.session.headers['Cache-Control'] = 'max-age=0'
+
+ self.login()
def terminate(self):
- # logout
- if self.logged_in:
- logger.info('Logging out')
- r = self.session.get(self.server_url + self.sign_out_url, timeout=10)
- r.raise_for_status()
- logger.debug('Logged out')
- self.logged_in = False
-
+ self.logout()
self.session.close()
- def query(self, keyword, season=None, episode=None, year=None, video=None):
- params = keyword
- if season and episode:
- params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
- elif year:
- params += '&ARok={:4d}'.format(year)
-
- logger.debug('Searching subtitles %r', params)
- subtitles = []
- if season and episode:
- search_link = self.server_url + text_type(self.search_url_series).format(params)
+ def login(self):
+ logger.info("Titulky.com: Logging in")
+
+ self.session.get(self.server_url)
+
+ data = {
+ 'LoginName': self.username,
+ 'LoginPassword': self.password
+ }
+ res = self.session.post(self.server_url, data, allow_redirects=False, timeout=self.timeout)
+
+        # If the response is a redirect and doesn't point to an error message page, then we are logged in
+ if res.status_code == 302 and 'msg_type=i' in res.headers['Location']:
+ return True
+ else:
+ raise AuthenticationError("Login failed")
+
+ def logout(self):
+ logger.info("Titulky.com: Logging out")
+
+ res = self.session.get(self.logout_url, allow_redirects=False, timeout=self.timeout)
+
+        # If the response is a redirect and doesn't point to an error message page, then we are logged out
+ if res.status_code == 302 and 'msg_type=i' in res.headers['Location']:
+ return True
+ else:
+ raise AuthenticationError("Logout failed.")
+
+ def fetch_page(self, url):
+ logger.debug(f"Titulky.com: Fetching url: {url}")
+ res = self.session.get(url, timeout=self.timeout)
+
+ if res.status_code != 200:
+ raise ProviderError(f"Fetch failed with status code {res.status_code}")
+ if not res.text:
+ raise ProviderError("No response returned from the provider")
+
+ return res.text
+
+ def build_search_url(self, params):
+ result = f"{self.server_url}/?"
+
+ params['action'] = 'search'
+ params['fsf'] = 1 # Requires subtitle names to match full search keyword
+
+ for key, value in params.items():
+ result += f'{key}={value}&'
+
+ # Remove last &
+ result = result[:-1]
+
+ # Remove spaces
+ result = result.replace(' ', '+')
+
+ return result
+
+ # Parse details of an individual subtitle: title, year, language, etc.
+ def parse_details(self, url):
+ html_src = self.fetch_page(url)
+ details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])
+
+ details_container = details_page_soup.find('div', class_='detail')
+ if not details_container:
+ logger.debug("Titulky.com: Could not find details div container. Skipping.")
+ return False
+
+ ### TITLE AND YEAR
+ h1_tag = details_container.find('h1', id='titulky')
+ if not h1_tag:
+ logger.debug("Titulky.com: Could not find h1 tag. Skipping.")
+ return False
+ # The h1 tag contains the title of the subtitle and year
+ h1_texts = [text.strip() for text in h1_tag.stripped_strings]
+
+ if len(h1_texts) < 1:
+ logger.debug("Titulky.com: The header tag did not include sufficient data. Skipping.")
+ return False
+ title = h1_texts[0]
+ year = int(h1_texts[1]) if len(h1_texts) > 1 else None
+
+ ### UPLOADER
+ uploader_tag = details_container.find('div', class_='ulozil')
+ if not uploader_tag:
+ logger.debug("Titulky.com: Could not find uploader tag. Skipping.")
+ return False
+ uploader_anchor_tag = uploader_tag.find('a')
+ if not uploader_anchor_tag:
+ logger.debug("Titulky.com: Could not find uploader anchor tag. Skipping.")
+ return False
+ uploader = uploader_anchor_tag.string.strip()
+
+ ### RELEASE
+ release_tag = details_container.find('div', class_='releas')
+ if not release_tag:
+ logger.debug("Titulky.com: Could not find releas tag. Skipping.")
+ return False
+ release = release_tag.get_text(strip=True)
+
+ ### LANGUAGE
+ language = None
+ czech_flag = details_container.select('img[src*=\'flag-CZ\']')
+ slovak_flag = details_container.select('img[src*=\'flag-SK\']')
+ if czech_flag and not slovak_flag:
+ language = Language('ces')
+ elif slovak_flag and not czech_flag:
+ language = Language('slk')
+
+ ### FPS
+ fps = None
+ fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')
+
+ if len(fps_icon_tag_selection) > 0 and hasattr(fps_icon_tag_selection[0], 'parent'):
+ fps_icon_tag = fps_icon_tag_selection[0]
+ parent_text = fps_icon_tag.parent.get_text(strip=True)
+ match = re.findall('(\d+,\d+) fps', parent_text)
+
+ # If the match is found, change the decimal separator to a dot and convert to float
+ fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
+
+ # Clean up
+ details_page_soup.decompose()
+ details_page_soup = None
+
+ # Return the subtitle details
+ return {
+ 'title': title,
+ 'year': year,
+ 'uploader': uploader,
+ 'release': release,
+ 'language': language,
+ 'fps': fps
+ }
+
+ def process_row(self, row, keyword, thread_id=None, threads_data=None):
+ try:
+ # The first anchor tag is an image preview, the second is the title
+ anchor_tag = row.find_all('a')[1]
+ title = anchor_tag.string.strip()
+ # The details link is relative, so we need to remove the dot at the beginning
+ details_link = f"{self.server_url}{anchor_tag.get('href')[1:]}"
+ id_match = re.findall('id=(\d+)', details_link)
+ sub_id = id_match[0] if len(id_match) > 0 else None
+ download_link = f"{self.download_url}{sub_id}"
+
+ # Approved subtitles have a pbl1 class for their row, others have a pbl0 class
+ approved = True if 'pbl1' in row.get('class') else False
+
+ # Skip subtitles that do not contain keyword in their title
+ if keyword and sanitize(keyword) not in sanitize(title):
+ logger.debug(f"Titulky.com: Skipping subtitle '{title}' because it does not contain the keyword '{keyword}'")
+ if type(threads_data) is list and type(thread_id) is int:
+ threads_data[thread_id] = {
+ 'sub_info': None,
+ 'exception': None
+ }
+
+ return None
+
+ details = self.parse_details(details_link)
+ if not details:
+ # Details parsing was NOT successful, skipping
+ if type(threads_data) is list and type(thread_id) is int:
+ threads_data[thread_id] = {
+ 'sub_info': None,
+ 'exception': None
+ }
+
+ return None
+
+ # Return additional data besides the subtitle details
+ details['id'] = sub_id
+ details['approved'] = approved
+ details['details_link'] = details_link
+ details['download_link'] = download_link
+
+ if type(threads_data) is list and type(thread_id) is int:
+ threads_data[thread_id] = {
+ 'sub_info': details,
+ 'exception': None
+ }
+
+ return details
+ except:
+            e = Error("Whoops, something unexpected happened while fetching or parsing the details page.")
+
+ if type(threads_data) is list and type(thread_id) is int:
+ threads_data[thread_id] = {
+ 'sub_info': None,
+ 'exception': e
+ }
+
+ raise e
+
+ # There are multiple ways to find subs from this provider:
+ # 1. SEARCH by sub title
+ # - parameter: .................. Fulltext=<SUB TITLE>
+ # 2. SEARCH by imdb id
+ # - parameter: .................. IMDB=<IMDB ID>
+ # 3. SEARCH by season/episode
+ # - parameter: .................. Sezona=<SEASON>
+ # - parameter: .................. Epizoda=<EPISODE>
+ # 4. SEARCH by year
+ # - parameter: .................. Rok=<YEAR>
+ # 5. SEARCH by video type
+ # - parameter: .................. Serial=<('S' for series | 'F' for movies | '' for all)>
+ # 6. SEARCH by language
+ # - parameter: .................. Jazyk=<('CZ' for czech | 'SK' for slovak | '' for all)>
+ # 7. SEARCH by status
+ # - parameter: .................. ASchvalene=<('1' for approved only | '-0' for subs awaiting approval | '' for all)>
+ # - redirects should NOT be allowed here
+ #
+ # 8. BROWSE subtitles by IMDB ID
+ # - Subtitles are here categorised by seasons and episodes
+ # - URL: https://premium.titulky.com/?action=serial&step=<SEASON>&id=<IMDB ID>
+ # - it seems that the url redirects to a page with their own internal ID, redirects should be allowed here
+ def query(self, language, type, keyword=None, year=None, season=None, episode=None, imdb_id=None):
+ ## Build the search URL
+ params = {}
+
+ # Keyword
+ if keyword:
+ params['Fulltext'] = keyword
+ # Video type
+ if type == 'episode':
+ params['Serial'] = 'S'
+ else:
+ params['Serial'] = 'F'
+ # Season / Episode
+ if season:
+ params['Sezona'] = season
+ if episode:
+ params['Epizoda'] = episode
+ # IMDB ID
+ if imdb_id:
+ params['IMDB'] = imdb_id[2:] # Remove the tt from the imdb id
+ # Year
+ if year:
+ params['Rok'] = year
+ # Language
+ if language == Language('ces'):
+ params['Jazyk'] = 'CZ'
+ elif language == Language('slk'):
+ params['Jazyk'] = 'SK'
+ elif language == None:
+ params['Jazyk'] = ''
+ else:
+ return []
+ # Status
+ if self.approved_only:
+ params['ASchvalene'] = '1'
else:
- search_link = self.server_url + text_type(self.search_url_movies).format(params)
+ params['ASchvalene'] = ''
+
+ search_url = self.build_search_url(params)
- r = self.session.get(search_link, timeout=30)
- r.raise_for_status()
+ ## Search results page parsing
+ html_src = self.fetch_page(search_url)
+ search_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])
- if not r.content:
- logger.debug('No data returned from provider')
+        # If there is a message containing "Žádný odpovídající záznam", it means that there are no results
+ # If that's the case, return an empty list
+ error_message = search_page_soup.select('.panel-body > strong')
+ if len(error_message) > 0 and 'Žádný odpovídající záznam' in error_message[0].get_text(strip=True):
+ logger.info("Titulky.com: No results found")
return []
- # soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
-
- # for entity in soup.select('table .main_table > tbody > tr'):
- # for entity in soup.find_all("table", class_="main_table"):
- # moviename = entity.text
- # entity_url = self.server_url + entity['href']
- # logger.debug(entity_url)
- # r = self.session.get(entity_url, timeout=30)
- # r.raise_for_status()
- # logger.debug('looking into ' + entity_url)
-
- soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser']).find("table",
- class_="main_table")
- # loop over subtitles cells
- if soup:
- subs = soup.find_all("tr", class_="row1")
- subs += soup.find_all("tr", class_="row2")
- for sub in subs:
- page_link = '%s%s' % (self.server_url, sub.a.get('href').encode('utf-8'))
- title = sub.find_all('td')[0:1]
- title = [x.text for x in title]
- version = sub.find(class_="fixedTip")
- if version is None:
- version = ""
- else:
- version = version['title']
- try:
- r = sub.find_all('td')[6:7]
- # r2 = td.find("td", "img")
- langs = [x.text.encode('utf-8') for x in r]
- pass
- except:
- langs = 'CZ'
- name = '%s (%s)' % (version, langs)
+ # Get the table containing the search results
+ table = search_page_soup.find('table', class_='table')
+ if not table:
+ logger.debug("Titulky.com: Could not find table")
+ raise Error("Could not find table. Did the HTML source change?")
+
+ # Get table body containing rows of subtitles
+ table_body = table.find('tbody')
+ if not table_body:
+ logger.debug("Titulky.com: Could not find table body")
+ raise Error("Could not find table body. Did the HTML source change?")
+
+ ## Loop over all subtitles on the first page and put them in a list
+ subtitles = []
+ rows = table_body.find_all('tr')
+
+ if not self.multithreading:
+ # Process the rows sequentially
+ logger.info("Titulky.com: processing results in sequence")
+ for i, row in enumerate(rows):
+ sub_info = self.process_row(row, keyword)
- if b'CZ' in langs:
- language = Language('ces')
- elif b'SK' in langs:
- language = Language('slk')
- # read the item
- # subtitle = self.subtitle_class(language, page_link, year, version, page_link.replace("detail", "dld"))
- download_link = sub.find('a', class_='titulkydownloadajax')
- download_link = self.download_url + download_link.get('href')
+ # If subtitle info was returned, then everything was okay
+                # and we can instantiate it and add it to the list
+ if sub_info:
+                    logger.debug(f"Titulky.com: Successfully retrieved subtitle info, row: {i}")
+ subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
+ sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+ subtitles.append(subtitle_instance)
+ else:
+                    # No subtitle info was returned, i.e. something unexpected
+                    # happened during subtitle details page fetching and processing.
+ logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
+ else:
+            # Process the rows in parallel
+            logger.info(f"Titulky.com: processing results in parallel, {self.max_threads} rows at a time.")
+
+ threads = [None] * len(rows)
+ threads_data = [None] * len(rows)
+
+ # Process rows in parallel, self.max_threads at a time.
+ cycles = math.ceil(len(rows)/self.max_threads)
+ for i in range(cycles):
+ # Batch number i
+ starting_index = i * self.max_threads # Inclusive
+ ending_index = starting_index + self.max_threads # Non-inclusive
+
+ # Create threads for all rows in this batch
+ for j in range(starting_index, ending_index):
+ # Check if j-th row exists
+ if j < len(rows):
+ # Row number j
+ logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
+ # Create a thread for row j and start it
+ threads[j] = Thread(target=self.process_row, args=[rows[j], keyword], kwargs={'thread_id': j, 'threads_data': threads_data})
+ threads[j].start()
+
+ # Wait for all created threads to finish before moving to another batch of rows
+ for j in range(starting_index, ending_index):
+ # Check if j-th row exists
+ if j < len(rows):
+ threads[j].join()
+
+ # Process the resulting data from all threads
+ for i in range(len(threads_data)):
+ thread_data = threads_data[i]
+
+                # If the thread didn't return anything, even though a dict object was expected
+ if not thread_data:
+ raise Error(f"No data returned from thread ID: {i}")
- subtitle = self.subtitle_class(language, page_link,
- season, episode, version, download_link, year, title,
- asked_for_release_group=video.release_group,
- asked_for_episode=episode)
+ # If an exception was raised in a thread, raise it again here
+ if "exception" in thread_data and thread_data["exception"]:
+                    logger.debug(f"Titulky.com: An error occurred in thread ID: {i}")
+ raise thread_data['exception']
+
+                # If the thread returned subtitle info, great, instantiate it and add it to the list
+ if 'sub_info' in thread_data and thread_data['sub_info']:
+ # Instantiate the subtitle object
+                    logger.debug(f"Titulky.com: Successfully retrieved subtitle info, thread ID: {i}")
+ sub_info = thread_data['sub_info']
+ subtitle_instance = self.subtitle_class(sub_info['id'], sub_info['language'], sub_info['title'], sub_info['year'], sub_info['release'], sub_info['fps'],
+ sub_info['uploader'], sub_info['approved'], sub_info['details_link'], sub_info['download_link'], season=season, episode=episode, skip_wrong_fps=self.skip_wrong_fps)
+ subtitles.append(subtitle_instance)
+ else:
+                    # The thread returned data, but it didn't contain subtitle info, i.e. something unexpected
+                    # happened during subtitle details page fetching and processing.
+ logger.debug(f"Titulky.com: No subtitle info retrieved, thread ID: {i}")
- logger.debug('Found subtitle %r', subtitle)
- subtitles.append(subtitle)
-
- soup.decompose()
- soup = None
+ # Clean up
+ search_page_soup.decompose()
+ search_page_soup = None
+
+ logger.debug(f"Titulky.com: Found subtitles: {subtitles}")
return subtitles
- def list_subtitles(self, video, languages):
- if isinstance(video, Episode):
- titles = [video.series] + video.alternative_series
- elif isinstance(video, Movie):
- titles = [video.title] + video.alternative_titles
- else:
- titles = []
-
+ def list_subtitles(self, video, languages):
subtitles = []
- # query for subtitles with the show_id
- for title in titles:
+
+ # Possible paths:
+ # (1) Search by IMDB ID [and season/episode for tv series]
+ # (2) Search by keyword: video (title|series) [and season/episode for tv series]
+ # (3) Search by keyword: video series + S00E00 (tv series only)
+
+ for language in languages:
if isinstance(video, Episode):
- subtitles += [s for s in self.query(title, season=video.season, episode=video.episode,
- year=video.year, video=video)
- if s.language in languages]
+ # (1)
+ logger.debug("Titulky.com: Finding subtitles by IMDB ID (1)")
+ if video.series_imdb_id:
+ partial_subs = self.query(language, 'episode', imdb_id=video.series_imdb_id, season=video.season, episode=video.episode)
+ if(len(partial_subs) > 0):
+ subtitles += partial_subs
+ continue
+
+ # (2)
+ logger.debug("Titulky.com: Finding subtitles by keyword (2)")
+ keyword = video.series
+ partial_subs = self.query(language, 'episode', keyword=keyword, season=video.season, episode=video.episode)
+ if(len(partial_subs) > 0):
+ subtitles += partial_subs
+ continue
+
+ # (3)
+ logger.debug("Titulky.com: Finding subtitles by keyword (3)")
+ keyword = f"{video.series} S{video.season:02d}E{video.episode:02d}"
+ partial_subs = self.query(language, 'episode', keyword=keyword)
+ subtitles += partial_subs
elif isinstance(video, Movie):
- subtitles += [s for s in self.query(title, year=video.year, video=video)
- if s.language in languages]
-
+ # (1)
+ logger.debug("Titulky.com: Finding subtitles by IMDB ID (1)")
+ if video.imdb_id:
+ partial_subs = self.query(language, 'movie', imdb_id=video.imdb_id)
+ if(len(partial_subs) > 0):
+ subtitles += partial_subs
+ continue
+
+ # (2)
+ logger.debug("Titulky.com: Finding subtitles by keyword (2)")
+ keyword = video.title
+ partial_subs = self.query(language, 'movie', keyword=keyword)
+ subtitles += partial_subs
+
return subtitles
+# The rest is mostly old code from the original implementation. Might want to redo it.
def download_subtitle(self, subtitle):
- if isinstance(subtitle, TitulkySubtitle):
- # download the subtitle
- logger.info('Downloading subtitle %r', subtitle)
- r = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link},
- timeout=30)
- r.raise_for_status()
+ res = self.session.get(subtitle.download_link, headers={'Referer': subtitle.page_link},
+ timeout=self.timeout)
+ res.raise_for_status()
- if not r.content:
- logger.debug('Unable to download subtitle. No data returned from provider')
- return
- elif 'Limit vyčerpán' in r.text:
- raise DownloadLimitExceeded
-
- soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])
- # links = soup.find("a", {"id": "downlink"}).find_all('a')
- link = soup.find(id="downlink")
- # TODO: add settings for choice
-
- url = self.dn_url + link.get('href')
- time.sleep(0.5)
- r = self.session.get(url, headers={'Referer': subtitle.download_link},
- timeout=30)
- r.raise_for_status()
-
-
- archive_stream = io.BytesIO(r.content)
+ archive_stream = io.BytesIO(res.content)
archive = None
if rarfile.is_rarfile(archive_stream):
- logger.debug('Identified rar archive')
+ logger.debug("Titulky.com: Identified rar archive")
archive = rarfile.RarFile(archive_stream)
subtitle_content = _get_subtitle_from_archive(archive)
elif zipfile.is_zipfile(archive_stream):
- logger.debug('Identified zip archive')
+ logger.debug("Titulky.com: Identified zip archive")
archive = zipfile.ZipFile(archive_stream)
subtitle_content = _get_subtitle_from_archive(archive)
else:
- subtitle_content = r.content
-
+ subtitle_content = res.content
+
if subtitle_content:
subtitle.content = fix_line_ending(subtitle_content)
+ return subtitle_content
else:
- logger.debug('Could not extract subtitle from %r', archive)
-
+ logger.debug(f"Titulky.com: Could not extract subtitle from {archive}")
def _get_subtitle_from_archive(archive):
+ if '_info.txt' in archive.namelist():
+ info_content_binary = archive.read('_info.txt')
+ info_content = info_content_binary.decode(chardet.detect(info_content_binary)['encoding'])
+ if "nestaženo - překročen limit" in info_content:
+ raise DownloadLimitExceeded("The download limit has been exceeded")
+
for name in archive.namelist():
# discard hidden files
if os.path.split(name)[-1].startswith('.'):
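
Note: the most notable structural change above is the batched multithreading in query(): result rows are processed max_threads at a time, each worker writes its outcome into a slot of a shared threads_data list, and the main thread joins the whole batch before starting the next one. A simplified, self-contained sketch of that pattern (the worker below is a stand-in, not the provider's actual process_row):

    import math
    from threading import Thread

    def process_item(item, thread_id, results):
        # Stand-in worker: record the result in the slot owned by this thread,
        # mirroring how process_row() fills threads_data[thread_id].
        results[thread_id] = {'value': item * 2, 'exception': None}

    items = list(range(12))
    max_threads = 5

    results = [None] * len(items)
    threads = [None] * len(items)

    for batch in range(math.ceil(len(items) / max_threads)):
        start = batch * max_threads                   # inclusive
        end = min(start + max_threads, len(items))    # non-inclusive

        # Start one thread per row in this batch...
        for j in range(start, end):
            threads[j] = Thread(target=process_item, args=(items[j], j, results))
            threads[j].start()

        # ...and wait for the whole batch to finish before moving on to the next one.
        for j in range(start, end):
            threads[j].join()

    print([r['value'] for r in results])  # [0, 2, 4, ..., 22]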