author     Ionuț Neagu <[email protected]>   2023-01-09 14:47:44 +0200
committer  GitHub <[email protected]>   2023-01-09 07:47:44 -0500
commit     237e85bd3454ce061f2632ba066c5f3ed2e5c31e (patch)
tree       c2489f2745f69a786d5667271a874ecf1e4fab85 /libs
parent     9105c53d6a9ebd309710ee47651b5a422da1886f (diff)
download   bazarr-237e85bd3454ce061f2632ba066c5f3ed2e5c31e.tar.gz
           bazarr-237e85bd3454ce061f2632ba066c5f3ed2e5c31e.zip
Fixed RegieLive provider to use the official API
Diffstat (limited to 'libs')
-rw-r--r--  libs/subliminal_patch/providers/regielive.py  483
1 file changed, 40 insertions, 443 deletions
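
The change replaces the old scraping helper (the bulk of the deletions below) with a single request against RegieLive's official search endpoint. The following is a minimal standalone sketch of the new request flow; the endpoint, the RL-API header, the payload keys, and the response shape are taken from the added code in the diff, while the example title and the use of requests' params= (instead of the hand-built query string the provider itself uses) are illustrative.

# Minimal sketch of the official RegieLive search call introduced by this commit.
from requests import Session

API_URL = 'https://api.regielive.ro/bazarr/search.php'

session = Session()
headers = {'RL-API': 'API-BAZARR-YTZ-SL'}

# Episode search: series name plus season and episode numbers;
# a movie search would instead send {'nume': <title>, 'an': <year>}.
payload = {'nume': 'Chernobyl', 'sezon': 1, 'episod': 4}  # example values

response = session.get(API_URL, params=payload, headers=headers)
data = response.json()

# The API wraps results in a JSON envelope with an HTTP-style 'cod' field;
# 'rezultate' maps film ids to entries whose 'subtitrari' dict holds the subs.
if data['cod'] == 200:
    for film in data['rezultate'].values():
        for sub in film['subtitrari'].values():
            print(sub['titlu'], sub['url'], sub['rating']['nota'])
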
diff --git a/libs/subliminal_patch/providers/regielive.py b/libs/subliminal_patch/providers/regielive.py
index 2d97d7adb..d20972f03 100644
--- a/libs/subliminal_patch/providers/regielive.py
+++ b/libs/subliminal_patch/providers/regielive.py
@@ -4,29 +4,18 @@ import logging
import io
import os
-#imports needed for the searchAPI
-import re
-import enum
-import numpy as np
-from time import sleep
-from hashlib import sha1
-from subliminal.cache import region
-from urllib import parse as urlparse
-#end imports for searchAPI
-
-import zipfile
-
+from requests import Session
from guessit import guessit
-from subzero.language import Language
from subliminal_patch.providers import Provider
from subliminal_patch.subtitle import Subtitle, guess_matches
-from subliminal.video import Episode, Movie
from subliminal.subtitle import SUBTITLE_EXTENSIONS, fix_line_ending
-from requests import Session, exceptions as req_exceptions
+from subliminal.video import Episode, Movie
+from subzero.language import Language
+import urllib
+import zipfile
logger = logging.getLogger(__name__)
-BASE_URL = "https://subtitrari.regielive.ro"
class RegieLiveSubtitle(Subtitle):
"""RegieLive Subtitle."""
@@ -49,7 +38,7 @@ class RegieLiveSubtitle(Subtitle):
def get_matches(self, video):
type_ = "movie" if isinstance(video, Movie) else "episode"
matches = set()
- subtitle_filename = self.filename
+ subtitle_filename = self.filename.lower()
# episode
if type_ == "episode":
@@ -60,9 +49,8 @@ class RegieLiveSubtitle(Subtitle):
# already matched in search query
matches.update(['title', 'year'])
- # release_group
if video.release_group and video.release_group.lower() in subtitle_filename:
- matches.add('release_group')
+ matches.update(['release_group', 'hash'])
matches |= guess_matches(video, guessit(self.filename, {"type": type_}))
@@ -74,59 +62,66 @@ class RegieLiveProvider(Provider):
languages = {Language(l) for l in ['ron']}
language = list(languages)[0]
video_types = (Episode, Movie)
- download_cookies = None
SEARCH_THROTTLE = 8
+ hash_verifiable = False
def __init__(self):
self.initialize()
def initialize(self):
self.session = Session()
- self.url = BASE_URL
- self.api = 'API-KODI-KINGUL'
+ self.url = 'https://api.regielive.ro/bazarr/search.php'
+ self.api = 'API-BAZARR-YTZ-SL'
self.headers = {'RL-API': self.api}
def terminate(self):
self.session.close()
def query(self, video, language):
- search_api = RegieLiveSearchAPI(video)
- results = search_api.search_video()
- subtitles = []
- if results:
- for result in results:
- logger.debug(result)
- subtitles.append(
- RegieLiveSubtitle(result.title, video, result.download_url, result.rating, language)
- )
+ payload = {}
+ if isinstance(video, Episode):
+ payload['nume'] = video.series
+ payload['sezon'] = video.season
+ payload['episod'] = video.episode
+ elif isinstance(video, Movie):
+ payload['nume'] = video.title
+ payload['an'] = video.year
+
+ response = self.session.get(
+ self.url + "?" + urllib.parse.urlencode(payload),
+ data=payload, headers=self.headers)
- # {'titlu': 'Chernobyl.S01E04.The.Happiness.of.All.Mankind.720p.AMZN.WEB-DL.DDP5.1.H.264-NTb', 'url': 'https://subtitrari.regielive.ro/descarca-33336-418567.zip', 'rating': {'nota': 4.89, 'voturi': 48}}
- # subtitle def __init__(self, language, filename, subtype, video, link):
- self.download_cookies = search_api.get_req_cookies()
+ subtitles = []
+ if response.json()['cod'] == 200:
+ results_subs = response.json()['rezultate']
+ for film in results_subs:
+ for sub in results_subs[film]['subtitrari']:
+ subtitles.append(
+ RegieLiveSubtitle(
+ results_subs[film]['subtitrari'][sub]['titlu'],
+ video,
+ results_subs[film]['subtitrari'][sub]['url'],
+ results_subs[film]['subtitrari'][sub]['rating']['nota'],
+ language))
return subtitles
def list_subtitles(self, video, languages):
return self.query(video, self.language)
def download_subtitle(self, subtitle):
- session = Session()
+ session = self.session
_addheaders = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
- 'Origin': BASE_URL,
- 'Accept-Language' : 'en-US,en;q=0.5',
- 'Referer': BASE_URL,
+ 'Origin': 'https://subtitrari.regielive.ro',
+ 'Accept-Language': 'en-US,en;q=0.5',
+ 'Referer': 'https://subtitrari.regielive.ro',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
}
-
session.headers.update(_addheaders)
- if self.download_cookies is None:  # try to get the needed cookies via a request if none exist from the API
- res = session.get(BASE_URL)
- cookies = res.cookies
- else:
- cookies = self.download_cookies
-
+ res = session.get('https://subtitrari.regielive.ro')
+ cookies = res.cookies
_zipped = session.get(subtitle.page_link, cookies=cookies)
if _zipped:
if _zipped.text == '500':
@@ -156,401 +151,3 @@ class RegieLiveProvider(Provider):
return archive.read(name)
raise APIThrottled('Can not find the subtitle in the compressed file')
-
-"""
-# Search helper for RegieLive that uses scraping to find subtitles.
-# This utility API returns a list of RegieLiveAPIData objects when queried,
-# using a mix of JSON API search and page scraping to fetch data
-# from the RegieLive website.
-#
-# This may break at any time, since the regexes are very sensitive to website
-# structure changes; in the future the regexes might be loaded directly from GitHub.
-"""
-
-#data classes
-class RegieLiveAPIData():
- 'data returned class'
- title = ''
- rating = None
- download_url = ''
-
- def __init__(self, title, url, rating):
- self.title = title
- self.download_url = url
- self.rating = rating
-
- def __repr__(self):
- return "<RegieLiveAPIData: title = \
- " + str(self.title) + "; download url = \
- " + str(self.download_url) + \
- "; rating = " + str(self.rating.rating) + "/" + str(self.rating.count) + ">"
-
-
-class RegieLiveAPIRating(): # probably an extraneous class
- 'rating for the subtitle'
- rating = 0
- count = 0
-
- def __init__(self, rating, count):
- if rating:
- self.rating = rating
-
- if not count:
- self.count = 0
- if count and isinstance(count, str) and count.isnumeric():
- self.count = count
- elif count == 'vot':
- self.count = 1
- else:
- self.count = 0
-
-
-# constants
-CACHE_PREFIX = 'RL_API'
-
-DEFAULT_HEADERS = {
- "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
- AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
- 'Origin': 'https://subtitrari.regielive.ro',
- 'Accept-Language': 'en-US,en;q=0.5',
- 'Referer': 'https://subtitrari.regielive.ro',
- 'Pragma': 'no-cache',
- 'Cache-Control': 'no-cache'
-}
-
-REQUEST_TIMEOUT = 15
-
-LITE_JSON_PATH = "/ajax/subtitrari/searchsuggest.php"
-PAGE_SEARCH_PATH = "/cauta.html"
-SEASON_URL = "sezonul-%i/"
-
-SUB_PAGE_EPISODE_PATTERN = r'(?ism)<h3>Episodul %s</h3>(.+?)</ul>'
-SUB_PAGE_MOVIE_MATCH = re.compile(
- r'(?ism)<div class="subtitrari">.*?<ul class="mt-6">(.+?)</ul>')
-
-SUB_FILE_INFO_MATCH = re.compile(
- r'(?ism)id="sub_\d+">([^<]+)</span>.*?Nota ([0-9.]+)\s+(?:dintr-un\s+?(\w+)|din\s+?([0-9]+)\s*?)[^>].*?<a href="([^"]+)".+?</li>')
-SEARCH_PAGE_MATCH = re.compile(
- r'(?ism)class="detalii\s[^>]{1}.+?<a href="([^"]+)"[^>]+?>([^<]+)</a>\s*<span.+?>\((\d{4})\)</span>')
-
-# helpers
-def title_match(s, t, ratio_calc=False):
- """ title_match:
- Calculates the Levenshtein distance between two strings.
- If ratio_calc = True, instead computes the Levenshtein
- similarity ratio of the two strings.
- This function is largely copied from the Levenshtein package.
- """
- # Initialize matrix of zeros
- rows = len(s)+1
- cols = len(t)+1
- distance = np.zeros((rows, cols), dtype=int)
-
- for i in range(1, rows):
- for k in range(1, cols):
- distance[i][0] = i
- distance[0][k] = k
-
- # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions
- for col in range(1, cols):
- for row in range(1, rows):
- if s[row-1] == t[col-1]:
- cost = 0
- else:
- # for the ratio calculation the cost of a substitution is 2; for distance it is 1.
- if ratio_calc:
- cost = 2
- else:
- cost = 1
- distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions
- # Cost of insertions
- distance[row][col-1] + 1,
- distance[row-1][col-1] + cost) # Cost of substitutions
- if ratio_calc:
- ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
- return ratio
- else:
- # This is the minimum number of edits needed to convert string a to string b
- return distance[row][col]
-
-class SearchTypes(enum.Enum):
- 'Search type based on video object received'
- Movie = 1
- Episode = 2
-
-
-class RegieLiveSearchAPI():
- 'Main class that interfaces with regielive sub provider'
- video = None
- title = None
- session = None
- search_type = SearchTypes.Movie
-
- def __init__(self, video):
- 'Constructor that needs a [Movie, Episode] object'
- self.video = video
- self.initialize()
-
- def initialize(self):
- 'Instance initialization goes here'
- if isinstance(self.video, Episode):
- self.search_type = SearchTypes.Episode
- self.title = self.video.series
- else:
- self.title = self.video.title
-
- self.session = Session()
- self.session.headers.update(DEFAULT_HEADERS)
- logger.debug('Initialized new RegieLiveSearchAPI with search type %s of object %s',
- self.search_type, str(self.video))
-
- def get_req_cookies(self):
- 'Get cookies used for request'
- if self.session:
- return self.session.cookies
-
- return None
-
- def search_video(self):
- 'Main function that should be called to get sub data back'
- if self.video is None:
- return None
-
- results = self.search_lite_api()
-
- if not results:
- sleep(2.0) # stagger requests in order to not flood the server
- results = self.search_page()
-
- if not results or results['data'] is None:
- return None # not logging since we can't get here without logging the reason elsewhere
-
- logger.debug(results)
- found_subs = self.parse_page(results)
- logger.debug(found_subs)
-
- return found_subs
-
- def parse_page(self, results):
- 'fetch and parse episode/movie page'
- if len(results['data']) > 1:
- logger.warning("More than one page result for subtitle %s with data %s",
- self.title,
- str(results['data']))
-
- sub_list = None
- if self.search_type is SearchTypes.Movie:
- sub_list = self.parse_movie_pages(results['data'])
- else:
- sub_list = self.parse_episode_pages(results['data'])
-
- return sub_list
-
- def parse_movie_pages(self, sub_page_data):
- 'Fetch and parse movie page data'
- sub_list = []
- for result in sub_page_data:
- extracted_subs = self.extract_movie_sub_block(
- self.get_page(result['url'], None))
- sub_data = self.parse_sub_block(extracted_subs)
- if sub_data:
- sub_list.extend(sub_data)
- else:
- logger.debug(
- 'Empty results from url %s with resulted block %s', result['url'], str(sub_data))
-
- return sub_list
-
- def parse_episode_pages(self, sub_page_data):
- 'Fetch and parse episode pages'
- season = SEASON_URL % self.video.season
- url = ''
- sub_list = []
- for result in sub_page_data:
- url = urlparse.urljoin(result['url'], season)
- extracted_subs = self.extract_episode_sub_block(
- self.get_page(url, None))
- sub_data = self.parse_sub_block(extracted_subs)
- if sub_data:
- sub_list.extend(sub_data)
- else:
- logger.debug(
- 'Empty results from url %s with resulted block %s', url, str(sub_data))
-
- return sub_list
-
- def search_page(self):
- """
- Scrape the site's search page for the title.
- Pagination is not handled, since titles should be fairly unique
- and this API is not a search engine; pagination can be added
- later if this turns out to be a problem.
- Returns an object similar to the lite API response, for consistency.
- """
- cache_key = sha1((CACHE_PREFIX + self.title).encode("utf-8"), usedforsecurity=False).digest()
- cached_response = region.get(cache_key)
- if cached_response:
- logger.info("Found cached reply for search request %s", self.title)
- return cached_response
-
- response = self.get_api_page(PAGE_SEARCH_PATH, {'s': self.title})
- data = {'error': True, 'data': []}
-
- if response:
- m_iter = SEARCH_PAGE_MATCH.finditer(response)
- if m_iter:
- for m in m_iter:
- data['data'].append({
- 'id': RegieLiveSearchAPI.get_id_from_url(m.group(1)),
- 'text': m.group(2),
- 'url': m.group(1),
- 'an': m.group(3)
- })
-
- # could be more efficient doing this in the previous iteration
- data['data'] = self.parse_json_results(data['data'])
-
- if data['data'] and len(data['data']) > 0:
- data['error'] = False
- region.set(cache_key, data)
-
- return data
-
- def search_lite_api(self):
- 'Access the lite json api for info'
- response = self.get_api_page(LITE_JSON_PATH, {'s': self.title}, True)
-
- if response is None:
- logger.warning(
- "Regielive lite API failed to provide a proper reply")
- return None
-
- if response['error'] or not response['data']:
- logger.warning("Regielive API responded with no results!")
- logger.info(response)
- return None
-
- response['data'] = self.parse_json_results(response['data'])
-
- return response
-
- def parse_json_results(self, data_arr):
- 'Parses the results of our lite api request'
- if not data_arr:
- return None
-
- result = list(filter(self.json_result_filter, data_arr))
-
- if not result:
- return None
-
- return result
-
- def json_result_filter(self, element):
- 'Filter function for json results'
- if not element:
- return False
-
- match_ratio = title_match(element['text'], self.title, True)
- element_year = RegieLiveSearchAPI.get_safe_int(element['an'])
-
- #if none have valid years we'll let it match
- #also RL sometimes has the year off by 1
- if abs(element_year - RegieLiveSearchAPI.get_safe_int(self.video.year)) <= 1 and match_ratio > 0.9:
- return True
-
- logger.info("No match for title %s year %i and returned title %s year %i match ratio %f",
- self.title,
- self.video.year,
- element['text'],
- element_year,
- match_ratio)
- return False
-
- def get_api_page(self, url, url_params, return_json=False):
- 'request a page from RL API'
- return self.get_page(urlparse.urljoin(BASE_URL, url), url_params, return_json)
-
- def get_page(self, url, url_params, return_json=False):
- 'Request a page'
- try:
- req = self.session.get(url, params=url_params,
- timeout=REQUEST_TIMEOUT,
- allow_redirects=True)
- req.raise_for_status()
-
- if return_json:
- return req.json()
-
- return req.text
- except req_exceptions.HTTPError as err:
- logger.exception(
- "Failed to request url %s\n Error %s", url, str(err))
-
- return None
-
- def extract_movie_sub_block(self, page_html):
- 'extract subtitles block from movie page'
- m = SUB_PAGE_MOVIE_MATCH.search(page_html)
- if m:
- return m.group(1)
-
- logger.info("Could not find subtitle block for Movie %s", self.title)
- return ''
-
- def extract_episode_sub_block(self, page_html):
- 'extract subtitle from series page'
- episode_zone_regex = SUB_PAGE_EPISODE_PATTERN % self.video.episode
- m = None
- try:
- m = re.search(episode_zone_regex, page_html)
- except Exception as err:
- logger.debug(str(page_html))
- logger.exception(err)
-
- if m:
- return m.group(1)
-
- logger.info("Could not find episode %i for season %i of series %s",
- self.video.episode,
- self.video.season,
- self.title)
- return ''
-
- def parse_sub_block(self, subs_block):
- 'Parse sub block into subtitle objects'
- if subs_block is None:
- return None
-
- m_iter = SUB_FILE_INFO_MATCH.finditer(subs_block)
- sub_list = []
- if m_iter:
- for match in m_iter:
- sub_list.append(
- RegieLiveAPIData(match.group(1), match.group(5),
- RegieLiveAPIRating(match.group(2), match.group(4))))
- else:
- logger.debug('No subtitles matched for sub block %s of title %s', str(
- subs_block), self.title)
-
- return sub_list
-
- @classmethod
- def get_id_from_url(cls, url):
- 'get the movie rl id from page url'
- m = re.search(r'(?ms)(\d+)/', url)
- if m:
- return m.group(1)
-
- return 0
-
- @classmethod
- def get_safe_int(cls, value, default_value = 0):
- 'returns an int from the supplied value or a default'
- if value and ( isinstance(value, int) or (isinstance(value, str) and value.isdigit()) ):
- return int(value)
-
- return default_value
-
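
The download path kept by the patch is simpler but follows the old shape: prime cookies against subtitrari.regielive.ro, fetch the ZIP with browser-like headers, and read the first archive entry with a known subtitle extension. Below is a minimal sketch of that flow under the imports the file already uses; the helper name fetch_subtitle_bytes, the trimmed header set, and the standalone structure are illustrative, and the real logic lives in RegieLiveProvider.download_subtitle.

import io
import zipfile
from requests import Session
from subliminal.subtitle import SUBTITLE_EXTENSIONS  # same import as the provider

def fetch_subtitle_bytes(page_link):
    # Hypothetical helper; the provider performs these steps inline.
    session = Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Referer': 'https://subtitrari.regielive.ro',
    })
    # The site only serves archives to clients that carry its cookies.
    cookies = session.get('https://subtitrari.regielive.ro').cookies
    zipped = session.get(page_link, cookies=cookies)
    if zipped.text == '500':  # the server signals throttling in the body
        return None
    archive = zipfile.ZipFile(io.BytesIO(zipped.content))
    for name in archive.namelist():
        # Return the first entry that looks like a subtitle file.
        if name.lower().endswith(tuple(SUBTITLE_EXTENSIONS)):
            return archive.read(name)
    return None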