author    | Louis Vézina <[email protected]> | 2019-09-13 15:12:26 -0400
committer | Louis Vézina <[email protected]> | 2019-09-13 15:12:26 -0400
commit    | 645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree      | 005362d11c260b4293b427ed8ba9d94e8b478ec3
parent    | 4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
WIP
119 files changed, 2349 insertions, 5267 deletions
@@ -1,5 +1,7 @@
 # coding=utf-8
+from __future__ import absolute_import
+from __future__ import print_function
 import subprocess as sp
 import time
 import os
@@ -12,14 +14,16 @@ from bazarr.get_args import args
 def check_python_version():
     python_version = platform.python_version_tuple()
     minimum_python_version_tuple = (2, 7, 13)
+    minimum_python3_version_tuple = (3, 6, 0)
     minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
+    minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
 
-    if int(python_version[0]) > minimum_python_version_tuple[0]:
-        print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
+    if int(python_version[0]) == minimum_python3_version_tuple[0] and int(python_version[1]) < minimum_python3_version_tuple[1]:
+        print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)
-    elif int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]:
-        print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
+    elif int(python_version[0]) == minimum_python_version_tuple[0] and (int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]):
+        print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)
@@ -32,10 +36,10 @@ def start_bazarr():
     script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
     ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
-    print "Bazarr starting..."
+    print("Bazarr starting...")
     try:
         for line in iter(ep.stdout.readline, ''):
-            sys.stdout.write(line)
+            sys.stdout.buffer.write(line)
     except KeyboardInterrupt:
         pass
@@ -60,16 +64,16 @@ if __name__ == '__main__':
         try:
             os.remove(stopfile)
         except:
-            print 'Unable to delete stop file.'
+            print('Unable to delete stop file.')
         else:
-            print 'Bazarr exited.'
+            print('Bazarr exited.')
             os._exit(0)
 
     if os.path.exists(restartfile):
         try:
             os.remove(restartfile)
         except:
-            print 'Unable to delete restart file.'
+            print('Unable to delete restart file.')
         else:
             start_bazarr()
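The reworked check accepts two disjoint version ranges, 2.7.13+ and 3.6+. A minimal standalone sketch of the same gate, using only the tuple values shown in the hunk (the helper name is hypothetical):

```python
import platform

MIN_PY2 = (2, 7, 13)  # values from the diff above
MIN_PY3 = (3, 6, 0)

def python_version_ok():
    # python_version_tuple() yields strings; a patch level like '13+' needs the rstrip
    major, minor, patch = (int(p.rstrip('+')) for p in platform.python_version_tuple())
    if major == 2:
        return (major, minor, patch) >= MIN_PY2
    if major == 3:
        return (major, minor, patch) >= MIN_PY3
    return False
```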
diff --git a/bazarr/analytics.py b/bazarr/analytics.py
index a751dc477..64c147bc2 100644
--- a/bazarr/analytics.py
+++ b/bazarr/analytics.py
@@ -1,6 +1,7 @@
 # coding=utf-8
-import cPickle as pickle
+from __future__ import absolute_import
+import six.moves.cPickle as pickle
 import base64
 import random
 import platform
@@ -30,7 +31,7 @@ def track_event(category=None, action=None, label=None):
         visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
     except:
         visitor = Visitor()
-    unique_id = long(random.getrandbits(32))
+    unique_id = int(random.getrandbits(32))
     visitor.unique_id = unique_id
 
     session = Session()
diff --git a/bazarr/check_update.py b/bazarr/check_update.py
index 08b1946fc..033e34896 100644
--- a/bazarr/check_update.py
+++ b/bazarr/check_update.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import json
diff --git a/bazarr/config.py b/bazarr/config.py
index d5b606e8e..810cb4d54 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 
 from simpleconfigparser import simpleconfigparser
diff --git a/bazarr/database.py b/bazarr/database.py
index bee060907..8ec8b02b9 100644
--- a/bazarr/database.py
+++ b/bazarr/database.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import os
 import atexit
diff --git a/bazarr/embedded_subs_reader.py b/bazarr/embedded_subs_reader.py
index df9625d4c..4d4819d38 100644
--- a/bazarr/embedded_subs_reader.py
+++ b/bazarr/embedded_subs_reader.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import enzyme
 import logging
 import os
diff --git a/bazarr/get_args.py b/bazarr/get_args.py
index a92052729..3eb99c2a9 100644
--- a/bazarr/get_args.py
+++ b/bazarr/get_args.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import argparse
diff --git a/bazarr/get_episodes.py b/bazarr/get_episodes.py
index 09a1220b8..c1f280d1d 100644
--- a/bazarr/get_episodes.py
+++ b/bazarr/get_episodes.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging
diff --git a/bazarr/get_languages.py b/bazarr/get_languages.py
index ab71deb3f..a3658c1a4 100644
--- a/bazarr/get_languages.py
+++ b/bazarr/get_languages.py
@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 
 import pycountry
diff --git a/bazarr/get_movies.py b/bazarr/get_movies.py
index 6a754ae32..303dbcc90 100644
--- a/bazarr/get_movies.py
+++ b/bazarr/get_movies.py
@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging
@@ -13,6 +14,7 @@ from list_subtitles import store_subtitles_movie, list_missing_subtitles_movies,
 from get_subtitle import movies_download_subtitles
 from database import TableMovies, wal_cleaning
+import six
 
 
 def update_all_movies():
@@ -82,7 +84,7 @@ def update_movies():
             if movie["path"] != None and movie['movieFile']['relativePath'] != None:
                 try:
-                    overview = unicode(movie['overview'])
+                    overview = six.text_type(movie['overview'])
                 except:
                     overview = ""
                 try:
@@ -136,27 +138,27 @@ def update_movies():
                     audioCodec = None
 
                 # Add movies in radarr to current movies list
-                current_movies_radarr.append(unicode(movie['tmdbId']))
+                current_movies_radarr.append(six.text_type(movie['tmdbId']))
 
-                if unicode(movie['tmdbId']) in current_movies_db_list:
+                if six.text_type(movie['tmdbId']) in current_movies_db_list:
                     movies_to_update.append({'radarr_id': movie["id"],
-                                             'title': unicode(movie["title"]),
-                                             'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
-                                             'tmdb_id': unicode(movie["tmdbId"]),
-                                             'poster': unicode(poster),
-                                             'fanart': unicode(fanart),
-                                             'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
+                                             'title': six.text_type(movie["title"]),
+                                             'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
+                                             'tmdb_id': six.text_type(movie["tmdbId"]),
+                                             'poster': six.text_type(poster),
+                                             'fanart': six.text_type(fanart),
+                                             'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
                                              'scene_name': sceneName,
-                                             'monitored': unicode(bool(movie['monitored'])),
-                                             'year': unicode(movie['year']),
-                                             'sort_title': unicode(movie['sortTitle']),
-                                             'alternative_titles': unicode(alternativeTitles),
-                                             'format': unicode(format),
-                                             'resolution': unicode(resolution),
-                                             'video_codec': unicode(videoCodec),
-                                             'audio_codec': unicode(audioCodec),
-                                             'overview': unicode(overview),
-                                             'imdb_id': unicode(imdbId)})
+                                             'monitored': six.text_type(bool(movie['monitored'])),
+                                             'year': six.text_type(movie['year']),
+                                             'sort_title': six.text_type(movie['sortTitle']),
+                                             'alternative_titles': six.text_type(alternativeTitles),
+                                             'format': six.text_type(format),
+                                             'resolution': six.text_type(resolution),
+                                             'video_codec': six.text_type(videoCodec),
+                                             'audio_codec': six.text_type(audioCodec),
+                                             'overview': six.text_type(overview),
+                                             'imdb_id': six.text_type(imdbId)})
                 else:
                     if movie_default_enabled is True:
                         movies_to_add.append({'radarr_id': movie["id"],
@@ -171,7 +173,7 @@ def update_movies():
                                               'fanart': fanart,
                                               'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                               'scene_name': sceneName,
-                                              'monitored': unicode(bool(movie['monitored'])),
+                                              'monitored': six.text_type(bool(movie['monitored'])),
                                               'sort_title': movie['sortTitle'],
                                               'year': movie['year'],
                                               'alternative_titles': alternativeTitles,
@@ -191,7 +193,7 @@ def update_movies():
                                               'fanart': fanart,
                                               'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                               'scene_name': sceneName,
-                                              'monitored': unicode(bool(movie['monitored'])),
+                                              'monitored': six.text_type(bool(movie['monitored'])),
                                               'sort_title': movie['sortTitle'],
                                               'year': movie['year'],
                                               'alternative_titles': alternativeTitles,
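Every `unicode(...)` call above becomes `six.text_type(...)`, which aliases `unicode` on Python 2 and `str` on Python 3, so the same coercion runs on both interpreters. A small illustration (the helper name is hypothetical):

```python
import six

def to_text(value, default=u""):
    # Mirrors the try/except coercion pattern used for movie['overview'] above
    try:
        return six.text_type(value)
    except Exception:
        return default

assert isinstance(to_text(42), six.text_type)  # u'42' on py2, '42' on py3
```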
diff --git a/bazarr/get_providers.py b/bazarr/get_providers.py
index 461a21615..5f4d84884 100644
--- a/bazarr/get_providers.py
+++ b/bazarr/get_providers.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import datetime
 import logging
@@ -159,8 +160,8 @@ def provider_throttle(name, exception):
 def throttled_count(name):
     global throttle_count
-    if name in throttle_count.keys():
-        if 'count' in throttle_count[name].keys():
+    if name in list(throttle_count.keys()):
+        if 'count' in list(throttle_count[name].keys()):
             for key, value in throttle_count[name].items():
                 if key == 'count':
                     value += 1
diff --git a/bazarr/get_series.py b/bazarr/get_series.py
index b3ac34d28..a453f65ca 100644
--- a/bazarr/get_series.py
+++ b/bazarr/get_series.py
@@ -1,5 +1,7 @@
 # coding=utf-8
+from __future__ import absolute_import
+from __future__ import print_function
 import os
 import requests
 import logging
@@ -12,6 +14,7 @@ from config import settings, url_sonarr
 from list_subtitles import list_missing_subtitles
 from database import TableShows
 from utils import get_sonarr_version
+import six
 
 
 def update_series():
@@ -60,7 +63,7 @@ def update_series():
         for i, show in enumerate(r.json(), 1):
             notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
             try:
-                overview = unicode(show['overview'])
+                overview = six.text_type(show['overview'])
             except:
                 overview = ""
             try:
@@ -82,17 +85,17 @@ def update_series():
             current_shows_sonarr.append(show['tvdbId'])
 
             if show['tvdbId'] in current_shows_db_list:
-                series_to_update.append({'title': unicode(show["title"]),
-                                         'path': unicode(show["path"]),
+                series_to_update.append({'title': six.text_type(show["title"]),
+                                         'path': six.text_type(show["path"]),
                                          'tvdb_id': int(show["tvdbId"]),
                                          'sonarr_series_id': int(show["id"]),
-                                         'overview': unicode(overview),
-                                         'poster': unicode(poster),
-                                         'fanart': unicode(fanart),
-                                         'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
-                                         'sort_title': unicode(show['sortTitle']),
-                                         'year': unicode(show['year']),
-                                         'alternate_titles': unicode(alternateTitles)})
+                                         'overview': six.text_type(overview),
+                                         'poster': six.text_type(poster),
+                                         'fanart': six.text_type(fanart),
+                                         'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
+                                         'sort_title': six.text_type(show['sortTitle']),
+                                         'year': six.text_type(show['year']),
+                                         'alternate_titles': six.text_type(alternateTitles)})
             else:
                 if serie_default_enabled is True:
                     series_to_add.append({'title': show["title"],
@@ -161,9 +164,9 @@ def update_series():
     removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))
 
     for series in removed_series:
-        print TableShows.delete().where(
+        print(TableShows.delete().where(
             TableShows.tvdb_id == series
-        ).execute()
+        ).execute())
 
     logging.debug('BAZARR All series synced from Sonarr into database.')
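The `list(throttle_count.keys())` wrappers in the get_providers.py hunk are futurize's mechanical output: `dict.keys()` returns a view on Python 3, but a plain membership test already works on both interpreters without a copy. One way to express the same check, as a sketch with made-up data:

```python
throttle_count = {"opensubtitles": {"count": 1}}  # illustrative state

name = "opensubtitles"
if name in throttle_count and "count" in throttle_count[name]:
    # direct key access replaces the key/value scan in the original loop
    throttle_count[name]["count"] += 1
```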
diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py
index a12fadd92..c2567ead4 100644
--- a/bazarr/get_subtitle.py
+++ b/bazarr/get_subtitle.py
@@ -1,12 +1,13 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import sys
 import ast
 import logging
 import subprocess
 import time
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import codecs
 import types
 import re
@@ -37,6 +38,9 @@ from database import TableShows, TableEpisodes, TableMovies, TableHistory, Table
 from peewee import fn, JOIN
 
 from analytics import track_event
+import six
+from six.moves import range
+from functools import reduce
 
 
 def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
@@ -91,11 +95,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
     """
     max_score = 120.0
     min_score = max_score * min_score_movie_perc / 100.0
-    scores = subliminal_scores.movie_scores.keys()
+    scores = list(subliminal_scores.movie_scores.keys())
     if media_type == "series":
         max_score = 360.0
         min_score = max_score * min_score_series_perc / 100.0
-        scores = subliminal_scores.episode_scores.keys()
+        scores = list(subliminal_scores.episode_scores.keys())
         if video.is_special:
             min_score = max_score * min_score_special_ep / 100.0
@@ -119,7 +123,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         hi = "force non-HI"
 
     language_set = set()
-    if not isinstance(language, types.ListType):
+    if not isinstance(language, list):
         language = [language]
 
     if forced == "True":
@@ -185,7 +189,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         saved_any = False
         if downloaded_subtitles:
-            for video, subtitles in downloaded_subtitles.iteritems():
+            for video, subtitles in six.iteritems(downloaded_subtitles):
                 if not subtitles:
                     continue
@@ -221,10 +225,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
                 else:
                     action = "downloaded"
                 if video.used_scene_name:
-                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                         round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
                 else:
-                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                         round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
 
                 if use_postprocessing is True:
@@ -444,7 +448,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
             downloaded_path = saved_subtitle.storage_path
             logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
             is_forced_string = " forced" if subtitle.language.forced else ""
-            message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
+            message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
                 score) + "% using manual search."
 
             if use_postprocessing is True:
@@ -749,7 +753,7 @@ def wanted_download_subtitles(path, l, count_episodes):
     for episode in episodes_details:
         attempt = episode.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(episode.missing_subtitles):
             if attempt is None:
@@ -762,7 +766,7 @@ def wanted_download_subtitles(path, l, count_episodes):
 
             TableEpisodes.update(
                 {
-                    TableEpisodes.failed_attempts: unicode(attempt)
+                    TableEpisodes.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id
@@ -818,7 +822,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
     for movie in movies_details:
         attempt = movie.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(movie.missing_subtitles):
             if attempt is None:
@@ -831,7 +835,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
 
             TableMovies.update(
                 {
-                    TableMovies.failed_attempts: unicode(attempt)
+                    TableMovies.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableMovies.radarr_id == movie.radarr_id
@@ -991,7 +995,7 @@ def refine_from_db(path, video):
             TableMovies.audio_codec,
             TableMovies.imdb_id
         ).where(
-            TableMovies.path == unicode(path_replace_reverse_movie(path))
+            TableMovies.path == six.text_type(path_replace_reverse_movie(path))
         ).first()
 
         if data:
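`dict.iteritems()` no longer exists on Python 3; `six.iteritems()` picks the lazy iterator on either interpreter (`iteritems` on 2, `items` on 3). A sketch of the loop shape used in `download_subtitle`, with made-up data:

```python
from __future__ import print_function
import six

downloaded_subtitles = {"episode.mkv": ["srt-en"], "movie.mkv": []}

for video, subtitles in six.iteritems(downloaded_subtitles):
    if not subtitles:
        continue  # same early-out as the hunk above
    print(video, "->", len(subtitles), "subtitle(s)")
```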
diff --git a/bazarr/helper.py b/bazarr/helper.py
index 9519651bf..6cd409b5a 100644
--- a/bazarr/helper.py
+++ b/bazarr/helper.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import ast
 import os
 import re
@@ -126,7 +127,7 @@ def force_unicode(s):
     :param s: string
     :return: unicode string
     """
-    if not isinstance(s, types.UnicodeType):
+    if not isinstance(s, str):
         try:
             s = s.decode("utf-8")
         except UnicodeDecodeError:
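With `types.UnicodeType` gone, the check collapses to `isinstance(s, str)`: on Python 3 every `str` is already text, and only `bytes` input needs decoding. A Python 3 reading of the function (the latin-1 fallback is an assumption; the real fallback branch is elided in the hunk):

```python
def force_unicode(s):
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            s = s.decode("latin-1", errors="replace")  # assumed fallback
    return s

assert force_unicode(b"caf\xc3\xa9") == u"café"
```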
string """ - if not isinstance(s, types.UnicodeType): + if not isinstance(s, str): try: s = s.decode("utf-8") except UnicodeDecodeError: diff --git a/bazarr/init.py b/bazarr/init.py index 87ff2a9d7..a0cc711a5 100644 --- a/bazarr/init.py +++ b/bazarr/init.py @@ -1,12 +1,13 @@ # coding=utf-8 +from __future__ import absolute_import import os import logging import time import rarfile from cork import Cork -from ConfigParser2 import ConfigParser +from backports import configparser2 from config import settings from check_update import check_releases from get_args import args @@ -66,7 +67,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')): config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini')) -cfg = ConfigParser() +cfg = configparser2.ConfigParser() def init_binaries(): diff --git a/bazarr/libs.py b/bazarr/libs.py index 37b8809ac..5cab7b473 100644 --- a/bazarr/libs.py +++ b/bazarr/libs.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os import sys diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py index dcac64f4b..cc8432dcf 100644 --- a/bazarr/list_subtitles.py +++ b/bazarr/list_subtitles.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import gc import os import babelfish @@ -24,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \ from queueconfig import notifications from embedded_subs_reader import embedded_subs_reader +import six gc.enable() @@ -63,7 +65,7 @@ def store_subtitles(file): logging.exception("BAZARR unable to index external subtitles.") pass else: - for subtitle, language in subtitles.iteritems(): + for subtitle, language in six.iteritems(subtitles): subtitle_path = get_external_subtitles_path(file, subtitle) if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)): logging.debug("BAZARR external subtitles detected: " + "pb") @@ -155,7 +157,7 @@ def store_subtitles_movie(file): logging.exception("BAZARR unable to index external subtitles.") pass else: - for subtitle, language in subtitles.iteritems(): + for subtitle, language in six.iteritems(subtitles): if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True: logging.debug("BAZARR external subtitles detected: " + "pb") actual_subtitles.append( diff --git a/bazarr/logger.py b/bazarr/logger.py index 90931270f..03334354f 100644 --- a/bazarr/logger.py +++ b/bazarr/logger.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os import logging import re @@ -9,6 +10,7 @@ import platform from logging.handlers import TimedRotatingFileHandler from get_args import args from config import settings +import six logger = logging.getLogger() @@ -107,10 +109,10 @@ class MyFilter(logging.Filter): class ArgsFilteringFilter(logging.Filter): def filter_args(self, record, func): - if isinstance(record.args, (types.ListType, types.TupleType)): + if isinstance(record.args, (list, tuple)): final_args = [] for arg in record.args: - if not isinstance(arg, basestring): + if not isinstance(arg, six.string_types): final_args.append(arg) continue @@ -118,7 +120,7 @@ class ArgsFilteringFilter(logging.Filter): record.args = type(record.args)(final_args) elif isinstance(record.args, dict): for key, arg in record.args.items(): - if not isinstance(arg, basestring): + if not isinstance(arg, six.string_types): continue record.args[key] = func(arg) diff --git a/bazarr/main.py b/bazarr/main.py index 
diff --git a/bazarr/main.py b/bazarr/main.py
index 33917be77..819c16bd6 100644
--- a/bazarr/main.py
+++ b/bazarr/main.py
@@ -1,5 +1,8 @@
 # coding=utf-8
 
+import six
+from six.moves import zip
+from functools import reduce
 bazarr_version = '0.8.2'
 
 import gc
@@ -12,7 +15,7 @@ import pretty
 import math
 import ast
 import hashlib
-import urllib
+import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
 import warnings
 import queueconfig
 import platform
@@ -1575,12 +1578,12 @@ def save_settings():
     settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
     settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
 
-    before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
-              unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
-              unicode(settings.general.getboolean('use_radarr')), unicode(settings.general.path_mappings_movie))
-    after = (unicode(settings_general_ip), int(settings_general_port), unicode(settings_general_baseurl),
-             unicode(settings_general_pathmapping), unicode(settings_general_use_sonarr),
-             unicode(settings_general_use_radarr), unicode(settings_general_pathmapping_movie))
+    before = (six.text_type(settings.general.ip), int(settings.general.port), six.text_type(settings.general.base_url),
+              six.text_type(settings.general.path_mappings), six.text_type(settings.general.getboolean('use_sonarr')),
+              six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
+    after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
+             six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
+             six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))
 
     settings.general.ip = text_type(settings_general_ip)
     settings.general.port = text_type(settings_general_port)
@@ -1645,7 +1648,7 @@ def save_settings():
     settings_proxy_password = request.forms.get('settings_proxy_password')
     settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
 
-    before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
+    before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
     if before_proxy_password[0] != settings_proxy_type:
         configured()
     if before_proxy_password[1] == settings_proxy_password:
@@ -2029,7 +2032,7 @@ def remove_subtitles():
         history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles(unicode(episodePath))
+    store_subtitles(six.text_type(episodePath))
     list_missing_subtitles(sonarrSeriesId)
 
@@ -2048,7 +2051,7 @@ def remove_subtitles_movie():
         history_log_movie(0, radarrId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles_movie(unicode(moviePath))
+    store_subtitles_movie(six.text_type(moviePath))
     list_missing_subtitles_movies(radarrId)
 
@@ -2082,7 +2085,7 @@ def get_subtitle():
             score = result[4]
             history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:
@@ -2140,7 +2143,7 @@ def manual_get_subtitle():
             score = result[4]
             history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:
@@ -2184,7 +2187,7 @@ def perform_manual_upload_subtitle():
         score = 360
         history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
         send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-        store_subtitles(unicode(episodePath))
+        store_subtitles(six.text_type(episodePath))
         list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
 
@@ -2221,7 +2224,7 @@ def get_subtitle_movie():
             score = result[4]
             history_log_movie(1, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:
@@ -2277,7 +2280,7 @@ def manual_get_subtitle_movie():
             score = result[4]
             history_log_movie(2, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:
@@ -2320,7 +2323,7 @@ def perform_manual_upload_subtitle_movie():
         score = 120
         history_log_movie(4, radarrId, message, path, language_code, provider, score)
         send_notifications_movie(radarrId, message)
-        store_subtitles_movie(unicode(moviePath))
+        store_subtitles_movie(six.text_type(moviePath))
         list_missing_subtitles_movies(radarrId)
         redirect(ref)
 
@@ -2421,7 +2424,7 @@ def api_history():
 @route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_url(protocol, url):
-    url = urllib.unquote(url)
+    url = six.moves.urllib.parse.unquote(url)
     try:
         result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
     except:
@@ -2433,7 +2436,7 @@ def test_url(protocol, url):
 @route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_notification(protocol, provider):
-    provider = urllib.unquote(provider)
+    provider = six.moves.urllib.parse.unquote(provider)
     apobj = apprise.Apprise()
     apobj.add(protocol + "://" + provider)
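The bare `import urllib` cannot survive Python 3, where the module was split into `urllib.request`, `urllib.parse`, and `urllib.error`; `six.moves.urllib` papers over the split so the same dotted call works on both interpreters:

```python
from six.moves.urllib.parse import unquote

# the same decoding test_url() performs on its <url:path> parameter
assert unquote("sonarr%3A8989/api") == "sonarr:8989/api"
```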
diff --git a/bazarr/notifier.py b/bazarr/notifier.py
index 9a0eea98f..3b4fbea85 100644
--- a/bazarr/notifier.py
+++ b/bazarr/notifier.py
@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import apprise
 import os
 import logging
diff --git a/bazarr/queueconfig.py b/bazarr/queueconfig.py
index fd33c1850..5ba427d59 100644
--- a/bazarr/queueconfig.py
+++ b/bazarr/queueconfig.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from collections import deque
 import json
diff --git a/bazarr/scheduler.py b/bazarr/scheduler.py
index e7580ae2e..dee28bfbb 100644
--- a/bazarr/scheduler.py
+++ b/bazarr/scheduler.py
@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 from get_episodes import sync_episodes, update_all_episodes
 from get_movies import update_movies, update_all_movies
 from get_series import update_series
diff --git a/bazarr/utils.py b/bazarr/utils.py
index c3dde6c56..f86d35191 100644
--- a/bazarr/utils.py
+++ b/bazarr/utils.py
@@ -1,5 +1,6 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import time
 import platform
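The `from __future__ import absolute_import` line added to nearly every module makes Python 2 resolve bare imports the way Python 3 does: from `sys.path`, never implicitly from a sibling file in the same package. A hypothetical layout illustrating what this prevents (the sibling module name is made up):

```python
# Suppose the package contained a sibling file named logging.py. Under
# Python 2's implicit relative imports, "import logging" from another
# module in the package would load that file; with absolute_import in
# effect it loads the stdlib module on both interpreters.
from __future__ import absolute_import
import logging

logging.getLogger(__name__).debug("resolved from sys.path, not the package")
```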
diff --git a/libs/ConfigParser2.py b/libs/ConfigParser2.py
deleted file mode 100644
index 4ec642a5d..000000000
--- a/libs/ConfigParser2.py
+++ /dev/null
@@ -1,797 +0,0 @@
-"""Configuration file parser.
-
-A setup file consists of sections, lead by a "[section]" header,
-and followed by "name: value" entries, with continuations and such in
-the style of RFC 822.
-
-The option values can contain format strings which refer to other values in
-the same section, or values in a special [DEFAULT] section.
-
-For example:
-
-    something: %(dir)s/whatever
-
-would resolve the "%(dir)s" to the value of dir.  All reference
-expansions are done late, on demand.
-
-Intrinsic defaults can be specified by passing them into the
-ConfigParser constructor as a dictionary.
-
-class:
-
-ConfigParser -- responsible for parsing a list of
-                configuration files, and managing the parsed database.
-
-    methods:
-
-    __init__(defaults=None)
-        create the parser and specify a dictionary of intrinsic defaults.  The
-        keys must be strings, the values must be appropriate for %()s string
-        interpolation.  Note that `__name__' is always an intrinsic default;
-        its value is the section's name.
-
-    sections()
-        return all the configuration section names, sans DEFAULT
-
-    has_section(section)
-        return whether the given section exists
-
-    has_option(section, option)
-        return whether the given option exists in the given section
-
-    options(section)
-        return list of configuration options for the named section
-
-    read(filenames)
-        read and parse the list of named configuration files, given by
-        name.  A single filename is also allowed.  Non-existing files
-        are ignored.  Return list of successfully read files.
-
-    readfp(fp, filename=None)
-        read and parse one configuration file, given as a file object.
-        The filename defaults to fp.name; it is only used in error
-        messages (if fp has no `name' attribute, the string `<???>' is used).
-
-    get(section, option, raw=False, vars=None)
-        return a string value for the named option.  All % interpolations are
-        expanded in the return values, based on the defaults passed into the
-        constructor and the DEFAULT section.  Additional substitutions may be
-        provided using the `vars' argument, which must be a dictionary whose
-        contents override any pre-existing defaults.
-
-    getint(section, options)
-        like get(), but convert value to an integer
-
-    getfloat(section, options)
-        like get(), but convert value to a float
-
-    getboolean(section, options)
-        like get(), but convert value to a boolean (currently case
-        insensitively defined as 0, false, no, off for False, and 1, true,
-        yes, on for True).  Returns False or True.
-
-    items(section, raw=False, vars=None)
-        return a list of tuples with (name, value) for each option
-        in the section.
-
-    remove_section(section)
-        remove the given file section and all its options
-
-    remove_option(section, option)
-        remove the given option from the given section
-
-    set(section, option, value)
-        set the given option
-
-    write(fp)
-        write the configuration state in .ini format
-"""
-
-try:
-    from collections import OrderedDict as _default_dict
-except ImportError:
-    # fallback for setup.py which hasn't yet built _collections
-    _default_dict = dict
-
-import re
-
-__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError",
-           "InterpolationError", "InterpolationDepthError",
-           "InterpolationSyntaxError", "ParsingError",
-           "MissingSectionHeaderError",
-           "ConfigParser", "SafeConfigParser", "RawConfigParser",
-           "DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"]
-
-DEFAULTSECT = "DEFAULT"
-
-MAX_INTERPOLATION_DEPTH = 10
-
-
-
-# exception classes
-class Error(Exception):
-    """Base class for ConfigParser exceptions."""
-
-    def _get_message(self):
-        """Getter for 'message'; needed only to override deprecation in
-        BaseException."""
-        return self.__message
-
-    def _set_message(self, value):
-        """Setter for 'message'; needed only to override deprecation in
-        BaseException."""
-        self.__message = value
-
-    # BaseException.message has been deprecated since Python 2.6.  To prevent
-    # DeprecationWarning from popping up over this pre-existing attribute, use
-    # a new property that takes lookup precedence.
-    message = property(_get_message, _set_message)
-
-    def __init__(self, msg=''):
-        self.message = msg
-        Exception.__init__(self, msg)
-
-    def __repr__(self):
-        return self.message
-
-    __str__ = __repr__
-
-class NoSectionError(Error):
-    """Raised when no section matches a requested option."""
-
-    def __init__(self, section):
-        Error.__init__(self, 'No section: %r' % (section,))
-        self.section = section
-        self.args = (section, )
-
-class DuplicateSectionError(Error):
-    """Raised when a section is multiply-created."""
-
-    def __init__(self, section):
-        Error.__init__(self, "Section %r already exists" % section)
-        self.section = section
-        self.args = (section, )
-
-class NoOptionError(Error):
-    """A requested option was not found."""
-
-    def __init__(self, option, section):
-        Error.__init__(self, "No option %r in section: %r" %
-                       (option, section))
-        self.option = option
-        self.section = section
-        self.args = (option, section)
-
-class InterpolationError(Error):
-    """Base class for interpolation-related exceptions."""
-
-    def __init__(self, option, section, msg):
-        Error.__init__(self, msg)
-        self.option = option
-        self.section = section
-        self.args = (option, section, msg)
-
-class InterpolationMissingOptionError(InterpolationError):
-    """A string substitution required a setting which was not available."""
-
-    def __init__(self, option, section, rawval, reference):
-        msg = ("Bad value substitution:\n"
-               "\tsection: [%s]\n"
-               "\toption : %s\n"
-               "\tkey    : %s\n"
-               "\trawval : %s\n"
-               % (section, option, reference, rawval))
-        InterpolationError.__init__(self, option, section, msg)
-        self.reference = reference
-        self.args = (option, section, rawval, reference)
-
-class InterpolationSyntaxError(InterpolationError):
-    """Raised when the source text into which substitutions are made
-    does not conform to the required syntax."""
-
-class InterpolationDepthError(InterpolationError):
-    """Raised when substitutions are nested too deeply."""
-
-    def __init__(self, option, section, rawval):
-        msg = ("Value interpolation too deeply recursive:\n"
-               "\tsection: [%s]\n"
-               "\toption : %s\n"
-               "\trawval : %s\n"
-               % (section, option, rawval))
-        InterpolationError.__init__(self, option, section, msg)
-        self.args = (option, section, rawval)
-
-class ParsingError(Error):
-    """Raised when a configuration file does not follow legal syntax."""
-
-    def __init__(self, filename):
-        Error.__init__(self, 'File contains parsing errors: %s' % filename)
-        self.filename = filename
-        self.errors = []
-        self.args = (filename, )
-
-    def append(self, lineno, line):
-        self.errors.append((lineno, line))
-        self.message += '\n\t[line %2d]: %s' % (lineno, line)
-
-class MissingSectionHeaderError(ParsingError):
-    """Raised when a key-value pair is found before any section header."""
-
-    def __init__(self, filename, lineno, line):
-        Error.__init__(
-            self,
-            'File contains no section headers.\nfile: %s, line: %d\n%r' %
-            (filename, lineno, line))
-        self.filename = filename
-        self.lineno = lineno
-        self.line = line
-        self.args = (filename, lineno, line)
-
-
-class RawConfigParser:
-    def __init__(self, defaults=None, dict_type=_default_dict,
-                 allow_no_value=False):
-        self._dict = dict_type
-        self._sections = self._dict()
-        self._defaults = self._dict()
-        if allow_no_value:
-            self._optcre = self.OPTCRE_NV
-        else:
-            self._optcre = self.OPTCRE
-        if defaults:
-            for key, value in defaults.items():
-                self._defaults[self.optionxform(key)] = value
-        self.comment_store = None ## used for storing comments in ini
-
-
-    def defaults(self):
-        return self._defaults
-
-    def sections(self):
-        """Return a list of section names, excluding [DEFAULT]"""
-        # self._sections will never have [DEFAULT] in it
-        return self._sections.keys()
-
-    def add_section(self, section):
-        """Create a new section in the configuration.
-
-        Raise DuplicateSectionError if a section by the specified name
-        already exists. Raise ValueError if name is DEFAULT or any of it's
-        case-insensitive variants.
-        """
-        if section.lower() == "default":
-            raise ValueError, 'Invalid section name: %s' % section
-
-        if section in self._sections:
-            raise DuplicateSectionError(section)
-        self._sections[section] = self._dict()
-
-    def has_section(self, section):
-        """Indicate whether the named section is present in the configuration.
-
-        The DEFAULT section is not acknowledged.
-        """
-        return section in self._sections
-
-    def options(self, section):
-        """Return a list of option names for the given section name."""
-        try:
-            opts = self._sections[section].copy()
-        except KeyError:
-            raise NoSectionError(section)
-        opts.update(self._defaults)
-        if '__name__' in opts:
-            del opts['__name__']
-        return opts.keys()
-
-    def read(self, filenames):
-        """Read and parse a filename or a list of filenames.
-
-        Files that cannot be opened are silently ignored; this is
-        designed so that you can specify a list of potential
-        configuration file locations (e.g. current directory, user's
-        home directory, systemwide directory), and all existing
-        configuration files in the list will be read.  A single
-        filename may also be given.
-
-        Return list of successfully read files.
-        """
-        if isinstance(filenames, basestring):
-            filenames = [filenames]
-        read_ok = []
-        for filename in filenames:
-            try:
-                fp = open(filename)
-            except IOError:
-                continue
-            self._read(fp, filename)
-            fp.close()
-            read_ok.append(filename)
-        return read_ok
-
-    def readfp(self, fp, filename=None):
-        """Like read() but the argument must be a file-like object.
-
-        The `fp' argument must have a `readline' method.  Optional
-        second argument is the `filename', which if not given, is
-        taken from fp.name.  If fp has no `name' attribute, `<???>' is
-        used.
-
-        """
-        if filename is None:
-            try:
-                filename = fp.name
-            except AttributeError:
-                filename = '<???>'
-        self._read(fp, filename)
-
-    def get(self, section, option):
-        opt = self.optionxform(option)
-        if section not in self._sections:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-            if opt in self._defaults:
-                return self._defaults[opt]
-            else:
-                raise NoOptionError(option, section)
-        elif opt in self._sections[section]:
-            return self._sections[section][opt]
-        elif opt in self._defaults:
-            return self._defaults[opt]
-        else:
-            raise NoOptionError(option, section)
-
-    def items(self, section):
-        try:
-            d2 = self._sections[section]
-        except KeyError:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-            d2 = self._dict()
-        d = self._defaults.copy()
-        d.update(d2)
-        if "__name__" in d:
-            del d["__name__"]
-        return d.items()
-
-    def _get(self, section, conv, option):
-        return conv(self.get(section, option))
-
-    def getint(self, section, option):
-        return self._get(section, int, option)
-
-    def getfloat(self, section, option):
-        return self._get(section, float, option)
-
-    _boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
-                       '0': False, 'no': False, 'false': False, 'off': False}
-
-    def getboolean(self, section, option):
-        v = self.get(section, option)
-        if v.lower() not in self._boolean_states:
-            raise ValueError, 'Not a boolean: %s' % v
-        return self._boolean_states[v.lower()]
-
-    def optionxform(self, optionstr):
-        return optionstr.lower()
-
-    def has_option(self, section, option):
-        """Check for the existence of a given option in a given section."""
-        if not section or section == DEFAULTSECT:
-            option = self.optionxform(option)
-            return option in self._defaults
-        elif section not in self._sections:
-            return False
-        else:
-            option = self.optionxform(option)
-            return (option in self._sections[section]
-                    or option in self._defaults)
-
-    def set(self, section, option, value=None):
-        """Set an option."""
-        if not section or section == DEFAULTSECT:
-            sectdict = self._defaults
-        else:
-            try:
-                sectdict = self._sections[section]
-            except KeyError:
-                raise NoSectionError(section)
-        sectdict[self.optionxform(option)] = value
-
-    def write(self, fp):
-        """Write an .ini-format representation of the configuration state."""
-        if self._defaults:
-            fp.write("[%s]\n" % DEFAULTSECT)
-            for (key, value) in self._defaults.items():
-                fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
-            fp.write("\n")
-        for section in self._sections:
-            fp.write("[%s]\n" % section)
-            for (key, value) in self._sections[section].items():
-                if key == "__name__":
-                    continue
-                if (value is not None) or (self._optcre == self.OPTCRE):
-                    key = " = ".join((key, str(value).replace('\n', '\n\t')))
-                fp.write("%s\n" % (key))
-            fp.write("\n")
-
-    def remove_option(self, section, option):
-        """Remove an option."""
-        if not section or section == DEFAULTSECT:
-            sectdict = self._defaults
-        else:
-            try:
-                sectdict = self._sections[section]
-            except KeyError:
-                raise NoSectionError(section)
-        option = self.optionxform(option)
-        existed = option in sectdict
-        if existed:
-            del sectdict[option]
-        return existed
-
-    def remove_section(self, section):
-        """Remove a file section."""
-        existed = section in self._sections
-        if existed:
-            del self._sections[section]
-        return existed
-
-    #
-    # Regular expressions for parsing section headers and options.
-    #
-    SECTCRE = re.compile(
-        r'\['                                 # [
-        r'(?P<header>[^]]+)'                  # very permissive!
-        r'\]'                                 # ]
-        )
-    OPTCRE = re.compile(
-        r'(?P<option>[^:=\s][^:=]*)'          # very permissive!
-        r'\s*(?P<vi>[:=])\s*'                 # any number of space/tab,
-                                              # followed by separator
-                                              # (either : or =), followed
-                                              # by any # space/tab
-        r'(?P<value>.*)$'                     # everything up to eol
-        )
-    OPTCRE_NV = re.compile(
-        r'(?P<option>[^:=\s][^:=]*)'          # very permissive!
-        r'\s*(?:'                             # any number of space/tab,
-        r'(?P<vi>[:=])\s*'                    # optionally followed by
-                                              # separator (either : or
-                                              # =), followed by any #
-                                              # space/tab
-        r'(?P<value>.*))?$'                   # everything up to eol
-        )
-
-    def _read(self, fp, fpname):
-        """Parse a sectioned setup file.
-
-        The sections in setup file contains a title line at the top,
-        indicated by a name in square brackets (`[]'), plus key/value
-        options lines, indicated by `name: value' format lines.
-        Continuations are represented by an embedded newline then
-        leading whitespace.  Blank lines, lines beginning with a '#',
-        and just about everything else are ignored.
-        """
-
-        comment_store = {}
-        cursect = None                        # None, or a dictionary
-        optname = None
-        lineno = 0
-        e = None                              # None, or an exception
-        while True:
-            line = fp.readline()
-            if not line:
-                break
-            lineno = lineno + 1
-            # comment or blank line?
-            if line.strip() == '' :
-                continue
-            ### store comments for doc purposes
-            ### Deal with cases of sections and options being there or not
-            if line[0] in '#;' and cursect is not None:
-                if optname is None:
-                    comment_store.setdefault(cursect['__name__'] +
-                                             "::" + "global",[]).append(line)
-                else:
-                    comment_store.setdefault(cursect['__name__'] +
-                                             "::" + optname,[]).append(line)
-                continue
-            elif line[0] in '#;' and cursect is None:
-                comment_store.setdefault("global" +
-                                         "::" + optname,[]).append(line)
-                continue
-
-            if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
-                # no leading whitespace
-                continue
-            # continuation line?
-            if line[0].isspace() and cursect is not None and optname:
-                value = line.strip()
-                if value:
-                    cursect[optname].append(value)
-            # a section header or option header?
-            else:
-                # is it a section header?
-                mo = self.SECTCRE.match(line)
-                if mo:
-                    sectname = mo.group('header')
-                    if sectname in self._sections:
-                        cursect = self._sections[sectname]
-                    elif sectname == DEFAULTSECT:
-                        cursect = self._defaults
-                    else:
-                        cursect = self._dict()
-                        cursect['__name__'] = sectname
-                        self._sections[sectname] = cursect
-                    # So sections can't start with a continuation line
-                    optname = None
-                # no section header in the file?
-                elif cursect is None:
-                    raise MissingSectionHeaderError(fpname, lineno, line)
-                # an option line?
-                else:
-                    mo = self._optcre.match(line)
-                    if mo:
-                        optname, vi, optval = mo.group('option', 'vi', 'value')
-                        optname = self.optionxform(optname.rstrip())
-                        # This check is fine because the OPTCRE cannot
-                        # match if it would set optval to None
-                        if optval is not None:
-                            if vi in ('=', ':') and ';' in optval:
-                                # ';' is a comment delimiter only if it follows
-                                # a spacing character
-                                pos = optval.find(';')
-                                if pos != -1 and optval[pos-1].isspace():
-                                    optval = optval[:pos]
-                            optval = optval.strip()
-                            # allow empty values
-                            if optval == '""':
-                                optval = ''
-                            cursect[optname] = [optval]
-                        else:
-                            # valueless option handling
-                            cursect[optname] = optval
-                    else:
-                        # a non-fatal parsing error occurred.  set up the
-                        # exception but keep going. the exception will be
-                        # raised at the end of the file and will contain a
-                        # list of all bogus lines
-                        if not e:
-                            e = ParsingError(fpname)
-                        e.append(lineno, repr(line))
-        # if any parsing errors occurred, raise an exception
-        if e:
-            raise e
-
-        # join the multi-line values collected while reading
-        all_sections = [self._defaults]
-        all_sections.extend(self._sections.values())
-        for options in all_sections:
-            for name, val in options.items():
-                if isinstance(val, list):
-                    options[name] = '\n'.join(val)
-        self.comment_store = comment_store
-
-    def ini_as_rst(self):
-        """trivial helper function to putput comment_stroe as rest
-
-        .. todo:: write actual doctests with string input
-        >> p = ConfigParser2.SafeConfigParser()
-        >> p.read(f)
-        ['/usr/home/pbrian/src/public/configparser2/example.ini']
-        >> open("/tmp/foo.rst", "w").write(p.ini_as_rst())
-
-        """
-        outstr = ".. rst version of ini file\n\n"
-        _cursectname = None
-        for item in sorted(self.comment_store.keys()):
-            _sect, _opt = item.split("::")
-            if _sect != _cursectname:
-                outstr += "\n%s\n%s\n" % (_sect, "-"* len(_sect))
-                _cursectname = _sect
-            txt = " ".join(self.comment_store[item])
-            txt = txt.replace("#", "").replace(";","")
-            outstr += ":%s: %s" % (_opt, txt)
-        return outstr
-
-
-
-import UserDict as _UserDict
-
-class _Chainmap(_UserDict.DictMixin):
-    """Combine multiple mappings for successive lookups.
-
-    For example, to emulate Python's normal lookup sequence:
-
-        import __builtin__
-        pylookup = _Chainmap(locals(), globals(), vars(__builtin__))
-    """
-
-    def __init__(self, *maps):
-        self._maps = maps
-
-    def __getitem__(self, key):
-        for mapping in self._maps:
-            try:
-                return mapping[key]
-            except KeyError:
-                pass
-        raise KeyError(key)
-
-    def keys(self):
-        result = []
-        seen = set()
-        for mapping in self._maps:
-            for key in mapping:
-                if key not in seen:
-                    result.append(key)
-                    seen.add(key)
-        return result
-
-class ConfigParser(RawConfigParser):
-
-    def get(self, section, option, raw=False, vars=None):
-        """Get an option value for a given section.
-
-        If `vars' is provided, it must be a dictionary. The option is looked up
-        in `vars' (if provided), `section', and in `defaults' in that order.
-
-        All % interpolations are expanded in the return values, unless the
-        optional argument `raw' is true. Values for interpolation keys are
-        looked up in the same manner as the option.
-
-        The section DEFAULT is special.
-        """
-        sectiondict = {}
-        try:
-            sectiondict = self._sections[section]
-        except KeyError:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-        # Update with the entry specific variables
-        vardict = {}
-        if vars:
-            for key, value in vars.items():
-                vardict[self.optionxform(key)] = value
-        d = _Chainmap(vardict, sectiondict, self._defaults)
-        option = self.optionxform(option)
-        try:
-            value = d[option]
-        except KeyError:
-            raise NoOptionError(option, section)
-
-        if raw or value is None:
-            return value
-        else:
-            return self._interpolate(section, option, value, d)
- """ - d = self._defaults.copy() - try: - d.update(self._sections[section]) - except KeyError: - if section != DEFAULTSECT: - raise NoSectionError(section) - # Update with the entry specific variables - if vars: - for key, value in vars.items(): - d[self.optionxform(key)] = value - options = d.keys() - if "__name__" in options: - options.remove("__name__") - if raw: - return [(option, d[option]) - for option in options] - else: - return [(option, self._interpolate(section, option, d[option], d)) - for option in options] - - def _interpolate(self, section, option, rawval, vars): - # do the string interpolation - value = rawval - depth = MAX_INTERPOLATION_DEPTH - while depth: # Loop through this until it's done - depth -= 1 - if value and "%(" in value: - value = self._KEYCRE.sub(self._interpolation_replace, value) - try: - value = value % vars - except KeyError, e: - raise InterpolationMissingOptionError( - option, section, rawval, e.args[0]) - else: - break - if value and "%(" in value: - raise InterpolationDepthError(option, section, rawval) - return value - - _KEYCRE = re.compile(r"%\(([^)]*)\)s|.") - - def _interpolation_replace(self, match): - s = match.group(1) - if s is None: - return match.group() - else: - return "%%(%s)s" % self.optionxform(s) - - -class SafeConfigParser(ConfigParser): - - def _interpolate(self, section, option, rawval, vars): - # do the string interpolation - L = [] - self._interpolate_some(option, L, rawval, section, vars, 1) - return ''.join(L) - - _interpvar_re = re.compile(r"%\(([^)]+)\)s") - - def _interpolate_some(self, option, accum, rest, section, map, depth): - if depth > MAX_INTERPOLATION_DEPTH: - raise InterpolationDepthError(option, section, rest) - while rest: - p = rest.find("%") - if p < 0: - accum.append(rest) - return - if p > 0: - accum.append(rest[:p]) - rest = rest[p:] - # p is no longer used - c = rest[1:2] - if c == "%": - accum.append("%") - rest = rest[2:] - elif c == "(": - m = self._interpvar_re.match(rest) - if m is None: - raise InterpolationSyntaxError(option, section, - "bad interpolation variable reference %r" % rest) - var = self.optionxform(m.group(1)) - rest = rest[m.end():] - try: - v = map[var] - except KeyError: - raise InterpolationMissingOptionError( - option, section, rest, var) - if "%" in v: - self._interpolate_some(option, accum, v, - section, map, depth + 1) - else: - accum.append(v) - else: - raise InterpolationSyntaxError( - option, section, - "'%%' must be followed by '%%' or '(', found: %r" % (rest,)) - - def set(self, section, option, value=None): - """Set an option. 
-
-    def set(self, section, option, value=None):
-        """Set an option.  Extend ConfigParser.set: check for string values."""
-        # The only legal non-string value if we allow valueless
-        # options is None, so we need to check if the value is a
-        # string if:
-        # - we do not allow valueless options, or
-        # - we allow valueless options but the value is not None
-        if self._optcre is self.OPTCRE or value:
-            if not isinstance(value, basestring):
-                raise TypeError("option values must be strings")
-        if value is not None:
-            # check for bad percent signs:
-            # first, replace all "good" interpolations
-            tmp_value = value.replace('%%', '')
-            tmp_value = self._interpvar_re.sub('', tmp_value)
-            # then, check if there's a lone percent sign left
-            if '%' in tmp_value:
-                raise ValueError("invalid interpolation syntax in %r at "
-                                 "position %d" % (value, tmp_value.find('%')))
-        ConfigParser.set(self, section, option, value)
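The deleted `ConfigParser2` module was a Python 2-only fork of the stdlib parser (with comment-preservation extras); `init.py` now imports the `configparser2` backport instead, as shown earlier in this diff. A sketch of the `%(name)s` interpolation both parsers implement, assuming the backport mirrors the stdlib interface as its name suggests:

```python
from backports import configparser2

cfg = configparser2.ConfigParser()
cfg.add_section("paths")
cfg.set("paths", "dir", "/opt/bazarr")
cfg.set("paths", "log", "%(dir)s/log")   # expanded late, on demand

print(cfg.get("paths", "log"))           # -> /opt/bazarr/log
```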
diff --git a/libs/bs4/AUTHORS.txt b/libs/bs4/AUTHORS.txt
deleted file mode 100644
index 2ac8fcc8c..000000000
--- a/libs/bs4/AUTHORS.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-Behold, mortal, the origins of Beautiful Soup...
-================================================
-
-Leonard Richardson is the primary programmer.
-
-Aaron DeVore is awesome.
-
-Mark Pilgrim provided the encoding detection code that forms the base
-of UnicodeDammit.
-
-Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
-Soup 4 working under Python 3.
-
-Simon Willison wrote soupselect, which was used to make Beautiful Soup
-support CSS selectors.
-
-Sam Ruby helped with a lot of edge cases.
-
-Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
-work in solving the nestable tags conundrum.
-
-An incomplete list of people have contributed patches to Beautiful
-Soup:
-
- Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
- Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
- Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
- Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
- Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
- Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
- Webster, Paul Wright, Danny Yoo
-
-An incomplete list of people who made suggestions or found bugs or
-found ways to break Beautiful Soup:
-
- Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
- Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
- Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
- warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
- Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
- Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
- Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
- Sousa Rocha, Yichun Wei, Per Vognsen
diff --git a/libs/bs4/COPYING.txt b/libs/bs4/COPYING.txt
deleted file mode 100644
index b91188869..000000000
--- a/libs/bs4/COPYING.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Beautiful Soup is made available under the MIT license:
-
- Copyright (c) 2004-2015 Leonard Richardson
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
-Beautiful Soup incorporates code from the html5lib library, which is
-also made available under the MIT license. Copyright (c) 2006-2013
-James Graham and other contributors
diff --git a/libs/bs4/NEWS.txt b/libs/bs4/NEWS.txt
deleted file mode 100644
index 3726c570a..000000000
--- a/libs/bs4/NEWS.txt
+++ /dev/null
@@ -1,1190 +0,0 @@
-= 4.4.1 (20150928) =
-
-* Fixed a bug that deranged the tree when part of it was
-  removed. Thanks to Eric Weiser for the patch and John Wiseman for a
-  test. [bug=1481520]
-
-* Fixed a parse bug with the html5lib tree-builder. Thanks to Roel
-  Kramer for the patch. [bug=1483781]
-
-* Improved the implementation of CSS selector grouping. Thanks to
-  Orangain for the patch. [bug=1484543]
-
-* Fixed the test_detect_utf8 test so that it works when chardet is
-  installed. [bug=1471359]
-
-* Corrected the output of Declaration objects. [bug=1477847]
-
-
-= 4.4.0 (20150703) =
-
-Especially important changes:
-
-* Added a warning when you instantiate a BeautifulSoup object without
-  explicitly naming a parser. [bug=1398866]
-
-* __repr__ now returns an ASCII bytestring in Python 2, and a Unicode
-  string in Python 3, instead of a UTF8-encoded bytestring in both
-  versions. In Python 3, __str__ now returns a Unicode string instead
-  of a bytestring. [bug=1420131]
-
-* The `text` argument to the find_* methods is now called `string`,
-  which is more accurate. `text` still works, but `string` is the
-  argument described in the documentation. `text` may eventually
-  change its meaning, but not for a very long time. [bug=1366856]
-
-* Changed the way soup objects work under copy.copy(). Copying a
-  NavigableString or a Tag will give you a new NavigableString that's
-  equal to the old one but not connected to the parse tree. Patch by
-  Martijn Peters. [bug=1307490]
-
-* Started using a standard MIT license. [bug=1294662]
-
-* Added a Chinese translation of the documentation by Delong .w.
-
-New features:
-
-* Introduced the select_one() method, which uses a CSS selector but
-  only returns the first match, instead of a list of
-  matches. [bug=1349367]
-
-* You can now create a Tag object without specifying a
-  TreeBuilder. Patch by Martijn Pieters. [bug=1307471]
-
-* You can now create a NavigableString or a subclass just by invoking
-  the constructor. [bug=1294315]
-
-* Added an `exclude_encodings` argument to UnicodeDammit and to the
-  Beautiful Soup constructor, which lets you prohibit the detection of
-  an encoding that you know is wrong. [bug=1469408]
-
-* The select() method now supports selector grouping. Patch by
-  Francisco Canas [bug=1191917]
-
-Bug fixes:
-
-* Fixed yet another problem that caused the html5lib tree builder to
-  create a disconnected parse tree. [bug=1237763]
-
-* Force object_was_parsed() to keep the tree intact even when an element
-  from later in the document is moved into place. [bug=1430633]
[bug=1430633] - -* Fixed yet another bug that caused a disconnected tree when html5lib - copied an element from one part of the tree to another. [bug=1270611] - -* Fixed a bug where Element.extract() could create an infinite loop in - the remaining tree. - -* The select() method can now find tags whose names contain - dashes. Patch by Francisco Canas. [bug=1276211] - -* The select() method can now find tags with attributes whose names - contain dashes. Patch by Marek Kapolka. [bug=1304007] - -* Improved the lxml tree builder's handling of processing - instructions. [bug=1294645] - -* Restored the helpful syntax error that happens when you try to - import the Python 2 edition of Beautiful Soup under Python - 3. [bug=1213387] - -* In Python 3.4 and above, set the new convert_charrefs argument to - the html.parser constructor to avoid a warning and future - failures. Patch by Stefano Revera. [bug=1375721] - -* The warning when you pass in a filename or URL as markup will now be - displayed correctly even if the filename or URL is a Unicode - string. [bug=1268888] - -* If the initial <html> tag contains a CDATA list attribute such as - 'class', the html5lib tree builder will now turn its value into a - list, as it would with any other tag. [bug=1296481] - -* Fixed an import error in Python 3.5 caused by the removal of the - HTMLParseError class. [bug=1420063] - -* Improved docstring for encode_contents() and - decode_contents(). [bug=1441543] - -* Fixed a crash in Unicode, Dammit's encoding detector when the name - of the encoding itself contained invalid bytes. [bug=1360913] - -* Improved the exception raised when you call .unwrap() or - .replace_with() on an element that's not attached to a tree. - -* Raise a NotImplementedError whenever an unsupported CSS pseudoclass - is used in select(). Previously some cases did not result in a - NotImplementedError. - -* It's now possible to pickle a BeautifulSoup object no matter which - tree builder was used to create it. However, the only tree builder - that survives the pickling process is the HTMLParserTreeBuilder - ('html.parser'). If you unpickle a BeautifulSoup object created with - some other tree builder, soup.builder will be None. [bug=1231545] - -= 4.3.2 (20131002) = - -* Fixed a bug in which short Unicode input was improperly encoded to - ASCII when checking whether or not it was the name of a file on - disk. [bug=1227016] - -* Fixed a crash when a short input contains data not valid in - filenames. [bug=1232604] - -* Fixed a bug that caused Unicode data put into UnicodeDammit to - return None instead of the original data. [bug=1214983] - -* Combined two tests to stop a spurious test failure when tests are - run by nosetests. [bug=1212445] - -= 4.3.1 (20130815) = - -* Fixed yet another problem with the html5lib tree builder, caused by - html5lib's tendency to rearrange the tree during - parsing. [bug=1189267] - -* Fixed a bug that caused the optimized version of find_all() to - return nothing. [bug=1212655] - -= 4.3.0 (20130812) = - -* Instead of converting incoming data to Unicode and feeding it to the - lxml tree builder in chunks, Beautiful Soup now makes successive - guesses at the encoding of the incoming data, and tells lxml to - parse the data as that encoding. Giving lxml more control over the - parsing process improves performance and avoids a number of bugs and - issues with the lxml parser which had previously required elaborate - workarounds: - - - An issue in which lxml refuses to parse Unicode strings on some - systems. 
[bug=1180527] - - - A returning bug that truncated documents longer than a (very - small) size. [bug=963880] - - - A returning bug in which extra spaces were added to a document if - the document defined a charset other than UTF-8. [bug=972466] - - This required a major overhaul of the tree builder architecture. If - you wrote your own tree builder and didn't tell me, you'll need to - modify your prepare_markup() method. - -* The UnicodeDammit code that makes guesses at encodings has been - split into its own class, EncodingDetector. A lot of apparently - redundant code has been removed from Unicode, Dammit, and some - undocumented features have also been removed. - -* Beautiful Soup will issue a warning if instead of markup you pass it - a URL or the name of a file on disk (a common beginner's mistake). - -* A number of optimizations improve the performance of the lxml tree - builder by about 33%, the html.parser tree builder by about 20%, and - the html5lib tree builder by about 15%. - -* All find_all calls should now return a ResultSet object. Patch by - Aaron DeVore. [bug=1194034] - -= 4.2.1 (20130531) = - -* The default XML formatter will now replace ampersands even if they - appear to be part of entities. That is, "<" will become - "&lt;". The old code was left over from Beautiful Soup 3, which - didn't always turn entities into Unicode characters. - - If you really want the old behavior (maybe because you add new - strings to the tree, those strings include entities, and you want - the formatter to leave them alone on output), it can be found in - EntitySubstitution.substitute_xml_containing_entities(). [bug=1182183] - -* Gave new_string() the ability to create subclasses of - NavigableString. [bug=1181986] - -* Fixed another bug by which the html5lib tree builder could create a - disconnected tree. [bug=1182089] - -* The .previous_element of a BeautifulSoup object is now always None, - not the last element to be parsed. [bug=1182089] - -* Fixed test failures when lxml is not installed. [bug=1181589] - -* html5lib now supports Python 3. Fixed some Python 2-specific - code in the html5lib test suite. [bug=1181624] - -* The html.parser treebuilder can now handle numeric attributes in - text when the hexidecimal name of the attribute starts with a - capital X. Patch by Tim Shirley. [bug=1186242] - -= 4.2.0 (20130514) = - -* The Tag.select() method now supports a much wider variety of CSS - selectors. - - - Added support for the adjacent sibling combinator (+) and the - general sibling combinator (~). Tests by "liquider". [bug=1082144] - - - The combinators (>, +, and ~) can now combine with any supported - selector, not just one that selects based on tag name. - - - Added limited support for the "nth-of-type" pseudo-class. Code - by Sven Slootweg. [bug=1109952] - -* The BeautifulSoup class is now aliased to "_s" and "_soup", making - it quicker to type the import statement in an interactive session: - - from bs4 import _s - or - from bs4 import _soup - - The alias may change in the future, so don't use this in code you're - going to run more than once. - -* Added the 'diagnose' submodule, which includes several useful - functions for reporting problems and doing tech support. - - - diagnose(data) tries the given markup on every installed parser, - reporting exceptions and displaying successes. If a parser is not - installed, diagnose() mentions this fact. 
- - - lxml_trace(data, html=True) runs the given markup through lxml's - XML parser or HTML parser, and prints out the parser events as - they happen. This helps you quickly determine whether a given - problem occurs in lxml code or Beautiful Soup code. - - - htmlparser_trace(data) is the same thing, but for Python's - built-in HTMLParser class. - -* In an HTML document, the contents of a <script> or <style> tag will - no longer undergo entity substitution by default. XML documents work - the same way they did before. [bug=1085953] - -* Methods like get_text() and properties like .strings now only give - you strings that are visible in the document--no comments or - processing commands. [bug=1050164] - -* The prettify() method now leaves the contents of <pre> tags - alone. [bug=1095654] - -* Fix a bug in the html5lib treebuilder which sometimes created - disconnected trees. [bug=1039527] - -* Fix a bug in the lxml treebuilder which crashed when a tag included - an attribute from the predefined "xml:" namespace. [bug=1065617] - -* Fix a bug by which keyword arguments to find_parent() were not - being passed on. [bug=1126734] - -* Stop a crash when unwisely messing with a tag that's been - decomposed. [bug=1097699] - -* Now that lxml's segfault on invalid doctype has been fixed, fixed a - corresponding problem on the Beautiful Soup end that was previously - invisible. [bug=984936] - -* Fixed an exception when an overspecified CSS selector didn't match - anything. Code by Stefaan Lippens. [bug=1168167] - -= 4.1.3 (20120820) = - -* Skipped a test under Python 2.6 and Python 3.1 to avoid a spurious - test failure caused by the lousy HTMLParser in those - versions. [bug=1038503] - -* Raise a more specific error (FeatureNotFound) when a requested - parser or parser feature is not installed. Raise NotImplementedError - instead of ValueError when the user calls insert_before() or - insert_after() on the BeautifulSoup object itself. Patch by Aaron - Devore. [bug=1038301] - -= 4.1.2 (20120817) = - -* As per PEP-8, allow searching by CSS class using the 'class_' - keyword argument. [bug=1037624] - -* Display namespace prefixes for namespaced attribute names, instead of - the fully-qualified names given by the lxml parser. [bug=1037597] - -* Fixed a crash on encoding when an attribute name contained - non-ASCII characters. - -* When sniffing encodings, if the cchardet library is installed, - Beautiful Soup uses it instead of chardet. cchardet is much - faster. [bug=1020748] - -* Use logging.warning() instead of warning.warn() to notify the user - that characters were replaced with REPLACEMENT - CHARACTER. [bug=1013862] - -= 4.1.1 (20120703) = - -* Fixed an html5lib tree builder crash which happened when html5lib - moved a tag with a multivalued attribute from one part of the tree - to another. [bug=1019603] - -* Correctly display closing tags with an XML namespace declared. Patch - by Andreas Kostyrka. [bug=1019635] - -* Fixed a typo that made parsing significantly slower than it should - have been, and also waited too long to close tags with XML - namespaces. [bug=1020268] - -* get_text() now returns an empty Unicode string if there is no text, - rather than an empty bytestring. [bug=1020387] - -= 4.1.0 (20120529) = - -* Added experimental support for fixing Windows-1252 characters - embedded in UTF-8 documents. (UnicodeDammit.detwingle()) - -* Fixed the handling of " with the built-in parser. 
[bug=993871] - -* Comments, processing instructions, document type declarations, and - markup declarations are now treated as preformatted strings, the way - CData blocks are. [bug=1001025] - -* Fixed a bug with the lxml treebuilder that prevented the user from - adding attributes to a tag that didn't originally have - attributes. [bug=1002378] Thanks to Oliver Beattie for the patch. - -* Fixed some edge-case bugs having to do with inserting an element - into a tag it's already inside, and replacing one of a tag's - children with another. [bug=997529] - -* Added the ability to search for attribute values specified in UTF-8. [bug=1003974] - - This caused a major refactoring of the search code. All the tests - pass, but it's possible that some searches will behave differently. - -= 4.0.5 (20120427) = - -* Added a new method, wrap(), which wraps an element in a tag. - -* Renamed replace_with_children() to unwrap(), which is easier to - understand and also the jQuery name of the function. - -* Made encoding substitution in <meta> tags completely transparent (no - more %SOUP-ENCODING%). - -* Fixed a bug in decoding data that contained a byte-order mark, such - as data encoded in UTF-16LE. [bug=988980] - -* Fixed a bug that made the HTMLParser treebuilder generate XML - definitions ending with two question marks instead of - one. [bug=984258] - -* Upon document generation, CData objects are no longer run through - the formatter. [bug=988905] - -* The test suite now passes when lxml is not installed, whether or not - html5lib is installed. [bug=987004] - -* Print a warning on HTMLParseErrors to let people know they should - install a better parser library. - -= 4.0.4 (20120416) = - -* Fixed a bug that sometimes created disconnected trees. - -* Fixed a bug with the string setter that moved a string around the - tree instead of copying it. [bug=983050] - -* Attribute values are now run through the provided output formatter. - Previously they were always run through the 'minimal' formatter. In - the future I may make it possible to specify different formatters - for attribute values and strings, but for now, consistent behavior - is better than inconsistent behavior. [bug=980237] - -* Added the missing renderContents method from Beautiful Soup 3. Also - added an encode_contents() method to go along with decode_contents(). - -* Give a more useful error when the user tries to run the Python 2 - version of BS under Python 3. - -* UnicodeDammit can now convert Microsoft smart quotes to ASCII with - UnicodeDammit(markup, smart_quotes_to="ascii"). - -= 4.0.3 (20120403) = - -* Fixed a typo that caused some versions of Python 3 to convert the - Beautiful Soup codebase incorrectly. - -* Got rid of the 4.0.2 workaround for HTML documents--it was - unnecessary and the workaround was triggering a (possibly different, - but related) bug in lxml. [bug=972466] - -= 4.0.2 (20120326) = - -* Worked around a possible bug in lxml that prevents non-tiny XML - documents from being parsed. [bug=963880, bug=963936] - -* Fixed a bug where specifying `text` while also searching for a tag - only worked if `text` wanted an exact string match. [bug=955942] - -= 4.0.1 (20120314) = - -* This is the first official release of Beautiful Soup 4. There is no - 4.0.0 release, to eliminate any possibility that packaging software - might treat "4.0.0" as being an earlier version than "4.0.0b10". 
- -* Brought BS up to date with the latest release of soupselect, adding - CSS selector support for direct descendant matches and multiple CSS - class matches. - -= 4.0.0b10 (20120302) = - -* Added support for simple CSS selectors, taken from the soupselect project. - -* Fixed a crash when using html5lib. [bug=943246] - -* In HTML5-style <meta charset="foo"> tags, the value of the "charset" - attribute is now replaced with the appropriate encoding on - output. [bug=942714] - -* Fixed a bug that caused calling a tag to sometimes call find_all() - with the wrong arguments. [bug=944426] - -* For backwards compatibility, brought back the BeautifulStoneSoup - class as a deprecated wrapper around BeautifulSoup. - -= 4.0.0b9 (20120228) = - -* Fixed the string representation of DOCTYPEs that have both a public - ID and a system ID. - -* Fixed the generated XML declaration. - -* Renamed Tag.nsprefix to Tag.prefix, for consistency with - NamespacedAttribute. - -* Fixed a test failure that occured on Python 3.x when chardet was - installed. - -* Made prettify() return Unicode by default, so it will look nice on - Python 3 when passed into print(). - -= 4.0.0b8 (20120224) = - -* All tree builders now preserve namespace information in the - documents they parse. If you use the html5lib parser or lxml's XML - parser, you can access the namespace URL for a tag as tag.namespace. - - However, there is no special support for namespace-oriented - searching or tree manipulation. When you search the tree, you need - to use namespace prefixes exactly as they're used in the original - document. - -* The string representation of a DOCTYPE always ends in a newline. - -* Issue a warning if the user tries to use a SoupStrainer in - conjunction with the html5lib tree builder, which doesn't support - them. - -= 4.0.0b7 (20120223) = - -* Upon decoding to string, any characters that can't be represented in - your chosen encoding will be converted into numeric XML entity - references. - -* Issue a warning if characters were replaced with REPLACEMENT - CHARACTER during Unicode conversion. - -* Restored compatibility with Python 2.6. - -* The install process no longer installs docs or auxillary text files. - -* It's now possible to deepcopy a BeautifulSoup object created with - Python's built-in HTML parser. - -* About 100 unit tests that "test" the behavior of various parsers on - invalid markup have been removed. Legitimate changes to those - parsers caused these tests to fail, indicating that perhaps - Beautiful Soup should not test the behavior of foreign - libraries. - - The problematic unit tests have been reformulated as informational - comparisons generated by the script - scripts/demonstrate_parser_differences.py. - - This makes Beautiful Soup compatible with html5lib version 0.95 and - future versions of HTMLParser. - -= 4.0.0b6 (20120216) = - -* Multi-valued attributes like "class" always have a list of values, - even if there's only one value in the list. - -* Added a number of multi-valued attributes defined in HTML5. - -* Stopped generating a space before the slash that closes an - empty-element tag. This may come back if I add a special XHTML mode - (http://www.w3.org/TR/xhtml1/#C_2), but right now it's pretty - useless. - -* Passing text along with tag-specific arguments to a find* method: - - find("a", text="Click here") - - will find tags that contain the given text as their - .string. Previously, the tag-specific arguments were ignored and - only strings were searched. 
- -* Fixed a bug that caused the html5lib tree builder to build a - partially disconnected tree. Generally cleaned up the html5lib tree - builder. - -* If you restrict a multi-valued attribute like "class" to a string - that contains spaces, Beautiful Soup will only consider it a match - if the values correspond to that specific string. - -= 4.0.0b5 (20120209) = - -* Rationalized Beautiful Soup's treatment of CSS class. A tag - belonging to multiple CSS classes is treated as having a list of - values for the 'class' attribute. Searching for a CSS class will - match *any* of the CSS classes. - - This actually affects all attributes that the HTML standard defines - as taking multiple values (class, rel, rev, archive, accept-charset, - and headers), but 'class' is by far the most common. [bug=41034] - -* If you pass anything other than a dictionary as the second argument - to one of the find* methods, it'll assume you want to use that - object to search against a tag's CSS classes. Previously this only - worked if you passed in a string. - -* Fixed a bug that caused a crash when you passed a dictionary as an - attribute value (possibly because you mistyped "attrs"). [bug=842419] - -* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags - like <meta charset="utf-8" />. [bug=837268] - -* If Unicode, Dammit can't figure out a consistent encoding for a - page, it will try each of its guesses again, with errors="replace" - instead of errors="strict". This may mean that some data gets - replaced with REPLACEMENT CHARACTER, but at least most of it will - get turned into Unicode. [bug=754903] - -* Patched over a bug in html5lib (?) that was crashing Beautiful Soup - on certain kinds of markup. [bug=838800] - -* Fixed a bug that wrecked the tree if you replaced an element with an - empty string. [bug=728697] - -* Improved Unicode, Dammit's behavior when you give it Unicode to - begin with. - -= 4.0.0b4 (20120208) = - -* Added BeautifulSoup.new_string() to go along with BeautifulSoup.new_tag() - -* BeautifulSoup.new_tag() will follow the rules of whatever - tree-builder was used to create the original BeautifulSoup object. A - new <p> tag will look like "<p />" if the soup object was created to - parse XML, but it will look like "<p></p>" if the soup object was - created to parse HTML. - -* We pass in strict=False to html.parser on Python 3, greatly - improving html.parser's ability to handle bad HTML. - -* We also monkeypatch a serious bug in html.parser that made - strict=False disastrous on Python 3.2.2. - -* Replaced the "substitute_html_entities" argument with the - more general "formatter" argument. - -* Bare ampersands and angle brackets are always converted to XML - entities unless the user prevents it. - -* Added PageElement.insert_before() and PageElement.insert_after(), - which let you put an element into the parse tree with respect to - some other element. - -* Raise an exception when the user tries to do something nonsensical - like insert a tag into itself. - - -= 4.0.0b3 (20120203) = - -Beautiful Soup 4 is a nearly-complete rewrite that removes Beautiful -Soup's custom HTML parser in favor of a system that lets you write a -little glue code and plug in any HTML or XML parser you want. - -Beautiful Soup 4.0 comes with glue code for four parsers: - - * Python's standard HTMLParser (html.parser in Python 3) - * lxml's HTML and XML parsers - * html5lib's HTML parser - -HTMLParser is the default, but I recommend you install lxml if you -can. 
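To make the class-handling and text-matching notes above concrete, here is a minimal Python 3 sketch against the modern bs4 API (using the stdlib "html.parser" backend so nothing extra needs to be installed; printed values are indicative):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class="menu main">Click here</p>', "html.parser")

    # Per the 4.0.0b5/b6 notes, 'class' is multi-valued: its value is a
    # list, and searching for one CSS class matches tags carrying several.
    print(soup.p["class"])                    # ['menu', 'main']
    print(soup.find_all("p", "main"))         # a non-dict second argument
                                              # is matched against CSS classes
    print(soup.find("p", text="Click here"))  # text= narrows by .string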
- -For complete documentation, see the Sphinx documentation in -bs4/doc/source/. What follows is a summary of the changes from -Beautiful Soup 3. - -=== The module name has changed === - -Previously you imported the BeautifulSoup class from a module also -called BeautifulSoup. To save keystrokes and make it clear which -version of the API is in use, the module is now called 'bs4': - - >>> from bs4 import BeautifulSoup - -=== It works with Python 3 === - -Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was -so bad that it barely worked at all. Beautiful Soup 4 works with -Python 3, and since its parser is pluggable, you don't sacrifice -quality. - -Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 -support to the finish line. Ezio Melotti is also to thank for greatly -improving the HTML parser that comes with Python 3.2. - -=== CDATA sections are normal text, if they're understood at all. === - -Currently, the lxml and html5lib HTML parsers ignore CDATA sections in -markup: - - <p><![CDATA[foo]]></p> => <p></p> - -A future version of html5lib will turn CDATA sections into text nodes, -but only within tags like <svg> and <math>: - - <svg><![CDATA[foo]]></svg> => <p>foo</p> - -The default XML parser (which uses lxml behind the scenes) turns CDATA -sections into ordinary text elements: - - <p><![CDATA[foo]]></p> => <p>foo</p> - -In theory it's possible to preserve the CDATA sections when using the -XML parser, but I don't see how to get it to work in practice. - -=== Miscellaneous other stuff === - -If the BeautifulSoup instance has .is_xml set to True, an appropriate -XML declaration will be emitted when the tree is transformed into a -string: - - <?xml version="1.0" encoding="utf-8"> - <markup> - ... - </markup> - -The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree -builders set it to False. If you want to parse XHTML with an HTML -parser, you can set it manually. - - -= 3.2.0 = - -The 3.1 series wasn't very useful, so I renamed the 3.0 series to 3.2 -to make it obvious which one you should use. - -= 3.1.0 = - -A hybrid version that supports 2.4 and can be automatically converted -to run under Python 3.0. There are three backwards-incompatible -changes you should be aware of, but no new features or deliberate -behavior changes. - -1. str() may no longer do what you want. This is because the meaning -of str() inverts between Python 2 and 3; in Python 2 it gives you a -byte string, in Python 3 it gives you a Unicode string. - -The effect of this is that you can't pass an encoding to .__str__ -anymore. Use encode() to get a string and decode() to get Unicode, and -you'll be ready (well, readier) for Python 3. - -2. Beautiful Soup is now based on HTMLParser rather than SGMLParser, -which is gone in Python 3. There's some bad HTML that SGMLParser -handled but HTMLParser doesn't, usually to do with attribute values -that aren't closed or have brackets inside them: - - <a href="foo</a>, </a><a href="bar">baz</a> - <a b="<a>">', '<a b="<a>"></a><a>"></a> - -A later version of Beautiful Soup will allow you to plug in different -parsers to make tradeoffs between speed and the ability to handle bad -HTML. - -3. In Python 3 (but not Python 2), HTMLParser converts entities within -attributes to the corresponding Unicode characters. In Python 2 it's -possible to parse this string and leave the é intact. - - <a href="http://crummy.com?sacré&bleu"> - -In Python 3, the é is always converted to \xe9 during -parsing. 
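The str()/unicode() inversion described in point 1 above is the same hazard this Bazarr commit is contending with; under Beautiful Soup 4 on Python 3 the advice reduces to roughly this sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>caf\u00e9</p>", "html.parser")

    # On Python 3, str() yields text (what unicode() yielded on Python 2)...
    text = str(soup.p)             # '<p>café</p>'
    # ...and encode() is how you ask for bytes in a specific encoding.
    data = soup.p.encode("utf-8")  # b'<p>caf\xc3\xa9</p>'
    print(text, data)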
- - -= 3.0.7a = - -Added an import that makes BS work in Python 2.3. - - -= 3.0.7 = - -Fixed a UnicodeDecodeError when unpickling documents that contain -non-ASCII characters. - -Fixed a TypeError that occured in some circumstances when a tag -contained no text. - -Jump through hoops to avoid the use of chardet, which can be extremely -slow in some circumstances. UTF-8 documents should never trigger the -use of chardet. - -Whitespace is preserved inside <pre> and <textarea> tags that contain -nothing but whitespace. - -Beautiful Soup can now parse a doctype that's scoped to an XML namespace. - - -= 3.0.6 = - -Got rid of a very old debug line that prevented chardet from working. - -Added a Tag.decompose() method that completely disconnects a tree or a -subset of a tree, breaking it up into bite-sized pieces that are -easy for the garbage collecter to collect. - -Tag.extract() now returns the tag that was extracted. - -Tag.findNext() now does something with the keyword arguments you pass -it instead of dropping them on the floor. - -Fixed a Unicode conversion bug. - -Fixed a bug that garbled some <meta> tags when rewriting them. - - -= 3.0.5 = - -Soup objects can now be pickled, and copied with copy.deepcopy. - -Tag.append now works properly on existing BS objects. (It wasn't -originally intended for outside use, but it can be now.) (Giles -Radford) - -Passing in a nonexistent encoding will no longer crash the parser on -Python 2.4 (John Nagle). - -Fixed an underlying bug in SGMLParser that thinks ASCII has 255 -characters instead of 127 (John Nagle). - -Entities are converted more consistently to Unicode characters. - -Entity references in attribute values are now converted to Unicode -characters when appropriate. Numeric entities are always converted, -because SGMLParser always converts them outside of attribute values. - -ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to -XHTML_ENTITIES. - -The regular expression for bare ampersands was too loose. In some -cases ampersands were not being escaped. (Sam Ruby?) - -Non-breaking spaces and other special Unicode space characters are no -longer folded to ASCII spaces. (Robert Leftwich) - -Information inside a TEXTAREA tag is now parsed literally, not as HTML -tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang) - -= 3.0.4 = - -Fixed a bug that crashed Unicode conversion in some cases. - -Fixed a bug that prevented UnicodeDammit from being used as a -general-purpose data scrubber. - -Fixed some unit test failures when running against Python 2.5. - -When considering whether to convert smart quotes, UnicodeDammit now -looks at the original encoding in a case-insensitive way. - -= 3.0.3 (20060606) = - -Beautiful Soup is now usable as a way to clean up invalid XML/HTML (be -sure to pass in an appropriate value for convertEntities, or XML/HTML -entities might stick around that aren't valid in HTML/XML). The result -may not validate, but it should be good enough to not choke a -real-world XML parser. Specifically, the output of a properly -constructed soup object should always be valid as part of an XML -document, but parts may be missing if they were missing in the -original. As always, if the input is valid XML, the output will also -be valid. - -= 3.0.2 (20060602) = - -Previously, Beautiful Soup correctly handled attribute values that -contained embedded quotes (sometimes by escaping), but not other kinds -of XML character. Now, it correctly handles or escapes all special XML -characters in attribute values. 
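Tag.decompose() and Tag.extract(), introduced in 3.0.6 above, remain the two removal primitives in Beautiful Soup 4; a short sketch of the difference:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>keep</p><p>drop</p></div>", "html.parser")

    # extract() detaches a subtree and returns it for reuse elsewhere...
    taken = soup.find_all("p")[1].extract()
    print(taken)     # <p>drop</p>
    print(soup.div)  # <div><p>keep</p></div>

    # ...while decompose() destroys the subtree outright.
    soup.p.decompose()
    print(soup.div)  # <div></div>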
- -I aliased methods to the 2.x names (fetch, find, findText, etc.) for -backwards compatibility purposes. Those names are deprecated and if I -ever do a 4.0 I will remove them. I will, I tell you! - -Fixed a bug where the findAll method wasn't passing along any keyword -arguments. - -When run from the command line, Beautiful Soup now acts as an HTML -pretty-printer, not an XML pretty-printer. - -= 3.0.1 (20060530) = - -Reintroduced the "fetch by CSS class" shortcut. I thought keyword -arguments would replace it, but they don't. You can't call soup('a', -class='foo') because class is a Python keyword. - -If Beautiful Soup encounters a meta tag that declares the encoding, -but a SoupStrainer tells it not to parse that tag, Beautiful Soup will -no longer try to rewrite the meta tag to mention the new -encoding. Basically, this makes SoupStrainers work in real-world -applications instead of crashing the parser. - -= 3.0.0 "Who would not give all else for two p" (20060528) = - -This release is not backward-compatible with previous releases. If -you've got code written with a previous version of the library, go -ahead and keep using it, unless one of the features mentioned here -really makes your life easier. Since the library is self-contained, -you can include an old copy of the library in your old applications, -and use the new version for everything else. - -The documentation has been rewritten and greatly expanded with many -more examples. - -Beautiful Soup autodetects the encoding of a document (or uses the one -you specify), and converts it from its native encoding to -Unicode. Internally, it only deals with Unicode strings. When you -print out the document, it converts to UTF-8 (or another encoding you -specify). [Doc reference] - -It's now easy to make large-scale changes to the parse tree without -screwing up the navigation members. The methods are extract, -replaceWith, and insert. [Doc reference. See also Improving Memory -Usage with extract] - -Passing True in as an attribute value gives you tags that have any -value for that attribute. You don't have to create a regular -expression. Passing None for an attribute value gives you tags that -don't have that attribute at all. - -Tag objects now know whether or not they're self-closing. This avoids -the problem where Beautiful Soup thought that tags like <BR /> were -self-closing even in XML documents. You can customize the self-closing -tags for a parser object by passing them in as a list of -selfClosingTags: you don't have to subclass anymore. - -There's a new built-in parser, MinimalSoup, which has most of -BeautifulSoup's HTML-specific rules, but no tag nesting rules. [Doc -reference] - -You can use a SoupStrainer to tell Beautiful Soup to parse only part -of a document. This saves time and memory, often making Beautiful Soup -about as fast as a custom-built SGMLParser subclass. [Doc reference, -SoupStrainer reference] - -You can (usually) use keyword arguments instead of passing a -dictionary of attributes to a search method. That is, you can replace -soup(args={"id" : "5"}) with soup(id="5"). You can still use args if -(for instance) you need to find an attribute whose name clashes with -the name of an argument to findAll. [Doc reference: **kwargs attrs] - -The method names have changed to the better method names used in -Rubyful Soup. Instead of find methods and fetch methods, there are -only find methods. 
Instead of a scheme where you can't remember which -method finds one element and which one finds them all, we have find -and findAll. In general, if the method name mentions All or a plural -noun (eg. findNextSiblings), then it finds many elements -method. Otherwise, it only finds one element. [Doc reference] - -Some of the argument names have been renamed for clarity. For instance -avoidParserProblems is now parserMassage. - -Beautiful Soup no longer implements a feed method. You need to pass a -string or a filehandle into the soup constructor, not with feed after -the soup has been created. There is still a feed method, but it's the -feed method implemented by SGMLParser and calling it will bypass -Beautiful Soup and cause problems. - -The NavigableText class has been renamed to NavigableString. There is -no NavigableUnicodeString anymore, because every string inside a -Beautiful Soup parse tree is a Unicode string. - -findText and fetchText are gone. Just pass a text argument into find -or findAll. - -Null was more trouble than it was worth, so I got rid of it. Anything -that used to return Null now returns None. - -Special XML constructs like comments and CDATA now have their own -NavigableString subclasses, instead of being treated as oddly-formed -data. If you parse a document that contains CDATA and write it back -out, the CDATA will still be there. - -When you're parsing a document, you can get Beautiful Soup to convert -XML or HTML entities into the corresponding Unicode characters. [Doc -reference] - -= 2.1.1 (20050918) = - -Fixed a serious performance bug in BeautifulStoneSoup which was -causing parsing to be incredibly slow. - -Corrected several entities that were previously being incorrectly -translated from Microsoft smart-quote-like characters. - -Fixed a bug that was breaking text fetch. - -Fixed a bug that crashed the parser when text chunks that look like -HTML tag names showed up within a SCRIPT tag. - -THEAD, TBODY, and TFOOT tags are now nestable within TABLE -tags. Nested tables should parse more sensibly now. - -BASE is now considered a self-closing tag. - -= 2.1.0 "Game, or any other dish?" (20050504) = - -Added a wide variety of new search methods which, given a starting -point inside the tree, follow a particular navigation member (like -nextSibling) over and over again, looking for Tag and NavigableText -objects that match certain criteria. The new methods are findNext, -fetchNext, findPrevious, fetchPrevious, findNextSibling, -fetchNextSiblings, findPreviousSibling, fetchPreviousSiblings, -findParent, and fetchParents. All of these use the same basic code -used by first and fetch, so you can pass your weird ways of matching -things into these methods. - -The fetch method and its derivatives now accept a limit argument. - -You can now pass keyword arguments when calling a Tag object as though -it were a method. - -Fixed a bug that caused all hand-created tags to share a single set of -attributes. - -= 2.0.3 (20050501) = - -Fixed Python 2.2 support for iterators. - -Fixed a bug that gave the wrong representation to tags within quote -tags like <script>. - -Took some code from Mark Pilgrim that treats CDATA declarations as -data instead of ignoring them. - -Beautiful Soup's setup.py will now do an install even if the unit -tests fail. It won't build a source distribution if the unit tests -fail, so I can't release a new version unless they pass. - -= 2.0.2 (20050416) = - -Added the unit tests in a separate module, and packaged it with -distutils. 
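The search conveniences described for 3.0 (keyword arguments instead of an attrs dictionary, and True matching any attribute value) carry straight over to the modern find_all() spelling; a minimal sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a id="5" href="/x">one</a><a>two</a>', "html.parser")

    # soup(id="5")-style keyword arguments replace attrs={"id": "5"}...
    print(soup.find_all("a", id="5"))     # [<a href="/x" id="5">one</a>]
    # ...and True matches any value at all for that attribute.
    print(soup.find_all("a", href=True))  # only the first <a> has an href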
- -Fixed a bug that sometimes caused renderContents() to return a Unicode -string even if there was no Unicode in the original string. - -Added the done() method, which closes all of the parser's open -tags. It gets called automatically when you pass in some text to the -constructor of a parser class; otherwise you must call it yourself. - -Reinstated some backwards compatibility with 1.x versions: referencing -the string member of a NavigableText object returns the NavigableText -object instead of throwing an error. - -= 2.0.1 (20050412) = - -Fixed a bug that caused bad results when you tried to reference a tag -name shorter than 3 characters as a member of a Tag, eg. tag.table.td. - -Made sure all Tags have the 'hidden' attribute so that an attempt to -access tag.hidden doesn't spawn an attempt to find a tag named -'hidden'. - -Fixed a bug in the comparison operator. - -= 2.0.0 "Who cares for fish?" (20050410) - -Beautiful Soup version 1 was very useful but also pretty stupid. I -originally wrote it without noticing any of the problems inherent in -trying to build a parse tree out of ambiguous HTML tags. This version -solves all of those problems to my satisfaction. It also adds many new -clever things to make up for the removal of the stupid things. - -== Parsing == - -The parser logic has been greatly improved, and the BeautifulSoup -class should much more reliably yield a parse tree that looks like -what the page author intended. For a particular class of odd edge -cases that now causes problems, there is a new class, -ICantBelieveItsBeautifulSoup. - -By default, Beautiful Soup now performs some cleanup operations on -text before parsing it. This is to avoid common problems with bad -definitions and self-closing tags that crash SGMLParser. You can -provide your own set of cleanup operations, or turn it off -altogether. The cleanup operations include fixing self-closing tags -that don't close, and replacing Microsoft smart quotes and similar -characters with their HTML entity equivalents. - -You can now get a pretty-print version of parsed HTML to get a visual -picture of how Beautiful Soup parses it, with the Tag.prettify() -method. - -== Strings and Unicode == - -There are separate NavigableText subclasses for ASCII and Unicode -strings. These classes directly subclass the corresponding base data -types. This means you can treat NavigableText objects as strings -instead of having to call methods on them to get the strings. - -str() on a Tag always returns a string, and unicode() always returns -Unicode. Previously it was inconsistent. - -== Tree traversal == - -In a first() or fetch() call, the tag name or the desired value of an -attribute can now be any of the following: - - * A string (matches that specific tag or that specific attribute value) - * A list of strings (matches any tag or attribute value in the list) - * A compiled regular expression object (matches any tag or attribute - value that matches the regular expression) - * A callable object that takes the Tag object or attribute value as a - string. It returns None/false/empty string if the given string - doesn't match, and any other value if it does. - -This is much easier to use than SQL-style wildcards (see, regular -expressions are good for something). Because of this, I took out -SQL-style wildcards. I'll put them back if someone complains, but -their removal simplifies the code a lot. - -You can use fetch() and first() to search for text in the parse tree, -not just tags. 
There are new alias methods fetchText() and firstText() -designed for this purpose. As with searching for tags, you can pass in -a string, a regular expression object, or a method to match your text. - -If you pass in something besides a map to the attrs argument of -fetch() or first(), Beautiful Soup will assume you want to match that -thing against the "class" attribute. When you're scraping -well-structured HTML, this makes your code a lot cleaner. - -1.x and 2.x both let you call a Tag object as a shorthand for -fetch(). For instance, foo("bar") is a shorthand for -foo.fetch("bar"). In 2.x, you can also access a specially-named member -of a Tag object as a shorthand for first(). For instance, foo.barTag -is a shorthand for foo.first("bar"). By chaining these shortcuts you -traverse a tree in very little code: for header in -soup.bodyTag.pTag.tableTag('th'): - -If an element relationship (like parent or next) doesn't apply to a -tag, it'll now show up as Null instead of None. first() will also return -Null if you ask it for a nonexistent tag. Null is an object that's -just like None, except you can do whatever you want to it and it'll -give you Null instead of throwing an error. - -This lets you do tree traversals like soup.htmlTag.headTag.titleTag -without having to worry if the intermediate stages are actually -there. Previously, if there was no 'head' tag in the document, headTag -in that instance would have been None, and accessing its 'titleTag' -member would have thrown an AttributeError. Now, you can get what you -want when it exists, and get Null when it doesn't, without having to -do a lot of conditionals checking to see if every stage is None. - -There are two new relations between page elements: previousSibling and -nextSibling. They reference the previous and next element at the same -level of the parse tree. For instance, if you have HTML like this: - - <p><ul><li>Foo<br /><li>Bar</ul> - -The first 'li' tag has a previousSibling of Null and its nextSibling -is the second 'li' tag. The second 'li' tag has a nextSibling of Null -and its previousSibling is the first 'li' tag. The previousSibling of -the 'ul' tag is the first 'p' tag. The nextSibling of 'Foo' is the -'br' tag. - -I took out the ability to use fetch() to find tags that have a -specific list of contents. See, I can't even explain it well. It was -really difficult to use, I never used it, and I don't think anyone -else ever used it. To the extent anyone did, they can probably use -fetchText() instead. If it turns out someone needs it I'll think of -another solution. - -== Tree manipulation == - -You can add new attributes to a tag, and delete attributes from a -tag. In 1.x you could only change a tag's existing attributes. - -== Porting Considerations == - -There are three changes in 2.0 that break old code: - -In the post-1.2 release you could pass a function into fetch(). The -function took a string, the tag name. In 2.0, the function takes the -actual Tag object. - -It's no longer possible to pass in SQL-style wildcards to fetch(). Use a -regular expression instead. - -The different parsing algorithm means the parse tree may not be shaped -like you expect. This will only actually affect you if your code uses -one of the affected parts. I haven't run into this problem yet while -porting my code. - -= Between 1.2 and 2.0 = - -This is the release to get if you want Python 1.5 compatibility.
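Before moving on to the pre-2.0 history: the previousSibling/nextSibling relations introduced in 2.0 above survive in Beautiful Soup 4 as previous_sibling and next_sibling, with plain None playing the role Null played; a quick sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<ul><li>Foo</li><li>Bar</li></ul>", "html.parser")

    first = soup.li
    print(first.next_sibling)      # <li>Bar</li>
    print(first.previous_sibling)  # None (bs4 returns None, not Null)
    print(first.parent.name)       # ul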
- -The desired value of an attribute can now be any of the following: - - * A string - * A string with SQL-style wildcards - * A compiled RE object - * A callable that returns None/false/empty string if the given value - doesn't match, and any other value otherwise. - -This is much easier to use than SQL-style wildcards (see, regular -expressions are good for something). Because of this, I no longer -recommend you use SQL-style wildcards. They may go away in a future -release to clean up the code. - -Made Beautiful Soup handle processing instructions as text instead of -ignoring them. - -Applied patch from Richie Hindle (richie at entrian dot com) that -makes tag.string a shorthand for tag.contents[0].string when the tag -has only one string-owning child. - -Added still more nestable tags. The nestable tags thing won't work in -a lot of cases and needs to be rethought. - -Fixed an edge case where searching for "%foo" would match any string -shorter than "foo". - -= 1.2 "Who for such dainties would not stoop?" (20040708) = - -Applied patch from Ben Last (ben at benlast dot com) that made -Tag.renderContents() correctly handle Unicode. - -Made BeautifulStoneSoup even dumber by making it not implicitly close -a tag when another tag of the same type is encountered; only when an -actual closing tag is encountered. This change courtesy of Fuzzy (mike -at pcblokes dot com). BeautifulSoup still works as before. - -= 1.1 "Swimming in a hot tureen" = - -Added more 'nestable' tags. Changed popping semantics so that when a -nestable tag is encountered, tags are popped up to the previously -encountered nestable tag (of whatever kind). I will revert this if -enough people complain, but it should make more people's lives easier -than harder. This enhancement was suggested by Anthony Baxter (anthony -at interlink dot com dot au). - -= 1.0 "So rich and green" (20040420) = - -Initial release. diff --git a/libs/bs4/README.txt b/libs/bs4/README.txt deleted file mode 100644 index 305c51e05..000000000 --- a/libs/bs4/README.txt +++ /dev/null @@ -1,63 +0,0 @@ -= Introduction = - - >>> from bs4 import BeautifulSoup - >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML") - >>> print soup.prettify() - <html> - <body> - <p> - Some - <b> - bad - <i> - HTML - </i> - </b> - </p> - </body> - </html> - >>> soup.find(text="bad") - u'bad' - - >>> soup.i - <i>HTML</i> - - >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml") - >>> print soup.prettify() - <?xml version="1.0" encoding="utf-8"> - <tag1> - Some - <tag2 /> - bad - <tag3> - XML - </tag3> - </tag1> - -= Full documentation = - -The bs4/doc/ directory contains full documentation in Sphinx -format. Run "make html" in that directory to create HTML -documentation. - -= Running the unit tests = - -Beautiful Soup supports unit test discovery from the project root directory: - - $ nosetests - - $ python -m unittest discover -s bs4 # Python 2.7 and up - -If you checked out the source tree, you should see a script in the -home directory called test-all-versions. This script will run the unit -tests under Python 2.7, then create a temporary Python 3 conversion of -the source and run the unit tests again under Python 3. 
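Since the commit this diff belongs to is a Python 2 to 3 migration, it is worth noting that the deleted README session above is written in Python 2 (print statements, u'bad'); the same session on Python 3 looks roughly like this:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Some<b>bad<i>HTML", "html.parser")
    print(soup.prettify())        # print() is a function on Python 3
    print(soup.find(text="bad"))  # bad (a plain str, no u'' prefix)
    print(soup.i)                 # <i>HTML</i>

Per the 4.4.0 notes earlier in this file, string= is the preferred modern spelling of the text= argument, though text= still works.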
- -= Links = - -Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/ -Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ - http://readthedocs.org/docs/beautiful-soup-4/ -Discussion group: http://groups.google.com/group/beautifulsoup/ -Development: https://code.launchpad.net/beautifulsoup/ -Bug tracker: https://bugs.launchpad.net/beautifulsoup/ diff --git a/libs/bs4/TODO.txt b/libs/bs4/TODO.txt deleted file mode 100644 index e26d6264d..000000000 --- a/libs/bs4/TODO.txt +++ /dev/null @@ -1,31 +0,0 @@ -Additions ---------- - -More of the jQuery API: nextUntil? - -Optimizations -------------- - -The html5lib tree builder doesn't use the standard tree-building API, -which worries me and has resulted in a number of bugs. - -markup_attr_map can be optimized since it's always a map now. - -Upon encountering UTF-16LE data or some other uncommon serialization -of Unicode, UnicodeDammit will convert the data to Unicode, then -encode it at UTF-8. This is wasteful because it will just get decoded -back to Unicode. - -CDATA ------ - -The elementtree XMLParser has a strip_cdata argument that, when set to -False, should allow Beautiful Soup to preserve CDATA sections instead -of treating them as text. Except it doesn't. (This argument is also -present for HTMLParser, and also does nothing there.) - -Currently, htm5lib converts CDATA sections into comments. An -as-yet-unreleased version of html5lib changes the parser's handling of -CDATA sections to allow CDATA sections in tags like <svg> and -<math>. The HTML5TreeBuilder will need to be updated to create CData -objects instead of Comment objects in this situation. diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py index 7a80452f7..95ca229c1 100644 --- a/libs/bs4/__init__.py +++ b/libs/bs4/__init__.py @@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - __author__ = "Leonard Richardson ([email protected])" -__version__ = "4.6.0" -__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" +__version__ = "4.8.0" +__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +# Use of this source code is governed by the MIT license. __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re +import sys import traceback import warnings @@ -50,7 +49,7 @@ from .element import ( # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -74,7 +73,7 @@ class BeautifulSoup(Tag): like HTML's <br> tag), call handle_starttag and then handle_endtag. """ - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. @@ -82,16 +81,56 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). 
This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser.""" + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + + """ if 'convertEntities' in kwargs: + del kwargs['convertEntities'] warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " @@ -142,18 +181,22 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") - if from_encoding and isinstance(markup, unicode): + if from_encoding and isinstance(markup, str): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. 
Your from_encoding will be ignored.") from_encoding = None - if len(kwargs) > 0: - arg = kwargs.keys().pop() - raise TypeError( - "__init__() got an unexpected keyword argument '%s'" % arg) - - if builder is None: - original_features = features - if isinstance(features, basestring): + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, str): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -163,41 +206,73 @@ class BeautifulSoup(Tag): "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) - builder = builder_class() - if not (original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES): + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): if builder.is_xml: markup_type = "XML" else: markup_type = "HTML" - caller = traceback.extract_stack()[0] - filename = caller[0] - line_number = caller[1] - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type)) - + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml - self.builder.soup = self - + self._namespaces = dict() self.parse_only = parse_only + self.builder.initialize_soup(self) + if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, unicode) and not u'<' in markup) + or (isinstance(markup, str) and not '<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. 
- if (isinstance(markup, unicode) + if (isinstance(markup, str) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -205,13 +280,13 @@ class BeautifulSoup(Tag): is_file = False try: is_file = os.path.exists(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: - if isinstance(markup, unicode): + if isinstance(markup, str): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should' @@ -263,9 +338,9 @@ class BeautifulSoup(Tag): if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") - elif isinstance(markup, unicode): - space = u' ' - cant_start_with = (u"http:", u"https:") + elif isinstance(markup, str): + space = ' ' + cant_start_with = ("http:", "https:") else: return @@ -302,9 +377,10 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack = [] self.pushTag(self) - def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, namespace, nsprefix, attrs) + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" @@ -327,7 +403,7 @@ class BeautifulSoup(Tag): def pushTag(self, tag): #print "Push", tag.name - if self.currentTag: + if self.currentTag is not None: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] @@ -336,7 +412,7 @@ class BeautifulSoup(Tag): def endData(self, containerClass=NavigableString): if self.current_data: - current_data = u''.join(self.current_data) + current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. @@ -366,60 +442,71 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" - parent = parent or self.currentTag - previous_element = most_recent_element or self._most_recent_element + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element next_element = previous_sibling = next_sibling = None if isinstance(o, Tag): next_element = o.next_element next_sibling = o.next_sibling previous_sibling = o.previous_sibling - if not previous_element: + if previous_element is None: previous_element = o.previous_element + fix = parent.next_element is not None + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 - else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" 
% ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None - else: - previous_element = previous_sibling = parent.contents[index-1] - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element: - previous_element.next_element = o - o.next_element = next_element - if next_element: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling: - previous_sibling.next_sibling = o + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. + child.previous_element = el + child.previous_sibling = None + + # We have no sibling as we've been appended as the last. + child.next_sibling = None + + # This index is a tag, dig deeper for a "last descendant" + if isinstance(child, Tag) and child.contents: + descendant = child._last_descendant(False) + + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. It should have no next sibling. + descendant.next_element = None + descendant.next_sibling = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + descendant.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent @@ -465,7 +552,7 @@ class BeautifulSoup(Tag): self.currentTag, self._most_recent_element) if tag is None: return tag - if self._most_recent_element: + if self._most_recent_element is not None: self._most_recent_element.next_element = tag self._most_recent_element = tag self.pushTag(tag) @@ -490,9 +577,9 @@ class BeautifulSoup(Tag): encoding_part = '' if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'<?xml version="1.0"%s?>\n' % encoding_part + prefix = '<?xml version="1.0"%s?>\n' % encoding_part else: - prefix = u'' + prefix = '' if not pretty_print: indent_level = None else: @@ -526,4 +613,4 @@ class FeatureNotFound(ValueError): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index fdb3362fc..cc497cf0b 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. 
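Before moving on to the builder module: a minimal sketch of the four linkage attributes that object_was_parsed() and _linkage_fixer() above keep consistent (assuming the patched bs4):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>one</p><p>two</p>", "html.parser")
    first, second = soup.find_all("p")
    first.next_sibling is second    # True: same parent, adjacent children
    first.next_element              # 'one': next in parse order, not sibling order
    second.previous_element         # 'one': the last descendant of the previous tag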
+__license__ = "MIT"

from collections import defaultdict
import itertools
@@ -7,8 +7,7 @@ import sys
from bs4.element import (
 CharsetMetaAttributeValue,
 ContentMetaAttributeValue,
- HTMLAwareEntitySubstitution,
- whitespace_re
+ nonwhitespace_re
 )

__all__ = [
@@ -90,18 +89,46 @@ class TreeBuilder(object):
 is_xml = False
 picklable = False
- preserve_whitespace_tags = set()
 empty_element_tags = None # A tag will be considered an empty-element
 # tag when and only when it has no contents.
-
+
 # A value for these tag/attribute combinations is a space- or
 # comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
-
-
- def __init__(self):
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ USE_DEFAULT = object()
+
+ def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+ """Constructor.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
+
+ :param preserve_whitespace_tags: A set of tags whose internal
+ whitespace should be preserved on output, the way HTML
+ treats <pre>.
+ """
 self.soup = None
-
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
 def reset(self):
 pass
@@ -125,7 +152,7 @@ class TreeBuilder(object):
 if self.empty_element_tags is None:
 return True
 return tag_name in self.empty_element_tags
-
 def feed(self, markup):
 raise NotImplementedError()
@@ -160,14 +187,14 @@ class TreeBuilder(object):
 universal = self.cdata_list_attributes.get('*', [])
 tag_specific = self.cdata_list_attributes.get(
 tag_name.lower(), None)
- for attr in attrs.keys():
+ for attr in list(attrs.keys()):
 if attr in universal or (tag_specific and attr in tag_specific):
 # We have a "class"-type attribute whose string
 # value is a whitespace-separated list of
 # values. Split it into a list.
 value = attrs[attr]
- if isinstance(value, basestring):
- values = whitespace_re.split(value)
+ if isinstance(value, str):
+ values = nonwhitespace_re.findall(value)
 else:
 # html5lib sometimes calls setAttributes twice
 # for the same tag when rearranging the parse
@@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
 Such as which tags are empty-element tags.
 """
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
 empty_element_tags = set([
 # These are from HTML5.
 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
 ])
+ # The HTML standard defines these as block-level elements.
Beautiful + # Soup does not treat these elements differently from other elements, + # but it may do so eventually, and this information is available if + # you need to use it. + block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, # class="foo bar" means that the 'class' attribute has two values, @@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder): # encounter one of these attributes, we will parse its value into # a list of values if possible. Upon output, the list will be # converted back into a string. - cdata_list_attributes = { + DEFAULT_CDATA_LIST_ATTRIBUTES = { "*" : ['class', 'accesskey', 'dropzone'], "a" : ['rel', 'rev'], "link" : ['rel', 'rev'], @@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + def set_up_substitutions(self, tag): # We are only interested in <meta> tags if tag.name != 'meta': diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 5f5489358..090bb61a8 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTML5TreeBuilder', @@ -15,7 +15,7 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib from html5lib.constants import ( @@ -33,7 +33,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, str): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, str): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. 
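The DEFAULT_CDATA_LIST_ATTRIBUTES table above, combined with the kwargs forwarding in the BeautifulSoup constructor, makes multi-valued attribute handling configurable; a minimal sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class="foo bar"></p>', "html.parser")
    soup.p['class']                 # ['foo', 'bar'] -- split into a list

    soup = BeautifulSoup('<p class="foo bar"></p>', "html.parser",
                         multi_valued_attributes=None)
    soup.p['class']                 # 'foo bar' -- list splitting disabled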
@@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment + return '<html><head></head><body>%s</body></html>' % fragment class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): @@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] - for name, value in element.attrs.items(): + for name, value in list(element.attrs.items()): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): @@ -199,14 +199,14 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes + list_attr = self.element.cdata_list_attributes if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -229,7 +229,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -246,10 +246,10 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -259,7 +259,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -299,7 +299,7 @@ class Element(treebuilder_base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. 
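The move from whitespace_re.split() to nonwhitespace_re.findall() here and in the base builder avoids the empty strings that split() yields around leading and trailing whitespace:

    import re

    whitespace_re = re.compile(r"\s+")     # the old pattern
    nonwhitespace_re = re.compile(r"\S+")  # the new one

    value = "  foo  bar "
    whitespace_re.split(value)             # ['', 'foo', 'bar', '']
    nonwhitespace_re.findall(value)        # ['foo', 'bar']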
@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
 # Set the first child's previous_element and previous_sibling
 # to elements within the new parent
 first_child = to_append[0]
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
 first_child.previous_element = new_parents_last_descendant
 else:
 first_child.previous_element = new_parent_element
 first_child.previous_sibling = new_parents_last_child
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
 new_parents_last_descendant.next_element = first_child
 else:
 new_parent_element.next_element = first_child
- if new_parents_last_child:
+ if new_parents_last_child is not None:
 new_parents_last_child.next_sibling = first_child

 # Find the very last element being moved. It is now the
@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
 last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

 last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
- if new_parents_last_descendant_next_element:
+ if new_parents_last_descendant_next_element is not None:
 # TODO: This code has no test coverage and I'm not sure
 # how to get html5lib to go through this path, but it's
 # just the other side of the previous line.
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index 67890b3a3..ea549c356 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -1,17 +1,18 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

__all__ = [
 'HTMLParserTreeBuilder',
 ]

-from HTMLParser import HTMLParser
+from html.parser import HTMLParser

try:
- from HTMLParser import HTMLParseError
-except ImportError, e:
+ from html.parser import HTMLParseError
+except ImportError as e:
 # HTMLParseError is removed in Python 3.5. Since it can never be
 # thrown in 3.5, we can just define our own class as a placeholder.
 class HTMLParseError(Exception):
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
 # order. It's a list of closing tags we've already handled and
 # will ignore, assuming they ever show up.
 self.already_closed_empty_element = []
-
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
 def handle_startendtag(self, name, attrs):
 # This is only called when the markup looks like
 # <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
 else:
 real_name = int(name)

- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
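A sketch of what the fallback below buys (assuming the patched bs4 is importable): a Windows-1252 numeric reference decodes to the intended character rather than a C1 control:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>&#147;hi&#148;</p>", "html.parser")
    soup.p.string                   # '\u201chi\u201d' -- curly quotes via the fallback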
+ for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError as e: + pass + if not data: + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + pass + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): @@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False + parser_kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): @@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ - if isinstance(markup, unicode): + if isinstance(markup, str): yield (markup, None, None, False) return @@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + parser.close() + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index d2ca2872d..a490e2301 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,13 +1,18 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', ] +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + from io import BytesIO -from StringIO import StringIO -import collections +from io import StringIO from lxml import etree from bs4.element import ( Comment, @@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector LXML = 'lxml' +def _invert(d): + "Invert a dictionary." 
+ return dict((v,k) for k, v in list(d.items())) + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder): # This namespace mapping is specified in the XML Namespace # standard. - DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value def default_parser(self, encoding): # This can either return a parser object or a class, which @@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. parser = self.default_parser(encoding) - if isinstance(parser, collections.Callable): + if isinstance(parser, Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser - def __init__(self, parser=None, empty_element_tags=None): + def __init__(self, parser=None, empty_element_tags=None, **kwargs): # TODO: Issue a warning if parser is present but not a # callable, since that means there's no way to create new # parsers for different encodings. @@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS] - + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag # name. Copied from lxml's src/lxml/sax.py. @@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. 
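The _register_namespaces() hook above records every prefix seen while parsing, so selectors can resolve them later without an explicit mapping (a sketch, assuming lxml and soupsieve are installed):

    from bs4 import BeautifulSoup

    xml = '<root xmlns:ns="http://example.com/ns"><ns:a>x</ns:a></root>'
    soup = BeautifulSoup(xml, "xml")
    soup._namespaces                # {'xml': ..., 'ns': 'http://example.com/ns'}
    soup.select("ns|a")             # [<ns:a>x</ns:a>] -- prefix resolved automatically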
yield (markup.encode("utf8"), "utf8", @@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS] + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] def start(self, name, attrs, nsmap={}): # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. attrs = dict(attrs) nsprefix = None # Invert each namespace map as it comes in. - if len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) - self.nsmaps.append(inverted_nsmap) + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment + return '<html><body>%s</body></html>' % fragment diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py index 7965565f5..c7ac4d431 100644 --- a/libs/bs4/dammit.py +++ b/libs/bs4/dammit.py @@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal Feed Parser. 
It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"

import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
import re
import logging
import string
@@ -46,9 +45,9 @@ except ImportError:
 pass

xml_encoding_re = re.compile(
- '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+ '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
- '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+ '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

class EntitySubstitution(object):
@@ -58,15 +57,24 @@ class EntitySubstitution(object):
 lookup = {}
 reverse_lookup = {}
 characters_for_re = []
- for codepoint, name in list(codepoint2name.items()):
- character = unichr(codepoint)
- if codepoint != 34:
+
+ # &apos; is an XHTML and HTML 5 entity, but not an HTML 4
+ # entity. We don't want to use it, but we want to recognize it on the way in.
+ #
+ # TODO: Ideally we would be able to recognize all HTML 5 named
+ # entities, but that's a little tricky.
+ extra = [(39, 'apos')]
+ for codepoint, name in list(codepoint2name.items()) + extra:
+ character = chr(codepoint)
+ if codepoint not in (34, 39):
 # There's no point in turning the quotation mark into
- # &quot;, unless it happens within an attribute value, which
- # is handled elsewhere.
+ # &quot; or the single quote into &apos;, unless it
+ # happens within an attribute value, which is handled
+ # elsewhere.
 characters_for_re.append(character)
 lookup[character] = name
- # But we do want to turn &quot; into the quotation mark.
+ # But we do want to recognize those entities on the way in and
+ # convert them to Unicode characters.
 reverse_lookup[name] = character
 re_definition = "[%s]" % "".join(characters_for_re)
 return lookup, reverse_lookup, re.compile(re_definition)
@@ -82,7 +90,7 @@ class EntitySubstitution(object):
 }

 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
 ")")
 AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@@ -274,7 +282,7 @@ class EncodingDetector:
 def strip_byte_order_mark(cls, data):
 """If a byte-order mark is present, strip it and return the encoding it implies."""
 encoding = None
- if isinstance(data, unicode):
+ if isinstance(data, str):
 # Unicode data cannot have a byte-order mark.
 return data, encoding
 if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -352,9 +360,9 @@ class UnicodeDammit:
 markup, override_encodings, is_html, exclude_encodings)

 # Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
+ if isinstance(markup, str) or markup == '':
 self.markup = markup
- self.unicode_markup = unicode(markup)
+ self.unicode_markup = str(markup)
 self.original_encoding = None
 return
@@ -438,7 +446,7 @@ class UnicodeDammit:
 def _to_unicode(self, data, encoding, errors="strict"):
 '''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases''' - return unicode(data, encoding, errors) + return str(data, encoding, errors) @property def declared_html_encoding(self): diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 8768332f5..b5f6e6c8b 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -1,12 +1,11 @@ """Diagnostic functions, mainly for use when doing tech support.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" import cProfile -from StringIO import StringIO -from HTMLParser import HTMLParser +from io import StringIO +from html.parser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry @@ -22,8 +21,8 @@ import cProfile def diagnose(data): """Diagnostic suite for isolating common problems.""" - print "Diagnostic running on Beautiful Soup %s" % __version__ - print "Python version %s" % sys.version + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: @@ -32,16 +31,16 @@ def diagnose(data): break else: basic_parsers.remove(name) - print ( + print(( "I noticed that %s is not installed. Installing it may help." % - name) + name)) if 'lxml' in basic_parsers: - basic_parsers.append(["lxml", "xml"]) + basic_parsers.append("lxml-xml") try: from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) - except ImportError, e: + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: print ( "lxml is not installed or couldn't be imported.") @@ -49,37 +48,43 @@ def diagnose(data): if 'html5lib' in basic_parsers: try: import html5lib - print "Found html5lib version %s" % html5lib.__version__ - except ImportError, e: + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: print ( "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - with open(data) as fp: - data = fp.read() elif data.startswith("http:") or data.startswith("https:"): - print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data - print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") return - print + else: + try: + if os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print() for parser in basic_parsers: - print "Trying to parse your markup with %s" % parser + print("Trying to parse your markup with %s" % parser) success = False try: - soup = BeautifulSoup(data, parser) + soup = BeautifulSoup(data, features=parser) success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." 
% parser) traceback.print_exc() if success: - print "Here's what %s did with the markup:" % parser - print soup.prettify() + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) - print "-" * 80 + print("-" * 80) def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. @@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs): """ from lxml import etree for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): - print("%s, %4s, %s" % (event, element.tag, element.text)) + print(("%s, %4s, %s" % (event, element.tag, element.text))) class AnnouncingParser(HTMLParser): """Announces HTMLParser parse events, without doing anything else.""" @@ -149,7 +154,7 @@ def rword(length=5): def rsentence(length=4): "Generate a random sentence-like string." - return " ".join(rword(random.randint(4,9)) for i in range(length)) + return " ".join(rword(random.randint(4,9)) for i in list(range(length))) def rdoc(num_elements=1000): """Randomly generate an invalid HTML document.""" @@ -171,9 +176,9 @@ def rdoc(num_elements=1000): def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" - print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) data = rdoc(num_elements) - print "Generated a large invalid HTML document (%d bytes)." % len(data) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False @@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000): soup = BeautifulSoup(data, parser) b = time.time() success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) from lxml import etree a = time.time() etree.HTML(data) b = time.time() - print "Raw lxml parsed the markup in %.2fs." % (b-a) + print("Raw lxml parsed the markup in %.2fs." % (b-a)) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() - print "Raw html5lib parsed the markup in %.2fs." % (b-a) + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) def profile(num_elements=100000, parser="lxml"): diff --git a/libs/bs4/element.py b/libs/bs4/element.py index 9ef75f814..f16b1663e 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -1,18 +1,35 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" -import collections +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable import re -import shlex import sys import warnings -from bs4.dammit import EntitySubstitution +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile("\s+") +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. 
I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -26,22 +43,22 @@ def _alias(attr): return alias -class NamespacedAttribute(unicode): +class NamespacedAttribute(str): def __new__(cls, prefix, name, namespace=None): if name is None: - obj = unicode.__new__(cls, prefix) + obj = str.__new__(cls, prefix) elif prefix is None: # Not really namespaced. - obj = unicode.__new__(cls, name) + obj = str.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -52,7 +69,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -69,15 +86,15 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. """ - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) + return str.__new__(str, original_value) - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -86,94 +103,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) -class HTMLAwareEntitySubstitution(EntitySubstitution): - - """Entity substitution rules that are aware of some HTML quirks. - - Specifically, the contents of <script> and <style> tags should not - undergo entity substitution. - - Incoming NavigableString objects are checked to see if they're the - direct children of a <script> or <style> tag. - """ - - cdata_containing_tags = set(["script", "style"]) - - preformatted_tags = set(["pre"]) + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent - preserve_whitespace_tags = set(['pre', 'textarea']) + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self - @classmethod - def _substitute_if_appropriate(cls, ns, f): - if (isinstance(ns, NavigableString) - and ns.parent is not None - and ns.parent.name in cls.cdata_containing_tags): - # Do nothing. - return ns - # Substitute. 
- return f(ns) + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self - @classmethod - def substitute_html(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_html) + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self - @classmethod - def substitute_xml(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_xml) + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self - # There are five possible values for the "formatter" argument passed in - # to methods like encode() and prettify(): - # - # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to - # XML entities: & < > - # None - The null formatter. Unicode characters are never - # converted to entities. This is not recommended, but it's - # faster than "minimal". - # A function - This function will be called on every string that - # needs to undergo entity substitution. - # - - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. - - HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, - None : None - } - - XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, - None : None - } - - def format_string(self, s, formatter='minimal'): + def format_string(self, s, formatter): """Format the given string using the given formatter.""" - if not callable(formatter): - formatter = self._formatter_for_name(formatter) if formatter is None: - output = s - else: - output = formatter(s) + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) return output + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look up + an XMLFormatter or HTMLFormatter in the appropriate registry. + """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if callable(formatter): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + @property def _is_xml(self): """Is this element part of an XML tree or an HTML tree? - This is used when mapping a formatter name ("minimal") to an - appropriate function (one that performs entity-substitution on - the contents of <script> and <style> tags, or not). It can be + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be inefficient, but it should be called very rarely. 
""" if self.known_xml is not None: @@ -191,48 +185,13 @@ class PageElement(object): return getattr(self, 'is_xml', False) return self.parent._is_xml - def _formatter_for_name(self, name): - "Look up a formatter function based on its name and the tree." - if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) - else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) - - def setup(self, parent=None, previous_element=None, next_element=None, - previous_sibling=None, next_sibling=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - - self.previous_element = previous_element - if previous_element is not None: - self.previous_element.next_element = self - - self.next_element = next_element - if self.next_element: - self.next_element.previous_element = self - - self.next_sibling = next_sibling - if self.next_sibling: - self.next_sibling.previous_sibling = self - - if (not previous_sibling - and self.parent is not None and self.parent.contents): - previous_sibling = self.parent.contents[-1] - - self.previous_sibling = previous_sibling - if previous_sibling: - self.previous_sibling.next_sibling = self - nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): - if not self.parent: + if self.parent is None: raise ValueError( - "Cannot replace one element with another when the" + "Cannot replace one element with another when the " "element to be replaced is not part of a tree.") if replace_with is self: return @@ -247,7 +206,7 @@ class PageElement(object): def unwrap(self): my_parent = self.parent - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") @@ -295,7 +254,7 @@ class PageElement(object): def _last_descendant(self, is_initialized=True, accept_self=True): "Finds the last element beneath this object to be parsed." - if is_initialized and self.next_sibling: + if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: last_child = self @@ -312,10 +271,18 @@ class PageElement(object): raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, str) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -377,43 +344,54 @@ class PageElement(object): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) - def insert_before(self, predecessor): - """Makes the given element the immediate predecessor of this one. 
+ def extend(self, tags): + """Appends the given tags to the contents of this tag.""" + for tag in tags: + self.append(tag) - The two elements will have the same parent, and the given element + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + The elements will have the same parent, and the given elements will be immediately before this one. """ - if self is predecessor: - raise ValueError("Can't insert an element before itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'before' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, successor): - """Makes the given element the immediate successor of this one. - - The two elements will have the same parent, and the given element + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements will be immediately after this one. """ - if self is successor: - raise ValueError("Can't insert an element after itself.") + # Do all error checking before modifying the tree. parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'after' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1, successor) + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 def find_next(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and @@ -533,17 +511,24 @@ class PageElement(object): result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, str): # Optimization to find all tags with a given name. if name.count(':') == 1: - # This is a name with a prefix. - prefix, name = name.split(':', 1) + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. 
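The reworked insert_before()/insert_after() above take any number of elements; the running offset keeps them in argument order. A sketch (assuming the patched bs4):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p><b>one</b></p>", "html.parser")
    soup.b.insert_after(" and ", soup.new_tag("i"))
    soup.p.decode_contents()        # '<b>one</b> and <i></i>'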
+ prefix, local_name = name.split(':', 1) else: prefix = None + local_name = name result = (element for element in generator if isinstance(element, Tag) - and element.name == name - and (prefix is None or element.prefix == prefix) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) ) return ResultSet(strainer, result) results = ResultSet(strainer) @@ -597,82 +582,6 @@ class PageElement(object): yield i i = i.parent - # Methods for supporting CSS selectors. - - tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - - # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---------------------------/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute - # Tag - attribselect_re = re.compile( - r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + - r'=?"?(?P<value>[^\]"]*)"?\]$' - ) - - def _attr_value_as_string(self, value, default=None): - """Force an attribute value into a string representation. - - A multi-valued attribute will be converted into a - space-separated stirng. - """ - value = self.get(value, default) - if isinstance(value, list) or isinstance(value, tuple): - value =" ".join(value) - return value - - def _tag_name_matches_and(self, function, tag_name): - if not tag_name: - return function - else: - def _match(tag): - return tag.name == tag_name and function(tag) - return _match - - def _attribute_checker(self, operator, attribute, value=''): - """Create a function that performs a CSS selector operation. - - Takes an operator, attribute and optional value. Returns a - function that will return True for elements that match that - combination. - """ - if operator == '=': - # string representation of `attribute` is equal to `value` - return lambda el: el._attr_value_as_string(attribute) == value - elif operator == '~': - # space-separated list representation of `attribute` - # contains `value` - def _includes_value(element): - attribute_value = element.get(attribute, []) - if not isinstance(attribute_value, list): - attribute_value = attribute_value.split() - return value in attribute_value - return _includes_value - elif operator == '^': - # string representation of `attribute` starts with `value` - return lambda el: el._attr_value_as_string( - attribute, '').startswith(value) - elif operator == '$': - # string representation of `attribute` ends with `value` - return lambda el: el._attr_value_as_string( - attribute, '').endswith(value) - elif operator == '*': - # string representation of `attribute` contains `value` - return lambda el: value in el._attr_value_as_string(attribute, '') - elif operator == '|': - # string representation of `attribute` is either exactly - # `value` or starts with `value` and then a dash. - def _is_or_starts_with_dash(element): - attribute_value = element._attr_value_as_string(attribute, '') - return (attribute_value == value or attribute_value.startswith( - value + '-')) - return _is_or_starts_with_dash - else: - return lambda el: el.has_attr(attribute) - # Old non-property versions of the generators, for backwards # compatibility with BS3. 
def nextGenerator(self): @@ -691,7 +600,7 @@ class PageElement(object): return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): PREFIX = '' SUFFIX = '' @@ -709,10 +618,10 @@ class NavigableString(unicode, PageElement): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - u = unicode.__new__(cls, value) + if isinstance(value, str): + u = str.__new__(cls, value) else: - u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) u.setup() return u @@ -723,7 +632,7 @@ class NavigableString(unicode, PageElement): return type(self)(self) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -737,6 +646,7 @@ class NavigableString(unicode, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter.""" output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @@ -755,37 +665,39 @@ class PreformattedString(NavigableString): but the return value will be ignored. """ - def output_ready(self, formatter="minimal"): - """CData strings are passed into the formatter. - But the return value is ignored.""" - self.format_string(self, formatter) + def output_ready(self, formatter=None): + """CData strings are passed into the formatter, purely + for any side effects. The return value is ignored. + """ + if formatter is not None: + ignore = self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' + PREFIX = '<![CDATA[' + SUFFIX = ']]>' class ProcessingInstruction(PreformattedString): """A SGML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'>' + PREFIX = '<?' + SUFFIX = '>' class XMLProcessingInstruction(ProcessingInstruction): """An XML processing instruction.""" - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Comment(PreformattedString): - PREFIX = u'<!--' - SUFFIX = u'-->' + PREFIX = '<!--' + SUFFIX = '-->' class Declaration(PreformattedString): - PREFIX = u'<?' - SUFFIX = u'?>' + PREFIX = '<?' + SUFFIX = '?>' class Doctype(PreformattedString): @@ -802,8 +714,8 @@ class Doctype(PreformattedString): return Doctype(value) - PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>\n' + PREFIX = '<!DOCTYPE ' + SUFFIX = '>\n' class Tag(PageElement): @@ -826,14 +738,6 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix - if builder is not None: - preserve_whitespace_tags = builder.preserve_whitespace_tags - else: - if is_xml: - preserve_whitespace_tags = [] - else: - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -856,13 +760,32 @@ class Tag(PageElement): self.setup(parent, previous) self.hidden = False - # Set up any substitutions, such as the charset in a META tag. - if builder is not None: + if builder is None: + # In the absence of a TreeBuilder, assume this tag is nothing + # special. + self.can_be_empty_element = False + self.cdata_list_attributes = None + else: + # Set up any substitutions for this tag, such as the charset in a META tag. 
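The PREFIX/SUFFIX pairs above (now plain str literals) are what wrap each special string on output; for instance:

    from bs4 import BeautifulSoup, Comment

    soup = BeautifulSoup("<p>hi</p>", "html.parser")
    soup.p.append(Comment("note"))
    str(soup.p)                     # '<p>hi<!--note--></p>'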
builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. self.can_be_empty_element = builder.can_be_empty_element(name) - else: - self.can_be_empty_element = False + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -942,7 +865,7 @@ class Tag(PageElement): for string in self._all_strings(True): yield string - def get_text(self, separator=u"", strip=False, + def get_text(self, separator="", strip=False, types=(NavigableString, CData)): """ Get all child strings, concatenated using the given separator. @@ -976,6 +899,43 @@ class Tag(PageElement): for element in self.contents[:]: element.extract() + def smooth(self): + """Smooth out this element's children by consolidating consecutive strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + def index(self, element): """ Find the index of a child by identity, not value. Avoids issues with @@ -1021,7 +981,7 @@ class Tag(PageElement): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -1046,8 +1006,10 @@ class Tag(PageElement): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( - '.%sTag is deprecated, use .find("%s") instead.' % ( - tag_name, tag_name)) + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) return self.find(tag_name) # We special case contents to avoid recursion. 
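The new smooth() method above consolidates adjacent strings left behind by tree edits; a quick sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>one</p>", "html.parser")
    soup.p.append(", two")
    soup.p.append(", three")
    len(soup.p.contents)            # 3 separate NavigableStrings
    soup.smooth()
    soup.p.contents                 # ['one, two, three']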
elif not tag.startswith("__") and not tag == "contents": @@ -1108,14 +1070,6 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" - - return ( - indent_level is not None - and self.name not in self.preserve_whitespace_tags - ) - def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1129,31 +1083,32 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a function. This - # will stop the lookup from happening over and over again. - if not callable(formatter): - formatter = self._formatter_for_name(formatter) - + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) attrs = [] - if self.attrs: - for key, val in sorted(self.attrs.items()): - if val is None: - decoded = key - else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) - elif ( + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None): - val = val.encode(eventual_encoding) - - text = self.format_string(val, formatter) - decoded = ( - unicode(key) + '=' - + EntitySubstitution.quoted_attribute_value(text)) - attrs.append(decoded) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) close = '' closeTag = '' @@ -1162,7 +1117,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '/' + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1177,7 +1132,8 @@ class Tag(PageElement): else: indent_contents = None contents = self.decode_contents( - indent_contents, eventual_encoding, formatter) + indent_contents, eventual_encoding, formatter + ) if self.hidden: # This is the 'document root' object. @@ -1209,6 +1165,13 @@ class Tag(PageElement): s = ''.join(s) return s + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) + def prettify(self, encoding=None, formatter="minimal"): if encoding is None: return self.decode(True, formatter=formatter) @@ -1224,19 +1187,19 @@ class Tag(PageElement): indented this many spaces. :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ + encoded into this encoding. decode_contents() is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. 
""" - # First off, turn a string formatter into a function. This + # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not callable(formatter): - formatter = self._formatter_for_name(formatter) + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) pretty_print = (indent_level is not None) s = [] @@ -1247,16 +1210,19 @@ class Tag(PageElement): elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, formatter)) - if text and indent_level and not self.name == 'pre': + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) + if text and indent_level and not preserve_whitespace: text = text.strip() if text: - if pretty_print and not self.name == 'pre': + if pretty_print and not preserve_whitespace: s.append(" " * (indent_level - 1)) s.append(text) - if pretty_print and not self.name == 'pre': + if pretty_print and not preserve_whitespace: s.append("\n") return ''.join(s) - + def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1331,236 +1297,41 @@ class Tag(PageElement): current = current.next_element # CSS selector code - - _selector_combinators = ['>', '+', '~'] - _select_debug = False - quoted_colon = re.compile('"[^"]*:[^"]*"') - def select_one(self, selector): + def select_one(self, selector, namespaces=None, **kwargs): """Perform a CSS selection operation on the current element.""" - value = self.select(selector, limit=1) + value = self.select(selector, namespaces, 1, **kwargs) if value: return value[0] return None - def select(self, selector, _candidate_generator=None, limit=None): - """Perform a CSS selection operation on the current element.""" + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. - # Handle grouping selectors if ',' exists, ie: p,a - if ',' in selector: - context = [] - for partial_selector in selector.split(','): - partial_selector = partial_selector.strip() - if partial_selector == '': - raise ValueError('Invalid group selection syntax: %s' % selector) - candidates = self.select(partial_selector, limit=limit) - for candidate in candidates: - if candidate not in context: - context.append(candidate) - - if limit and len(context) >= limit: - break - return context - tokens = shlex.split(selector) - current_context = [self] - - if tokens[-1] in self._selector_combinators: - raise ValueError( - 'Final combinator "%s" is missing an argument.' % tokens[-1]) + This uses the SoupSieve library. - if self._select_debug: - print 'Running CSS selector "%s"' % selector + :param selector: A string containing a CSS selector. - for index, token in enumerate(tokens): - new_context = [] - new_context_ids = set([]) + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. - if tokens[index-1] in self._selector_combinators: - # This token was consumed by the previous combinator. Skip it. - if self._select_debug: - print ' Token was consumed by the previous combinator.' - continue + :param limit: After finding this number of results, stop looking. 
- if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None - - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token and not self.quoted_colon.search(token): - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is None: - pseudo_type = pseudo - pseudo_value = None - else: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - else: - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". 
- next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - count = 0 - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. - break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - - current_context = new_context - if limit and len(current_context) >= limit: - current_context = current_context[:limit] - - if self._select_debug: - print "Final verdict:" - for i in current_context: - print " %s %s" % (i.name, i.attrs) - return current_context + :param kwargs: Any extra arguments you'd like to pass in to + soupsieve.select(). + """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) # Old names for backwards compatibility def childGenerator(self): @@ -1603,7 +1374,7 @@ class SoupStrainer(object): else: attrs = kwargs normalized_attrs = {} - for key, value in attrs.items(): + for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs @@ -1612,7 +1383,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. 
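# ---------------------------------------------------------------------------
# Editor's sketch (illustrative only, not part of the patch): a SoupStrainer
# criterion may be a string, bytes, regular expression, callable, boolean, or
# a list of these; anything else is normalized to str below. Typical use:
import re
from bs4 import BeautifulSoup, SoupStrainer

only_secure_links = SoupStrainer("a", href=re.compile("^https:"))
soup = BeautifulSoup('<a href="https://x">x</a><a href="http://y">y</a>',
                     "html.parser", parse_only=only_secure_links)
print(soup.decode())  # only the https:// link survives parsing
# ---------------------------------------------------------------------------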
- if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1625,7 +1396,7 @@ class SoupStrainer(object): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -1637,7 +1408,7 @@ class SoupStrainer(object): # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return str(str(value)) def __str__(self): if self.text: @@ -1652,7 +1423,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, collections.Callable) + isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1691,7 +1462,7 @@ class SoupStrainer(object): found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -1704,7 +1475,7 @@ class SoupStrainer(object): found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -1732,7 +1503,7 @@ class SoupStrainer(object): # True matches any non-None value. return markup is not None - if isinstance(match_against, collections.Callable): + if isinstance(match_against, Callable): return match_against(markup) # Custom callables take the tag as an argument, but all @@ -1749,7 +1520,7 @@ class SoupStrainer(object): return not match_against if (hasattr(match_against, '__iter__') - and not isinstance(match_against, basestring)): + and not isinstance(match_against, str)): # We're asked to match against an iterable of items. # The markup must be match at least one item in the # iterable. We'll try each one in turn. @@ -1776,7 +1547,7 @@ class SoupStrainer(object): # the tag's name and once against its prefixed name. match = False - if not match and isinstance(match_against, unicode): + if not match and isinstance(match_against, str): # Exact string match match = markup == match_against diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py index 6ba2506c4..cc9966601 100644 --- a/libs/bs4/testing.py +++ b/libs/bs4/testing.py @@ -1,7 +1,7 @@ +# encoding: utf-8 """Helper classes for tests.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. 
__license__ = "MIT" import pickle @@ -16,29 +16,66 @@ from bs4.element import ( ContentMetaAttributeValue, Doctype, SoupStrainer, + Tag ) from bs4.builder import HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder +BAD_DOCUMENT = """A bare string +<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> +<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> +<div><![CDATA[A CDATA section where it doesn't belong]]></div> +<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div> +<div>A <meta> tag</div> +<div>A <br> tag that supposedly has contents.</br></div> +<div>AT&T</div> +<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div> +<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div> +<div>This numeric entity is missing the final semicolon: <x t="piñata"></div> +<div><a href="http://example.com/</a> that attribute value never got closed</div> +<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div> +<! This document starts with a bogus declaration ><div>a</div> +<div>This document contains <!an incomplete declaration <div>(do you see it?)</div> +<div>This document ends with <!an incomplete declaration +<div><a style={height:21px;}>That attribute value was bogus</a></div> +<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace +<div><table><td nowrap>That boolean attribute had no value</td></table></div> +<div>Here's a nonexistent entity: &#foo; (do you see it?)</div> +<div>This document ends before the entity finishes: > +<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p> +<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b> +<div><table><tr><td>Here's a table</td></tr></table></div> +<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div> +<div>This tag contains nothing but whitespace: <b> </b></div> +<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div> +<div><table><div>This table contains bare markup</div></table></div> +<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div> +<div>This document contains a <!DOCTYPE surprise>surprise doctype</div> +<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div> +<div><our\u2603>Tag name contains Unicode characters</our\u2603></div> +<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +""" + class SoupTest(unittest.TestCase): @property def default_builder(self): - return default_builder() + return default_builder def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) - def document_for(self, markup): + def document_for(self, markup, **kwargs): """Turn an HTML fragment into a document. The details depend on the builder. 
""" - return self.default_builder.test_fragment_to_document(markup) + return self.default_builder(**kwargs).test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder @@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase): self.assertEqual(earlier, e.previous_element) earlier = e + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + 
assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + class HTMLTreeBuilderSmokeTest(object): """A basic test of a treebuilder's competence. @@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object): soup = self.soup("") new_tag = soup.new_tag(name) self.assertEqual(True, new_tag.is_empty_element) - + def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. @@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object): soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + def test_processing_instruction(self): # We test both Unicode and bytestring to verify that # process_markup correctly sets processing_instruction_class # even when the markup is already Unicode and there is no # need to process anything. - markup = u"""<?PITarget PIContent?>""" + markup = """<?PITarget PIContent?>""" soup = self.soup(markup) self.assertEqual(markup, soup.decode()) @@ -292,6 +452,18 @@ Hello, world! "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '<div class=" foo bar "></a>' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + def test_deeply_nested_multivalued_attribute(self): # html5lib can set the attributes of the same tag many times # as it rearranges the tree. This has caused problems with @@ -311,15 +483,41 @@ Hello, world! def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + "<p>• AT&T is in the s&p 500</p>", + "<p>\u2022 AT&T is in the s&p 500</p>" + ) + + def test_apos_entity(self): + self.assertSoupEquals( + "<p>Bob's Bar</p>", + "<p>Bob's Bar</p>", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. 
+ markup = "<p>“Hello” -☃</p>" + soup = self.soup(markup) + self.assertEqual("“Hello” -☃", soup.p.string) + def test_entities_in_attributes_converted_to_unicode(self): - expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' + expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) self.assertSoupEquals('<p id="piñata"></p>', expect) def test_entities_in_text_converted_to_unicode(self): - expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' + expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' self.assertSoupEquals("<p>piñata</p>", expect) self.assertSoupEquals("<p>piñata</p>", expect) self.assertSoupEquals("<p>piñata</p>", expect) @@ -330,11 +528,11 @@ Hello, world! '<p>I said "good day!"</p>') def test_out_of_range_entity(self): - expect = u"\N{REPLACEMENT CHARACTER}" + expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) - + def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") @@ -408,9 +606,9 @@ Hello, world! # A seemingly innocuous document... but it's in Unicode! And # it contains characters that can't be represented in the # encoding found in the declaration! The horror! - markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' + markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + self.assertEqual('Sacr\xe9 bleu!', soup.body.string) def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" @@ -450,7 +648,7 @@ Hello, world! # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "<p><<sacré bleu!>></p>" - expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" + expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): @@ -460,15 +658,15 @@ Hello, world! soup = self.soup(quote) self.assertEqual( soup.p.string, - u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("<a> </a>") - self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" - expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") + expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) @@ -477,7 +675,7 @@ Hello, world! # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 
- unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' + unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. @@ -586,6 +784,13 @@ Hello, world! data.a['foo'] = 'bar' self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): @@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object): self.assertEqual( soup.encode("utf-8"), markup) + def test_nested_namespaces(self): + doc = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<parent xmlns="http://ns1/"> +<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> +<grandchild ns3:attr="value" xmlns="http://ns4/"/> +</child> +</parent>""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> @@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object): self.assertTrue(b"< < hey > >" in encoded) def test_can_parse_unicode_document(self): - markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + self.assertEqual('Sacr\xe9 bleu!', soup.root.string) def test_popping_namespaced_tag(self): markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' soup = self.soup(markup) self.assertEqual( - unicode(soup.rss), markup) + str(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("<root/>") @@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object): def test_closing_namespaced_tag(self): markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) + self.assertEqual(str(soup.p), markup) def test_namespaced_attributes(self): markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = '<foo xml:lang="fr">bar</foo>' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) def test_find_by_prefixed_name(self): doc = """<?xml version="1.0" encoding="utf-8"?> @@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object): # The two tags have the same namespace prefix. 
self.assertEqual(tag.prefix, duplicate.prefix) + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" diff --git a/libs/bs4/tests/test_html5lib.py b/libs/bs4/tests/test_html5lib.py index 0f89d6244..96529b0b3 100644 --- a/libs/bs4/tests/test_html5lib.py +++ b/libs/bs4/tests/test_html5lib.py @@ -5,7 +5,7 @@ import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True -except ImportError, e: +except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( @@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): @property def default_builder(self): - return HTML5TreeBuilder() + return HTML5TreeBuilder def test_soupstrainer(self): # The html5lib tree builder does not support SoupStrainers. @@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): def test_reparented_markup(self): markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' soup = self.soup(markup) - self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_ends_with_whitespace(self): markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' soup = self.soup(markup) - self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_containing_identical_whitespace_nodes(self): @@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): def test_foster_parenting(self): markup = b"""<table><td></tbody>A""" soup = self.soup(markup) - self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) + self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) + + def test_extraction(self): + """ + Test that extraction does not destroy the tree. + + https://bugs.launchpad.net/beautifulsoup/+bug/1782928 + """ + + markup = """ +<html><head></head> +<style> +</style><script></script><body><p>hello</p></body></html> +""" + soup = self.soup(markup) + [s.extract() for s in soup('script')] + [s.extract() for s in soup('style')] + + self.assertEqual(len(soup.find_all("p")), 1) + + def test_empty_comment(self): + """ + Test that empty comment does not break structure. 
+ + https://bugs.launchpad.net/beautifulsoup/+bug/1806598 + """ + + markup = """ +<html> +<body> +<form> +<!----><input type="text"> +</form> +</body> +</html> +""" + soup = self.soup(markup) + inputs = [] + for form in soup.find_all('form'): + inputs.extend(form.find_all('input')) + self.assertEqual(len(inputs), 1) diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py index d5cf0253f..790489aa1 100644 --- a/libs/bs4/tests/test_htmlparser.py +++ b/libs/bs4/tests/test_htmlparser.py @@ -5,12 +5,11 @@ from pdb import set_trace import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder +from bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - @property - def default_builder(self): - return HTMLParserTreeBuilder() + default_builder = HTMLParserTreeBuilder def test_namespaced_system_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. @@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_redundant_empty_element_closing_tags(self): self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") self.assertSoupEquals('</br></br></br>', "") + + def test_empty_element(self): + # This verifies that any buffered data present when the parser + # finishes working is handled. + self.assertSoupEquals("foo &# bar", "foo &# bar") + + +class TestHTMLParserSubclass(SoupTest): + def test_error(self): + """Verify that our HTMLParser subclass implements error() in a way + that doesn't cause a crash. + """ + parser = BeautifulSoupHTMLParser() + parser.error("don't crash") diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py index a05870b91..29da71149 100644 --- a/libs/bs4/tests/test_lxml.py +++ b/libs/bs4/tests/test_lxml.py @@ -7,7 +7,7 @@ try: import lxml.etree LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: +except ImportError as e: LXML_PRESENT = False LXML_VERSION = (0,) @@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): - return LXMLTreeBuilder() + return LXMLTreeBuilder def test_out_of_range_entity(self): self.assertSoupEquals( @@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") + def test_entities_in_foreign_document_encoding(self): + # We can't implement this case correctly because by the time we + # hear about markup like "“", it's been (incorrectly) converted into + # a string like u'\x93' + pass + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. @@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # if one is installed. 
         with warnings.catch_warnings(record=True) as w:
             soup = BeautifulStoneSoup("<b />")
-        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertEqual("<b/>", str(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
 @skipIf(
@@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
 
     @property
     def default_builder(self):
-        return LXMLTreeBuilderForXML()
+        return LXMLTreeBuilderForXML
+
+    def test_namespace_indexing(self):
+        # We should not track un-prefixed namespaces as we can only hold one
+        # and it will be recognized as the default namespace by soupsieve,
+        # which may be confusing in some situations. When no namespace is provided
+        # for a selector, the default namespace (if defined) is assumed.
+
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag>'
+            '</root>'
+        )
+        self.assertEqual(
+            soup._namespaces,
+            {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
+        )
diff --git a/libs/bs4/tests/test_soup.py b/libs/bs4/tests/test_soup.py
index f3e69edf3..1eda9484b 100644
--- a/libs/bs4/tests/test_soup.py
+++ b/libs/bs4/tests/test_soup.py
@@ -24,6 +24,7 @@ from bs4.dammit import (
     EncodingDetector,
 )
 from bs4.testing import (
+    default_builder,
     SoupTest,
     skipIf,
 )
@@ -32,7 +33,7 @@ import warnings
 try:
     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
     LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
 
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
 class TestConstructor(SoupTest):
 
     def test_short_unicode_input(self):
-        data = u"<h1>éé</h1>"
+        data = "<h1>éé</h1>"
         soup = self.soup(data)
-        self.assertEqual(u"éé", soup.h1.string)
+        self.assertEqual("éé", soup.h1.string)
 
     def test_embedded_null(self):
-        data = u"<h1>foo\0bar</h1>"
+        data = "<h1>foo\0bar</h1>"
         soup = self.soup(data)
-        self.assertEqual(u"foo\0bar", soup.h1.string)
+        self.assertEqual("foo\0bar", soup.h1.string)
 
     def test_exclude_encodings(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
         soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
         self.assertEqual("windows-1252", soup.original_encoding)
-
+
+    def test_custom_builder_class(self):
+        # Verify that you can pass in a custom Builder class and
+        # it'll be instantiated with the appropriate keyword arguments.
+        class Mock(object):
+            def __init__(self, **kwargs):
+                self.called_with = kwargs
+                self.is_xml = True
+            def initialize_soup(self, soup):
+                pass
+            def prepare_markup(self, *args, **kwargs):
+                return ''
+
+        kwargs = dict(
+            var="value",
+            # This is a deprecated BS3-era keyword argument, which
+            # will be stripped out.
+            convertEntities=True,
+        )
+        with warnings.catch_warnings(record=True):
+            soup = BeautifulSoup('', builder=Mock, **kwargs)
+        assert isinstance(soup.builder, Mock)
+        self.assertEqual(dict(var="value"), soup.builder.called_with)
+
+        # You can also instantiate the TreeBuilder yourself. In this
+        # case, that specific object is used and any keyword arguments
+        # to the BeautifulSoup constructor are ignored.
+        builder = Mock(**kwargs)
+        with warnings.catch_warnings(record=True) as w:
+            soup = BeautifulSoup(
+                '', builder=builder, ignored_value=True,
+            )
+        msg = str(w[0].message)
+        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
+        self.assertEqual(builder, soup.builder)
+        self.assertEqual(kwargs, builder.called_with)
+
+    def test_cdata_list_attributes(self):
+        # Most attribute values are represented as scalars, but the
+        # HTML standard says that some attributes, like 'class', have
+        # space-separated lists as values.
+        markup = '<a id=" an id " class=" a class "></a>'
+        soup = self.soup(markup)
+
+        # Note that the spaces are stripped for 'class' but not for 'id'.
+        a = soup.a
+        self.assertEqual(" an id ", a['id'])
+        self.assertEqual(["a", "class"], a['class'])
+
+        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
+        # you customize or disable this. As always, you can customize the TreeBuilder
+        # by passing in a keyword argument to the BeautifulSoup constructor.
+        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
+        self.assertEqual(" a class ", soup.a['class'])
+
+        # Here are two ways of saying that `id` is a multi-valued
+        # attribute in this context, but 'class' is not.
+        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
+            with warnings.catch_warnings(record=True) as w:
+                # This will create a warning about not explicitly
+                # specifying a parser, but we'll ignore it.
+                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
+            a = soup.a
+            self.assertEqual(["an", "id"], a['id'])
+            self.assertEqual(" a class ", a['class'])
+
+
 class TestWarnings(SoupTest):
 
     def _no_parser_specified(self, s, is_there=True):
@@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
         with warnings.catch_warnings(record=True) as warning_list:
             # note - this url must differ from the bytes one otherwise
             # python's warnings system swallows the second warning
-            soup = self.soup(u"http://www.crummyunicode.com/")
+            soup = self.soup("http://www.crummyunicode.com/")
         self.assertTrue(any("looks like a URL" in str(w.message)
                             for w in warning_list))
 
@@ -141,7 +207,7 @@ class TestWarnings(SoupTest):
 
     def test_url_warning_with_unicode_and_space(self):
         with warnings.catch_warnings(record=True) as warning_list:
-            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+            soup = self.soup("http://www.crummyuncode.com/ is great")
         self.assertFalse(any("looks like a URL" in str(w.message)
                              for w in warning_list))
 
@@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
     def test_simple_html_substitution(self):
         # Unicode characters corresponding to named HTML entities
         # are substituted, and no others.
- s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + s = "foo\u2200\N{SNOWMAN}\u00f5bar" self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") + "foo∀\N{SNOWMAN}õbar") def test_smart_quote_substitution(self): # MS smart quotes are a common source of frustration, so we @@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase): self.assertEqual( self.sub.substitute_xml_containing_entities("ÁT&T"), "ÁT&T") - + def test_quotes_not_html_substituted(self): """There's no need to do this except inside attribute values.""" text = 'Bob\'s "bar"' @@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest): def setUp(self): super(TestEncodingConversion, self).setUp() - self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' + self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' self.utf8_data = self.unicode_data.encode("utf-8") # Just so you know what it looks like. self.assertEqual( @@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest): ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) + self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: @@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest): # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None) def test_utf8_in_unicode_out(self): @@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest): # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!') def test_utf8_out(self): # The internal data structures can be encoded as UTF-8. @@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest): PYTHON_3_PRE_3_2, "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") def test_attribute_name_containing_unicode_characters(self): - markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' + markup = '<div><a \N{SNOWMAN}="snowman"></a></div>' self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): """Standalone tests of UnicodeDammit.""" def test_unicode_input(self): - markup = u"I'm already Unicode! \N{SNOWMAN}" + markup = "I'm already Unicode! \N{SNOWMAN}" dammit = UnicodeDammit(markup) self.assertEqual(dammit.unicode_markup, markup) @@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase): markup = b"<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) self.assertEqual( - dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") + dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>") def test_smart_quotes_to_xml_entities(self): markup = b"<foo>\x91\x92\x93\x94</foo>" @@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase): utf8 = b"Sacr\xc3\xa9 bleu! 
\xe2\x98\x83" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}') + self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" @@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase): self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) def test_ignore_inappropriate_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_ignore_invalid_codecs(self): - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') def test_exclude_encodings(self): # This is UTF-8. - utf8_data = u"Räksmörgås".encode("utf-8") + utf8_data = "Räksmörgås".encode("utf-8") # But if we exclude UTF-8 from consideration, the guess is # Windows-1252. @@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase): detected = EncodingDetector( b'<?xml version="1.0" encoding="UTF-\xdb" ?>') encodings = list(detected.encodings) - assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings def test_detect_html5_style_meta_tag(self): @@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase): bs4.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) + self.assertTrue("\ufffd" in dammit.unicode_markup) soup = BeautifulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) @@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) - self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) + self.assertEqual("<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding) def test_detwingle(self): # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") # Here's a Windows-1252 document. windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") # Through some unholy alchemy, they've been stuck together. 
doc = utf8 + windows_1252 + utf8 @@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase): fixed = UnicodeDammit.detwingle(doc) self.assertEqual( - u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) def test_detwingle_ignores_multibyte_characters(self): # Each of these characters has a UTF-8 representation ending @@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase): # Windows-1252. But our code knows to skip over multibyte # UTF-8 characters, so they'll survive the process unscathed. for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. ): input = tricky_unicode_char.encode("utf8") self.assertTrue(input.endswith(b'\x93')) diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py index c0e7c4080..3b4beeb8f 100644 --- a/libs/bs4/tests/test_tree.py +++ b/libs/bs4/tests/test_tree.py @@ -1,4 +1,3 @@ - # -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. @@ -26,6 +25,7 @@ from bs4.element import ( Comment, Declaration, Doctype, + Formatter, NavigableString, SoupStrainer, Tag, @@ -71,13 +71,13 @@ class TestFind(TreeTest): self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): - soup = self.soup(u'<h1>Räksmörgås</h1>') - self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås') + soup = self.soup('<h1>Räksmörgås</h1>') + self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås') def test_unicode_attribute_find(self): - soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>') + soup = self.soup('<h1 id="Räksmörgås">here it is</h1>') str(soup) - self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text) + self.assertEqual("here it is", soup.find(id='Räksmörgås').text) def test_find_everything(self): @@ -97,17 +97,17 @@ class TestFindAll(TreeTest): """You can search the tree for text nodes.""" soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") # Exact match. - self.assertEqual(soup.find_all(string="bar"), [u"bar"]) - self.assertEqual(soup.find_all(text="bar"), [u"bar"]) + self.assertEqual(soup.find_all(string="bar"), ["bar"]) + self.assertEqual(soup.find_all(text="bar"), ["bar"]) # Match any of a number of strings. self.assertEqual( - soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) + soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"]) # Match a regular expression. self.assertEqual(soup.find_all(text=re.compile('.*')), - [u"Foo", u"bar", u'\xbb']) + ["Foo", "bar", '\xbb']) # Match anything. 
        self.assertEqual(soup.find_all(text=True),
-                         [u"Foo", u"bar", u'\xbb'])
+                         ["Foo", "bar", '\xbb'])
 
     def test_find_all_limit(self):
         """You can limit the number of items returned by find_all."""
@@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
             ["Matching a.", "Matching b."])
 
     def test_find_all_by_utf8_attribute_value(self):
-        peace = u"םולש".encode("utf8")
-        data = u'<a title="םולש"></a>'.encode("utf8")
+        peace = "םולש".encode("utf8")
+        data = '<a title="םולש"></a>'.encode("utf8")
         soup = self.soup(data)
         self.assertEqual([soup.a], soup.find_all(title=peace))
         self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
         self.assertEqual([], soup.find_all(id=1, text="bar"))
 
 
+class TestSmooth(TreeTest):
+    """Test Tag.smooth."""
+
+    def test_smooth(self):
+        soup = self.soup("<div>a</div>")
+        div = soup.div
+        div.append("b")
+        div.append("c")
+        div.append(Comment("Comment 1"))
+        div.append(Comment("Comment 2"))
+        div.append("d")
+        builder = self.default_builder()
+        span = Tag(soup, builder, 'span')
+        span.append('1')
+        span.append('2')
+        div.append(span)
+
+        # At this point the tree has a bunch of adjacent
+        # NavigableStrings. This is normal, but it has no meaning in
+        # terms of HTML, so we may want to smooth things out for
+        # output.
+
+        # Since the <span> tag has two children, its .string is None.
+        self.assertEqual(None, div.span.string)
+
+        self.assertEqual(7, len(div.contents))
+        div.smooth()
+        self.assertEqual(5, len(div.contents))
+
+        # The three strings at the beginning of div.contents have been
+        # merged into one string.
+        #
+        self.assertEqual('abc', div.contents[0])
+
+        # The call is recursive -- the <span> tag was also smoothed.
+        self.assertEqual('12', div.span.string)
+
+        # The two comments have _not_ been merged, even though
+        # comments are strings. Merging comments would change the
+        # meaning of the HTML.
+        self.assertEqual('Comment 1', div.contents[1])
+        self.assertEqual('Comment 2', div.contents[2])
 
 class TestIndex(TreeTest):
@@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
 </html>'''
         # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
-        markup = re.compile("\n\s*").sub("", markup)
+        markup = re.compile(r"\n\s*").sub("", markup)
         self.tree = self.soup(markup)
 
 
@@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
     """Test the ability to create new tags."""
     def test_new_tag(self):
         soup = self.soup("")
-        new_tag = soup.new_tag("foo", bar="baz")
+        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
         self.assertTrue(isinstance(new_tag, Tag))
         self.assertEqual("foo", new_tag.name)
-        self.assertEqual(dict(bar="baz"), new_tag.attrs)
+        self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
         self.assertEqual(None, new_tag.parent)
-        
+
     def test_tag_inherits_self_closing_rules_from_builder(self):
         if XML_BUILDER_PRESENT:
             xml_soup = BeautifulSoup("", "lxml-xml")
@@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
         soup = self.soup(text)
         self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
 
+    def test_insert_beautifulsoup_object_inserts_children(self):
+        """Inserting one BeautifulSoup object into another actually inserts all
+        of its children -- you'll never combine BeautifulSoup objects.
+ """ + soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>") + + text = "<p>p2</p><p>p3</p>" + to_insert = self.soup(text) + soup.insert(1, to_insert) + + for i in soup.descendants: + assert not isinstance(i, BeautifulSoup) + + p1, p2, p3, p4 = list(soup.children) + self.assertEqual("And now, a word:", p1.string) + self.assertEqual("p2", p2.string) + self.assertEqual("p3", p3.string) + self.assertEqual("And we're back.", p4.string) + + def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('<p><a>one</a><b>three</b></p>') a = soup.a @@ -877,7 +939,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.a.contents[0].next_element, "bar") def test_insert_tag(self): - builder = self.default_builder + builder = self.default_builder() soup = self.soup( "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) magic_tag = Tag(soup, builder, 'magictag') @@ -912,6 +974,13 @@ class TestTreeModification(SoupTest): soup.a.append(soup.b) self.assertEqual(data, soup.decode()) + def test_extend(self): + data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>" + soup = self.soup(data) + l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b] + soup.a.extend(l) + self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode()) + def test_move_tag_to_beginning_of_parent(self): data = "<a><b></b><c></c><d></d></a>" soup = self.soup(data) @@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest): self.assertEqual( soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + # Can't insert an element before itself. + b = soup.b + self.assertRaises(ValueError, b.insert_before, b) + + # Can't insert before if an element has no parent. + b.extract() + self.assertRaises(ValueError, b.insert_before, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_before(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_before("BAZ", " ", "QUUX") + soup.a.insert_before("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>")) + + soup.a.insert_before(soup.b, "FOO") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX")) + def test_insert_after(self): soup = self.soup("<a>foo</a><b>bar</b>") soup.b.insert_after("BAZ") @@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest): self.assertEqual( soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + # Can't insert an element after itself. + b = soup.b + self.assertRaises(ValueError, b.insert_after, b) + + # Can't insert after if an element has no parent. 
+ b.extract() + self.assertRaises(ValueError, b.insert_after, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_after(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_after("BAZ", " ", "QUUX") + soup.a.insert_after("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX")) + soup.b.insert_after(soup.a, "FOO ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX")) + def test_insert_after_raises_exception_if_after_has_no_meaning(self): soup = self.soup("") tag = soup.new_tag("a") @@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest): <script>baz</script> </html>""") [soup.script.extract() for i in soup.find_all("script")] - self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body)) + self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body)) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): @@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest): tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', + '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', str(w[0].message)) def test_has_attr(self): @@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest): soup = BeautifulSoup(b'<p> </p>', 'html.parser') encoding = soup.original_encoding copy = soup.__copy__() - self.assertEqual(u"<p> </p>", unicode(copy)) + self.assertEqual("<p> </p>", str(copy)) self.assertEqual(encoding, copy.original_encoding) def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) def test_copy_navigablestring_is_not_attached_to_tree(self): - html = u"<b>Foo<a></a></b><b>Bar</b>" + html = "<b>Foo<a></a></b><b>Bar</b>" soup = self.soup(html) s1 = soup.find(string="Foo") s2 = copy.copy(s1) @@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest): self.assertEqual(None, s2.previous_element) def test_copy_navigablestring_subclass_has_same_type(self): - html = u"<b><!--Foo--></b>" + html = "<b><!--Foo--></b>" soup = self.soup(html) s1 = soup.string s2 = copy.copy(s1) @@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest): self.assertTrue(isinstance(s2, Comment)) def test_copy_entire_soup(self): - html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" soup = self.soup(html) soup_copy = copy.copy(soup) self.assertEqual(soup, soup_copy) def test_copy_tag_copies_contents(self): - html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" soup = self.soup(html) div = soup.div div_copy = copy.copy(div) # The two tags look the same, and evaluate to equal. - self.assertEqual(unicode(div), unicode(div_copy)) + self.assertEqual(str(div), str(div_copy)) self.assertEqual(div, div_copy) # But they're not the same object. 
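The tree-modification tests above exercise several behaviours that are new or changed in this version of Beautiful Soup: new_tag() accepting an attrs dict, Tag.extend(), multi-argument insert_before()/insert_after(), and Tag.smooth(). A minimal sketch of those calls outside the test harness, assuming beautifulsoup4 >= 4.8 is importable as bs4 (the expected strings follow the assertions in the tests above):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>foo</a><b>bar</b>", "html.parser")

    # insert_before()/insert_after() now take several nodes in one call.
    soup.b.insert_before("BAZ", " ", "QUUX")
    assert soup.decode() == "<a>foo</a>BAZ QUUX<b>bar</b>"

    # new_tag() accepts an attrs dict, useful for attribute names that
    # aren't valid Python keywords.
    tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
    assert tag.attrs == {"bar": "baz", "name": "a name"}

    # smooth() merges adjacent NavigableStrings left behind by editing.
    div = BeautifulSoup("<div>a</div>", "html.parser").div
    div.append("b")
    div.append("c")
    assert len(div.contents) == 3   # three adjacent string nodes
    div.smooth()
    assert div.contents[0] == "abc" # merged into one node
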
@@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest): class TestSubstitutions(SoupTest): def test_default_formatter_is_minimal(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( - u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) + "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_html(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( decoded, - self.document_for("<b><<Sacré bleu!>></b>")) + self.document_for("<br/><b><<Sacré bleu!>></b>")) + def test_formatter_html5(self): + markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html5") + self.assertEqual( + decoded, + self.document_for("<br><b><<Sacré bleu!>></b>")) + def test_formatter_minimal(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( - u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) + "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_null(self): - markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter=None) # Neither the angle brackets nor the e-with-acute are converted. # This is not valid HTML, but it's what the user wanted. self.assertEqual(decoded, - self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) + self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_custom(self): - markup = u"<b><foo></b><b>bar</b>" + markup = "<b><foo></b><b>bar</b><br/>" soup = self.soup(markup) decoded = soup.decode(formatter = lambda x: x.upper()) # Instead of normal entity conversion code, the custom # callable is called on every string. 
self.assertEqual( decoded, - self.document_for(u"<b><FOO></b><b>BAR</b>")) + self.document_for("<b><FOO></b><b>BAR</b><br/>")) def test_formatter_is_run_on_attribute_values(self): - markup = u'<a href="http://a.com?a=b&c=é">e</a>' + markup = '<a href="http://a.com?a=b&c=é">e</a>' soup = self.soup(markup) a = soup.a - expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>' + expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>' self.assertEqual(expect_minimal, a.decode()) self.assertEqual(expect_minimal, a.decode(formatter="minimal")) - expect_html = u'<a href="http://a.com?a=b&c=é">e</a>' + expect_html = '<a href="http://a.com?a=b&c=é">e</a>' self.assertEqual(expect_html, a.decode(formatter="html")) self.assertEqual(markup, a.decode(formatter=None)) - expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' + expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>' self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) def test_formatter_skips_script_tag_for_html_documents(self): @@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest): self.assertTrue(b"< < hey > >" in encoded) def test_prettify_leaves_preformatted_text_alone(self): - soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ") + soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>") # Everything outside the <pre> tag is reformatted, but everything # inside is left alone. self.assertEqual( - u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', + '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>', soup.div.prettify()) - def test_prettify_accepts_formatter(self): + def test_prettify_accepts_formatter_function(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) def test_prettify_outputs_unicode_by_default(self): soup = self.soup("<a></a>") - self.assertEqual(unicode, type(soup.prettify())) + self.assertEqual(str, type(soup.prettify())) def test_prettify_can_encode_data(self): soup = self.soup("<a></a>") self.assertEqual(bytes, type(soup.prettify("utf-8"))) def test_html_entity_substitution_off_by_default(self): - markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" + markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" soup = self.soup(markup) encoded = soup.b.encode("utf-8") self.assertEqual(encoded, markup.encode('utf-8')) @@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" def test_unicode_string_can_be_encoded(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertEqual(soup.b.string.encode("utf-8"), - u"\N{SNOWMAN}".encode("utf-8")) + "\N{SNOWMAN}".encode("utf-8")) def test_tag_containing_unicode_string_can_be_encoded(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertEqual( soup.b.encode("utf-8"), html.encode("utf-8")) def test_encoding_substitutes_unrecognized_characters_by_default(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") def test_encoding_can_be_made_strict(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertRaises( UnicodeEncodeError, soup.encode, "ascii", errors="strict") def test_decode_contents(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = 
self.soup(html) - self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) + self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents()) def test_encode_contents(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( + "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( encoding="utf8")) def test_deprecated_renderContents(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) + "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) def test_repr(self): - html = u"<b>\N{SNOWMAN}</b>" + html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) if PY3K: self.assertEqual(html, repr(soup)) else: self.assertEqual(b'<b>\\u2603</b>', repr(soup)) +class TestFormatter(SoupTest): + + def test_sort_attributes(self): + # Test the ability to override Formatter.attributes() to, + # e.g., disable the normal sorting of attributes. + class UnsortedFormatter(Formatter): + def attributes(self, tag): + self.called_with = tag + for k, v in sorted(tag.attrs.items()): + if k == 'ignore': + continue + yield k,v + + soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>') + formatter = UnsortedFormatter() + decoded = soup.decode(formatter=formatter) + + # attributes() was called on the <p> tag. It filtered out one + # attribute and sorted the other two. + self.assertEqual(formatter.called_with, soup.p) + self.assertEqual('<p aval="2" cval="1"></p>', decoded) + + class TestNavigableStringSubclasses(SoupTest): def test_cdata(self): @@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest): els = self.soup.select('title') self.assertEqual(len(els), 1) self.assertEqual(els[0].name, 'title') - self.assertEqual(els[0].contents, [u'The title']) + self.assertEqual(els[0].contents, ['The title']) def test_one_tag_many(self): els = self.soup.select('div') @@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest): self.assertEqual(len(self.soup.select('del')), 0) def test_invalid_tag(self): - self.assertRaises(ValueError, self.soup.select, 'tag%t') + self.assertRaises(SyntaxError, self.soup.select, 'tag%t') def test_select_dashed_tag_ids(self): self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) @@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest): self.assertEqual(dashed[0]['id'], 'dash2') def test_dashed_tag_text(self): - self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.') + self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') def test_select_dashed_matches_find_all(self): self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) @@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest): NotImplementedError, self.soup.select, "a:no-such-pseudoclass") self.assertRaises( - NotImplementedError, self.soup.select, "a:nth-of-type(a)") - + SyntaxError, self.soup.select, "a:nth-of-type(a)") def test_nth_of_type(self): # Try to select first paragraph els = self.soup.select('div#inner p:nth-of-type(1)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') + self.assertEqual(els[0].string, 'Some text') # Try to select third paragraph els = self.soup.select('div#inner p:nth-of-type(3)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Another') + self.assertEqual(els[0].string, 'Another') # Try to select (non-existent!) 
fourth paragraph els = self.soup.select('div#inner p:nth-of-type(4)') self.assertEqual(len(els), 0) - # Pass in an invalid value. - self.assertRaises( - ValueError, self.soup.select, 'div p:nth-of-type(0)') + # Zero will select no tags. + els = self.soup.select('div p:nth-of-type(0)') + self.assertEqual(len(els), 0) def test_nth_of_type_direct_descendant(self): els = self.soup.select('div#inner > p:nth-of-type(1)') self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') + self.assertEqual(els[0].string, 'Some text') def test_id_child_selector_nth_of_type(self): self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) @@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest): self.assertEqual([], self.soup.select('#inner ~ h2')) def test_dangling_combinator(self): - self.assertRaises(ValueError, self.soup.select, 'h1 >') + self.assertRaises(SyntaxError, self.soup.select, 'h1 >') def test_sibling_combinator_wont_select_same_tag_twice(self): self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) @@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest): self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) def test_invalid_multiple_select(self): - self.assertRaises(ValueError, self.soup.select, ',x, y') - self.assertRaises(ValueError, self.soup.select, 'x,,y') + self.assertRaises(SyntaxError, self.soup.select, ',x, y') + self.assertRaises(SyntaxError, self.soup.select, 'x,,y') def test_multiple_select_attrs(self): self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) @@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest): def test_multiple_select_nested(self): self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) - - + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' + soup = BeautifulSoup(markup, 'html.parser') + selected = soup.select(".c1, .c2") + self.assertEqual(3, len(selected)) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. + for element in soup.find_all(class_=['c1', 'c2']): + assert element in selected diff --git a/libs/concurrent/__init__.py b/libs/concurrent/__init__.py deleted file mode 100644 index b36383a61..000000000 --- a/libs/concurrent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pkgutil import extend_path - -__path__ = extend_path(__path__, __name__) diff --git a/libs/concurrent/futures/__init__.py b/libs/concurrent/futures/__init__.py deleted file mode 100644 index 428b14bdf..000000000 --- a/libs/concurrent/futures/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. 
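These hunks delete the Python 2 backport of concurrent.futures that was bundled under libs/; on Python 3 the same package, with the same public API (Future, Executor, wait, as_completed, ThreadPoolExecutor, ProcessPoolExecutor), ships in the standard library, so the vendored copy is dead weight. A minimal sketch of the equivalent stdlib usage:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def square(n):
        return n * n

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(square, n) for n in range(8)]
        # Results arrive in completion order, not submission order.
        for future in as_completed(futures):
            print(future.result())
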
- -"""Execute computations asynchronously using threads or processes.""" - -__author__ = 'Brian Quinlan ([email protected])' - -from concurrent.futures._base import (FIRST_COMPLETED, - FIRST_EXCEPTION, - ALL_COMPLETED, - CancelledError, - TimeoutError, - Future, - Executor, - wait, - as_completed) -from concurrent.futures.thread import ThreadPoolExecutor - -try: - from concurrent.futures.process import ProcessPoolExecutor -except ImportError: - # some platforms don't have multiprocessing - pass diff --git a/libs/concurrent/futures/_base.py b/libs/concurrent/futures/_base.py deleted file mode 100644 index 2936c46b1..000000000 --- a/libs/concurrent/futures/_base.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -import collections -import logging -import threading -import itertools -import time - -__author__ = 'Brian Quinlan ([email protected])' - -FIRST_COMPLETED = 'FIRST_COMPLETED' -FIRST_EXCEPTION = 'FIRST_EXCEPTION' -ALL_COMPLETED = 'ALL_COMPLETED' -_AS_COMPLETED = '_AS_COMPLETED' - -# Possible future states (for internal use by the futures package). -PENDING = 'PENDING' -RUNNING = 'RUNNING' -# The future was cancelled by the user... -CANCELLED = 'CANCELLED' -# ...and _Waiter.add_cancelled() was called by a worker. -CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' -FINISHED = 'FINISHED' - -_FUTURE_STATES = [ - PENDING, - RUNNING, - CANCELLED, - CANCELLED_AND_NOTIFIED, - FINISHED -] - -_STATE_TO_DESCRIPTION_MAP = { - PENDING: "pending", - RUNNING: "running", - CANCELLED: "cancelled", - CANCELLED_AND_NOTIFIED: "cancelled", - FINISHED: "finished" -} - -# Logger for internal use by the futures package. -LOGGER = logging.getLogger("concurrent.futures") - -class Error(Exception): - """Base class for all future-related exceptions.""" - pass - -class CancelledError(Error): - """The Future was cancelled.""" - pass - -class TimeoutError(Error): - """The operation exceeded the given deadline.""" - pass - -class _Waiter(object): - """Provides the event that wait() and as_completed() block on.""" - def __init__(self): - self.event = threading.Event() - self.finished_futures = [] - - def add_result(self, future): - self.finished_futures.append(future) - - def add_exception(self, future): - self.finished_futures.append(future) - - def add_cancelled(self, future): - self.finished_futures.append(future) - -class _AsCompletedWaiter(_Waiter): - """Used by as_completed().""" - - def __init__(self): - super(_AsCompletedWaiter, self).__init__() - self.lock = threading.Lock() - - def add_result(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_cancelled(future) - self.event.set() - -class _FirstCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_COMPLETED).""" - - def add_result(self, future): - super(_FirstCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - super(_FirstCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - super(_FirstCompletedWaiter, self).add_cancelled(future) - self.event.set() - -class _AllCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" - - def __init__(self, 
num_pending_calls, stop_on_exception): - self.num_pending_calls = num_pending_calls - self.stop_on_exception = stop_on_exception - self.lock = threading.Lock() - super(_AllCompletedWaiter, self).__init__() - - def _decrement_pending_calls(self): - with self.lock: - self.num_pending_calls -= 1 - if not self.num_pending_calls: - self.event.set() - - def add_result(self, future): - super(_AllCompletedWaiter, self).add_result(future) - self._decrement_pending_calls() - - def add_exception(self, future): - super(_AllCompletedWaiter, self).add_exception(future) - if self.stop_on_exception: - self.event.set() - else: - self._decrement_pending_calls() - - def add_cancelled(self, future): - super(_AllCompletedWaiter, self).add_cancelled(future) - self._decrement_pending_calls() - -class _AcquireFutures(object): - """A context manager that does an ordered acquire of Future conditions.""" - - def __init__(self, futures): - self.futures = sorted(futures, key=id) - - def __enter__(self): - for future in self.futures: - future._condition.acquire() - - def __exit__(self, *args): - for future in self.futures: - future._condition.release() - -def _create_and_install_waiters(fs, return_when): - if return_when == _AS_COMPLETED: - waiter = _AsCompletedWaiter() - elif return_when == FIRST_COMPLETED: - waiter = _FirstCompletedWaiter() - else: - pending_count = sum( - f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs) - - if return_when == FIRST_EXCEPTION: - waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True) - elif return_when == ALL_COMPLETED: - waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False) - else: - raise ValueError("Invalid return condition: %r" % return_when) - - for f in fs: - f._waiters.append(waiter) - - return waiter - -def as_completed(fs, timeout=None): - """An iterator over the given futures that yields each as it completes. - - Args: - fs: The sequence of Futures (possibly created by different Executors) to - iterate over. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - - Returns: - An iterator that yields the given Futures as they complete (finished or - cancelled). If any given Futures are duplicated, they will be returned - once. - - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - """ - if timeout is not None: - end_time = timeout + time.time() - - fs = set(fs) - with _AcquireFutures(fs): - finished = set( - f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - pending = fs - finished - waiter = _create_and_install_waiters(fs, _AS_COMPLETED) - - try: - for future in finished: - yield future - - while pending: - if timeout is None: - wait_timeout = None - else: - wait_timeout = end_time - time.time() - if wait_timeout < 0: - raise TimeoutError( - '%d (of %d) futures unfinished' % ( - len(pending), len(fs))) - - waiter.event.wait(wait_timeout) - - with waiter.lock: - finished = waiter.finished_futures - waiter.finished_futures = [] - waiter.event.clear() - - for future in finished: - yield future - pending.remove(future) - - finally: - for f in fs: - with f._condition: - f._waiters.remove(waiter) - -DoneAndNotDoneFutures = collections.namedtuple( - 'DoneAndNotDoneFutures', 'done not_done') -def wait(fs, timeout=None, return_when=ALL_COMPLETED): - """Wait for the futures in the given sequence to complete. - - Args: - fs: The sequence of Futures (possibly created by different Executors) to - wait upon. 
- timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - return_when: Indicates when this function should return. The options - are: - - FIRST_COMPLETED - Return when any future finishes or is - cancelled. - FIRST_EXCEPTION - Return when any future finishes by raising an - exception. If no future raises an exception - then it is equivalent to ALL_COMPLETED. - ALL_COMPLETED - Return when all futures finish or are cancelled. - - Returns: - A named 2-tuple of sets. The first set, named 'done', contains the - futures that completed (is finished or cancelled) before the wait - completed. The second set, named 'not_done', contains uncompleted - futures. - """ - with _AcquireFutures(fs): - done = set(f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - not_done = set(fs) - done - - if (return_when == FIRST_COMPLETED) and done: - return DoneAndNotDoneFutures(done, not_done) - elif (return_when == FIRST_EXCEPTION) and done: - if any(f for f in done - if not f.cancelled() and f.exception() is not None): - return DoneAndNotDoneFutures(done, not_done) - - if len(done) == len(fs): - return DoneAndNotDoneFutures(done, not_done) - - waiter = _create_and_install_waiters(fs, return_when) - - waiter.event.wait(timeout) - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - done.update(waiter.finished_futures) - return DoneAndNotDoneFutures(done, set(fs) - done) - -class Future(object): - """Represents the result of an asynchronous computation.""" - - def __init__(self): - """Initializes the future. Should not be called by clients.""" - self._condition = threading.Condition() - self._state = PENDING - self._result = None - self._exception = None - self._traceback = None - self._waiters = [] - self._done_callbacks = [] - - def _invoke_callbacks(self): - for callback in self._done_callbacks: - try: - callback(self) - except Exception: - LOGGER.exception('exception calling callback for %r', self) - - def __repr__(self): - with self._condition: - if self._state == FINISHED: - if self._exception: - return '<Future at %s state=%s raised %s>' % ( - hex(id(self)), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._exception.__class__.__name__) - else: - return '<Future at %s state=%s returned %s>' % ( - hex(id(self)), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._result.__class__.__name__) - return '<Future at %s state=%s>' % ( - hex(id(self)), - _STATE_TO_DESCRIPTION_MAP[self._state]) - - def cancel(self): - """Cancel the future if possible. - - Returns True if the future was cancelled, False otherwise. A future - cannot be cancelled if it is running or has already completed. 
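The wait() helper defined above returns a DoneAndNotDoneFutures named 2-tuple, and the stdlib version behaves identically. A short sketch using the stdlib package (since this backport is being removed); the sleep durations are arbitrary:

    import time
    from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED

    with ThreadPoolExecutor(max_workers=2) as pool:
        futures = [pool.submit(time.sleep, n) for n in (0.05, 1.0)]
        done, not_done = wait(futures, timeout=5, return_when=FIRST_COMPLETED)
        # Returns as soon as the fast sleep finishes, so this
        # typically prints "1 1".
        print(len(done), len(not_done))
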
- """ - with self._condition: - if self._state in [RUNNING, FINISHED]: - return False - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - return True - - self._state = CANCELLED - self._condition.notify_all() - - self._invoke_callbacks() - return True - - def cancelled(self): - """Return True if the future has cancelled.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED] - - def running(self): - """Return True if the future is currently executing.""" - with self._condition: - return self._state == RUNNING - - def done(self): - """Return True of the future was cancelled or finished executing.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED] - - def __get_result(self): - if self._exception: - raise type(self._exception), self._exception, self._traceback - else: - return self._result - - def add_done_callback(self, fn): - """Attaches a callable that will be called when the future finishes. - - Args: - fn: A callable that will be called with this future as its only - argument when the future completes or is cancelled. The callable - will always be called by a thread in the same process in which - it was added. If the future has already completed or been - cancelled then the callable will be called immediately. These - callables are called in the order that they were added. - """ - with self._condition: - if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]: - self._done_callbacks.append(fn) - return - fn(self) - - def result(self, timeout=None): - """Return the result of the call that the future represents. - - Args: - timeout: The number of seconds to wait for the result if the future - isn't done. If None, then there is no limit on the wait time. - - Returns: - The result of the call that the future represents. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the given - timeout. - Exception: If the call raised then that exception will be raised. - """ - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - else: - raise TimeoutError() - - def exception_info(self, timeout=None): - """Return a tuple of (exception, traceback) raised by the call that the - future represents. - - Args: - timeout: The number of seconds to wait for the exception if the - future isn't done. If None, then there is no limit on the wait - time. - - Returns: - The exception raised by the call that the future represents or None - if the call completed without raising. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the given - timeout. - """ - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception, self._traceback - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception, self._traceback - else: - raise TimeoutError() - - def exception(self, timeout=None): - """Return the exception raised by the call that the future represents. 
- - Args: - timeout: The number of seconds to wait for the exception if the - future isn't done. If None, then there is no limit on the wait - time. - - Returns: - The exception raised by the call that the future represents or None - if the call completed without raising. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the given - timeout. - """ - return self.exception_info(timeout)[0] - - # The following methods should only be used by Executors and in tests. - def set_running_or_notify_cancel(self): - """Mark the future as running or process any cancel notifications. - - Should only be used by Executor implementations and unit tests. - - If the future has been cancelled (cancel() was called and returned - True) then any threads waiting on the future completing (though calls - to as_completed() or wait()) are notified and False is returned. - - If the future was not cancelled then it is put in the running state - (future calls to running() will return True) and True is returned. - - This method should be called by Executor implementations before - executing the work associated with this future. If this method returns - False then the work should not be executed. - - Returns: - False if the Future was cancelled, True otherwise. - - Raises: - RuntimeError: if this method was already called or if set_result() - or set_exception() was called. - """ - with self._condition: - if self._state == CANCELLED: - self._state = CANCELLED_AND_NOTIFIED - for waiter in self._waiters: - waiter.add_cancelled(self) - # self._condition.notify_all() is not necessary because - # self.cancel() triggers a notification. - return False - elif self._state == PENDING: - self._state = RUNNING - return True - else: - LOGGER.critical('Future %s in unexpected state: %s', - id(self), - self._state) - raise RuntimeError('Future in unexpected state') - - def set_result(self, result): - """Sets the return value of work associated with the future. - - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._result = result - self._state = FINISHED - for waiter in self._waiters: - waiter.add_result(self) - self._condition.notify_all() - self._invoke_callbacks() - - def set_exception_info(self, exception, traceback): - """Sets the result of the future as being the given exception - and traceback. - - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._exception = exception - self._traceback = traceback - self._state = FINISHED - for waiter in self._waiters: - waiter.add_exception(self) - self._condition.notify_all() - self._invoke_callbacks() - - def set_exception(self, exception): - """Sets the result of the future as being the given exception. - - Should only be used by Executor implementations and unit tests. - """ - self.set_exception_info(exception, None) - -class Executor(object): - """This is an abstract base class for concrete asynchronous executors.""" - - def submit(self, fn, *args, **kwargs): - """Submits a callable to be executed with the given arguments. - - Schedules the callable to be executed as fn(*args, **kwargs) and returns - a Future instance representing the execution of the callable. - - Returns: - A Future representing the given call. - """ - raise NotImplementedError() - - def map(self, fn, *iterables, **kwargs): - """Returns a iterator equivalent to map(fn, iter). 
- - Args: - fn: A callable that will take as many arguments as there are - passed iterables. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - - Returns: - An iterator equivalent to: map(func, *iterables) but the calls may - be evaluated out-of-order. - - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - Exception: If fn(*args) raises for any values. - """ - timeout = kwargs.get('timeout') - if timeout is not None: - end_time = timeout + time.time() - - fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)] - - # Yield must be hidden in closure so that the futures are submitted - # before the first iterator value is required. - def result_iterator(): - try: - for future in fs: - if timeout is None: - yield future.result() - else: - yield future.result(end_time - time.time()) - finally: - for future in fs: - future.cancel() - return result_iterator() - - def shutdown(self, wait=True): - """Clean-up the resources associated with the Executor. - - It is safe to call this method several times. Otherwise, no other - methods can be called after this one. - - Args: - wait: If True then shutdown will not return until all running - futures have finished executing and the resources used by the - executor have been reclaimed. - """ - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.shutdown(wait=True) - return False diff --git a/libs/concurrent/futures/process.py b/libs/concurrent/futures/process.py deleted file mode 100644 index 72528410c..000000000 --- a/libs/concurrent/futures/process.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Implements ProcessPoolExecutor. - -The follow diagram and text describe the data-flow through the system: - -|======================= In-process =====================|== Out-of-process ==| - -+----------+ +----------+ +--------+ +-----------+ +---------+ -| | => | Work Ids | => | | => | Call Q | => | | -| | +----------+ | | +-----------+ | | -| | | ... | | | | ... | | | -| | | 6 | | | | 5, call() | | | -| | | 7 | | | | ... | | | -| Process | | ... | | Local | +-----------+ | Process | -| Pool | +----------+ | Worker | | #1..n | -| Executor | | Thread | | | -| | +----------- + | | +-----------+ | | -| | <=> | Work Items | <=> | | <= | Result Q | <= | | -| | +------------+ | | +-----------+ | | -| | | 6: call() | | | | ... | | | -| | | future | | | | 4, result | | | -| | | ... | | | | 3, except | | | -+----------+ +------------+ +--------+ +-----------+ +---------+ - -Executor.submit() called: -- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict -- adds the id of the _WorkItem to the "Work Ids" queue - -Local worker thread: -- reads work ids from the "Work Ids" queue and looks up the corresponding - WorkItem from the "Work Items" dict: if the work item has been cancelled then - it is simply removed from the dict, otherwise it is repackaged as a - _CallItem and put in the "Call Q". New _CallItems are put in the "Call Q" - until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because - calls placed in the "Call Q" can no longer be cancelled with Future.cancel(). 
-- reads _ResultItems from "Result Q", updates the future stored in the - "Work Items" dict and deletes the dict entry - -Process #1..n: -- reads _CallItems from "Call Q", executes the calls, and puts the resulting - _ResultItems in "Request Q" -""" - -import atexit -from concurrent.futures import _base -import Queue as queue -import multiprocessing -import threading -import weakref -import sys - -__author__ = 'Brian Quinlan ([email protected])' - -# Workers are created as daemon threads and processes. This is done to allow the -# interpreter to exit when there are still idle processes in a -# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However, -# allowing workers to die with the interpreter has two undesirable properties: -# - The workers would still be running during interpretor shutdown, -# meaning that they would fail in unpredictable ways. -# - The workers could be killed while evaluating a work item, which could -# be bad if the callable being evaluated has external side-effects e.g. -# writing to a file. -# -# To work around this problem, an exit handler is installed which tells the -# workers to exit when their work queues are empty and then waits until the -# threads/processes finish. - -_threads_queues = weakref.WeakKeyDictionary() -_shutdown = False - -def _python_exit(): - global _shutdown - _shutdown = True - items = list(_threads_queues.items()) if _threads_queues else () - for t, q in items: - q.put(None) - for t, q in items: - t.join(sys.maxint) - -# Controls how many more calls than processes will be queued in the call queue. -# A smaller number will mean that processes spend more time idle waiting for -# work while a larger number will make Future.cancel() succeed less frequently -# (Futures in the call queue cannot be cancelled). -EXTRA_QUEUED_CALLS = 1 - -class _WorkItem(object): - def __init__(self, future, fn, args, kwargs): - self.future = future - self.fn = fn - self.args = args - self.kwargs = kwargs - -class _ResultItem(object): - def __init__(self, work_id, exception=None, result=None): - self.work_id = work_id - self.exception = exception - self.result = result - -class _CallItem(object): - def __init__(self, work_id, fn, args, kwargs): - self.work_id = work_id - self.fn = fn - self.args = args - self.kwargs = kwargs - -def _process_worker(call_queue, result_queue): - """Evaluates calls from call_queue and places the results in result_queue. - - This worker is run in a separate process. - - Args: - call_queue: A multiprocessing.Queue of _CallItems that will be read and - evaluated by the worker. - result_queue: A multiprocessing.Queue of _ResultItems that will written - to by the worker. - shutdown: A multiprocessing.Event that will be set as a signal to the - worker that it should exit when call_queue is empty. - """ - while True: - call_item = call_queue.get(block=True) - if call_item is None: - # Wake up queue management thread - result_queue.put(None) - return - try: - r = call_item.fn(*call_item.args, **call_item.kwargs) - except BaseException: - e = sys.exc_info()[1] - result_queue.put(_ResultItem(call_item.work_id, - exception=e)) - else: - result_queue.put(_ResultItem(call_item.work_id, - result=r)) - -def _add_call_item_to_queue(pending_work_items, - work_ids, - call_queue): - """Fills call_queue with _WorkItems from pending_work_items. - - This function never blocks. - - Args: - pending_work_items: A dict mapping work ids to _WorkItems e.g. 
- {5: <_WorkItem...>, 6: <_WorkItem...>, ...} - work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids - are consumed and the corresponding _WorkItems from - pending_work_items are transformed into _CallItems and put in - call_queue. - call_queue: A multiprocessing.Queue that will be filled with _CallItems - derived from _WorkItems. - """ - while True: - if call_queue.full(): - return - try: - work_id = work_ids.get(block=False) - except queue.Empty: - return - else: - work_item = pending_work_items[work_id] - - if work_item.future.set_running_or_notify_cancel(): - call_queue.put(_CallItem(work_id, - work_item.fn, - work_item.args, - work_item.kwargs), - block=True) - else: - del pending_work_items[work_id] - continue - -def _queue_management_worker(executor_reference, - processes, - pending_work_items, - work_ids_queue, - call_queue, - result_queue): - """Manages the communication between this process and the worker processes. - - This function is run in a local thread. - - Args: - executor_reference: A weakref.ref to the ProcessPoolExecutor that owns - this thread. Used to determine if the ProcessPoolExecutor has been - garbage collected and that this function can exit. - process: A list of the multiprocessing.Process instances used as - workers. - pending_work_items: A dict mapping work ids to _WorkItems e.g. - {5: <_WorkItem...>, 6: <_WorkItem...>, ...} - work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]). - call_queue: A multiprocessing.Queue that will be filled with _CallItems - derived from _WorkItems for processing by the process workers. - result_queue: A multiprocessing.Queue of _ResultItems generated by the - process workers. - """ - nb_shutdown_processes = [0] - def shutdown_one_process(): - """Tell a worker to terminate, which will in turn wake us again""" - call_queue.put(None) - nb_shutdown_processes[0] += 1 - while True: - _add_call_item_to_queue(pending_work_items, - work_ids_queue, - call_queue) - - result_item = result_queue.get(block=True) - if result_item is not None: - work_item = pending_work_items[result_item.work_id] - del pending_work_items[result_item.work_id] - - if result_item.exception: - work_item.future.set_exception(result_item.exception) - else: - work_item.future.set_result(result_item.result) - # Delete references to object. See issue16284 - del work_item - # Check whether we should start shutting down. - executor = executor_reference() - # No more work items can be added if: - # - The interpreter is shutting down OR - # - The executor that owns this worker has been collected OR - # - The executor that owns this worker has been shutdown. - if _shutdown or executor is None or executor._shutdown_thread: - # Since no new work items can be added, it is safe to shutdown - # this thread if there are no pending work items. - if not pending_work_items: - while nb_shutdown_processes[0] < len(processes): - shutdown_one_process() - # If .join() is not called on the created processes then - # some multiprocessing.Queue methods may deadlock on Mac OS - # X. 
- for p in processes: - p.join() - call_queue.close() - return - del executor - -_system_limits_checked = False -_system_limited = None -def _check_system_limits(): - global _system_limits_checked, _system_limited - if _system_limits_checked: - if _system_limited: - raise NotImplementedError(_system_limited) - _system_limits_checked = True - try: - import os - nsems_max = os.sysconf("SC_SEM_NSEMS_MAX") - except (AttributeError, ValueError): - # sysconf not available or setting not available - return - if nsems_max == -1: - # indetermine limit, assume that limit is determined - # by available memory only - return - if nsems_max >= 256: - # minimum number of semaphores available - # according to POSIX - return - _system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max - raise NotImplementedError(_system_limited) - -class ProcessPoolExecutor(_base.Executor): - def __init__(self, max_workers=None): - """Initializes a new ProcessPoolExecutor instance. - - Args: - max_workers: The maximum number of processes that can be used to - execute the given calls. If None or not given then as many - worker processes will be created as the machine has processors. - """ - _check_system_limits() - - if max_workers is None: - self._max_workers = multiprocessing.cpu_count() - else: - self._max_workers = max_workers - - # Make the call queue slightly larger than the number of processes to - # prevent the worker processes from idling. But don't make it too big - # because futures in the call queue cannot be cancelled. - self._call_queue = multiprocessing.Queue(self._max_workers + - EXTRA_QUEUED_CALLS) - self._result_queue = multiprocessing.Queue() - self._work_ids = queue.Queue() - self._queue_management_thread = None - self._processes = set() - - # Shutdown is a two-step process. - self._shutdown_thread = False - self._shutdown_lock = threading.Lock() - self._queue_count = 0 - self._pending_work_items = {} - - def _start_queue_management_thread(self): - # When the executor gets lost, the weakref callback will wake up - # the queue management thread. 
- def weakref_cb(_, q=self._result_queue): - q.put(None) - if self._queue_management_thread is None: - self._queue_management_thread = threading.Thread( - target=_queue_management_worker, - args=(weakref.ref(self, weakref_cb), - self._processes, - self._pending_work_items, - self._work_ids, - self._call_queue, - self._result_queue)) - self._queue_management_thread.daemon = True - self._queue_management_thread.start() - _threads_queues[self._queue_management_thread] = self._result_queue - - def _adjust_process_count(self): - for _ in range(len(self._processes), self._max_workers): - p = multiprocessing.Process( - target=_process_worker, - args=(self._call_queue, - self._result_queue)) - p.start() - self._processes.add(p) - - def submit(self, fn, *args, **kwargs): - with self._shutdown_lock: - if self._shutdown_thread: - raise RuntimeError('cannot schedule new futures after shutdown') - - f = _base.Future() - w = _WorkItem(f, fn, args, kwargs) - - self._pending_work_items[self._queue_count] = w - self._work_ids.put(self._queue_count) - self._queue_count += 1 - # Wake up queue management thread - self._result_queue.put(None) - - self._start_queue_management_thread() - self._adjust_process_count() - return f - submit.__doc__ = _base.Executor.submit.__doc__ - - def shutdown(self, wait=True): - with self._shutdown_lock: - self._shutdown_thread = True - if self._queue_management_thread: - # Wake up queue management thread - self._result_queue.put(None) - if wait: - self._queue_management_thread.join(sys.maxint) - # To reduce the risk of openning too many files, remove references to - # objects that use file descriptors. - self._queue_management_thread = None - self._call_queue = None - self._result_queue = None - self._processes = None - shutdown.__doc__ = _base.Executor.shutdown.__doc__ - -atexit.register(_python_exit) diff --git a/libs/concurrent/futures/thread.py b/libs/concurrent/futures/thread.py deleted file mode 100644 index 85ab4b743..000000000 --- a/libs/concurrent/futures/thread.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Implements ThreadPoolExecutor.""" - -import atexit -from concurrent.futures import _base -import Queue as queue -import threading -import weakref -import sys - -__author__ = 'Brian Quinlan ([email protected])' - -# Workers are created as daemon threads. This is done to allow the interpreter -# to exit when there are still idle threads in a ThreadPoolExecutor's thread -# pool (i.e. shutdown() was not called). However, allowing workers to die with -# the interpreter has two undesirable properties: -# - The workers would still be running during interpretor shutdown, -# meaning that they would fail in unpredictable ways. -# - The workers could be killed while evaluating a work item, which could -# be bad if the callable being evaluated has external side-effects e.g. -# writing to a file. -# -# To work around this problem, an exit handler is installed which tells the -# workers to exit when their work queues are empty and then waits until the -# threads finish. 
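The comment block above describes the shutdown scheme shared by the deleted thread.py and process.py: workers run as daemon threads, and an atexit hook feeds each work queue a None sentinel, then joins the threads so they are not killed mid-task. A condensed sketch of that pattern (the names here are illustrative, not the module's):

    import atexit
    import queue
    import threading

    work_queue = queue.Queue()

    def worker():
        while True:
            item = work_queue.get()
            if item is None:      # sentinel: time to shut down
                return
            item()                # run one unit of work

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    def python_exit():
        work_queue.put(None)      # wake the worker so it can exit cleanly
        thread.join()

    atexit.register(python_exit)
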
- -_threads_queues = weakref.WeakKeyDictionary() -_shutdown = False - -def _python_exit(): - global _shutdown - _shutdown = True - items = list(_threads_queues.items()) if _threads_queues else () - for t, q in items: - q.put(None) - for t, q in items: - t.join(sys.maxint) - -atexit.register(_python_exit) - -class _WorkItem(object): - def __init__(self, future, fn, args, kwargs): - self.future = future - self.fn = fn - self.args = args - self.kwargs = kwargs - - def run(self): - if not self.future.set_running_or_notify_cancel(): - return - - try: - result = self.fn(*self.args, **self.kwargs) - except BaseException: - e, tb = sys.exc_info()[1:] - self.future.set_exception_info(e, tb) - else: - self.future.set_result(result) - -def _worker(executor_reference, work_queue): - try: - while True: - work_item = work_queue.get(block=True) - if work_item is not None: - work_item.run() - # Delete references to object. See issue16284 - del work_item - continue - executor = executor_reference() - # Exit if: - # - The interpreter is shutting down OR - # - The executor that owns the worker has been collected OR - # - The executor that owns the worker has been shutdown. - if _shutdown or executor is None or executor._shutdown: - # Notice other workers - work_queue.put(None) - return - del executor - except BaseException: - _base.LOGGER.critical('Exception in worker', exc_info=True) - -class ThreadPoolExecutor(_base.Executor): - def __init__(self, max_workers): - """Initializes a new ThreadPoolExecutor instance. - - Args: - max_workers: The maximum number of threads that can be used to - execute the given calls. - """ - self._max_workers = max_workers - self._work_queue = queue.Queue() - self._threads = set() - self._shutdown = False - self._shutdown_lock = threading.Lock() - - def submit(self, fn, *args, **kwargs): - with self._shutdown_lock: - if self._shutdown: - raise RuntimeError('cannot schedule new futures after shutdown') - - f = _base.Future() - w = _WorkItem(f, fn, args, kwargs) - - self._work_queue.put(w) - self._adjust_thread_count() - return f - submit.__doc__ = _base.Executor.submit.__doc__ - - def _adjust_thread_count(self): - # When the executor gets lost, the weakref callback will wake up - # the worker threads. - def weakref_cb(_, q=self._work_queue): - q.put(None) - # TODO(bquinlan): Should avoid creating new threads if there are more - # idle threads than items in the work queue. - if len(self._threads) < self._max_workers: - t = threading.Thread(target=_worker, - args=(weakref.ref(self, weakref_cb), - self._work_queue)) - t.daemon = True - t.start() - self._threads.add(t) - _threads_queues[t] = self._work_queue - - def shutdown(self, wait=True): - with self._shutdown_lock: - self._shutdown = True - self._work_queue.put(None) - if wait: - for t in self._threads: - t.join(sys.maxint) - shutdown.__doc__ = _base.Executor.shutdown.__doc__ diff --git a/libs/deathbycaptcha.py b/libs/deathbycaptcha.py index 3c2fafb77..8bc04ab93 100644 --- a/libs/deathbycaptcha.py +++ b/libs/deathbycaptcha.py @@ -1,73 +1,6 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- -"""Death by Captcha HTTP and socket API clients. - -There are two types of Death by Captcha (DBC hereinafter) API: HTTP and -socket ones. Both offer the same functionalily, with the socket API -sporting faster responses and using way less connections. - -To access the socket API, use SocketClient class; for the HTTP API, use -HttpClient class. Both are thread-safe. 
SocketClient keeps a persistent -connection opened and serializes all API requests sent through it, thus -it is advised to keep a pool of them if you're script is heavily -multithreaded. - -Both SocketClient and HttpClient give you the following methods: - -get_user() - Returns your DBC account details as a dict with the following keys: - - "user": your account numeric ID; if login fails, it will be the only - item with the value of 0; - "rate": your CAPTCHA rate, i.e. how much you will be charged for one - solved CAPTCHA in US cents; - "balance": your DBC account balance in US cents; - "is_banned": flag indicating whether your account is suspended or not. - -get_balance() - Returns your DBC account balance in US cents. - -get_captcha(cid) - Returns an uploaded CAPTCHA details as a dict with the following keys: - - "captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will - be the only item with the value of 0; - "text": the CAPTCHA text, if solved, otherwise None; - "is_correct": flag indicating whether the CAPTCHA was solved correctly - (DBC can detect that in rare cases). - - The only argument `cid` is the CAPTCHA numeric ID. - -get_text(cid) - Returns an uploaded CAPTCHA text (None if not solved). The only argument - `cid` is the CAPTCHA numeric ID. - -report(cid) - Reports an incorrectly solved CAPTCHA. The only argument `cid` is the - CAPTCHA numeric ID. Returns True on success, False otherwise. - -upload(captcha) - Uploads a CAPTCHA. The only argument `captcha` can be either file-like - object (any object with `read` method defined, actually, so StringIO - will do), or CAPTCHA image file name. On successul upload you'll get - the CAPTCHA details dict (see get_captcha() method). - - NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have - to poll for its status periodically using get_captcha() or get_text() - method until the CAPTCHA is solved and you get the text. - -decode(captcha, timeout=DEFAULT_TIMEOUT) - A convenient method that uploads a CAPTCHA and polls for its status - periodically, but no longer than `timeout` (defaults to 60 seconds). - If solved, you'll get the CAPTCHA details dict (see get_captcha() - method for details). See upload() method for details on `captcha` - argument. - -Visit http://www.deathbycaptcha.com/user/api for updates. 
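The module docstring being removed above remains the clearest summary of the client API (get_user, get_balance, upload, decode, report). A hypothetical usage sketch based on it; the credentials and file name are placeholders, and decode() performs real network calls against the service:

    import deathbycaptcha

    client = deathbycaptcha.HttpClient("username", "password")
    print(client.get_balance())              # balance in US cents

    captcha = client.decode("captcha.png")   # upload, then poll until solved
    if captcha:
        print(captcha["captcha"], captcha["text"])
        # If the returned text turns out to be wrong, report it:
        # client.report(captcha["captcha"])
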
- -""" - import base64 import binascii import errno @@ -79,8 +12,7 @@ import socket import sys import threading import time -import urllib -import urllib2 + try: from json import read as json_decode, write as json_encode except ImportError: @@ -89,64 +21,71 @@ except ImportError: except ImportError: from simplejson import loads as json_decode, dumps as json_encode +try: + from urllib2 import build_opener, HTTPRedirectHandler, Request, HTTPError + from urllib import urlencode, urlopen +except ImportError: + from urllib.request import build_opener, HTTPRedirectHandler, Request, urlopen + from urllib.error import HTTPError + from urllib.parse import urlencode # API version and unique software ID -API_VERSION = 'DBC/Python v4.6' +API_VERSION = 'DBC/Python v4.0.11' +SOFTWARE_VENDOR_ID = 0 # Default CAPTCHA timeout and decode() polling interval DEFAULT_TIMEOUT = 60 -DEFAULT_TOKEN_TIMEOUT = 120 -POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2] -DFLT_POLL_INTERVAL = 3 +POLLS_INTERVAL = 5 # Base HTTP API url -HTTP_BASE_URL = 'http://api.dbcapi.me/api' +HTTP_BASE_URL = 'http://api.deathbycaptcha.com/api' # Preferred HTTP API server's response content type, do not change HTTP_RESPONSE_TYPE = 'application/json' # Socket API server's host & ports range -SOCKET_HOST = 'api.dbcapi.me' +SOCKET_HOST = 'api.deathbycaptcha.com' SOCKET_PORTS = range(8123, 8131) - -def _load_image(captcha): - if hasattr(captcha, 'read'): - img = captcha.read() - elif type(captcha) == bytearray: - img = captcha - else: - img = '' - try: - captcha_file = open(captcha, 'rb') - except Exception: - raise - else: - img = captcha_file.read() - captcha_file.close() - if not len(img): - raise ValueError('CAPTCHA image is empty') - elif imghdr.what(None, img) is None: - raise TypeError('Unknown CAPTCHA image type') - else: - return img - - class AccessDeniedException(Exception): pass - class Client(object): - - """Death by Captcha API Client.""" + """Death by Captcha API Client""" def __init__(self, username, password): self.is_verbose = False - self.userpwd = {'username': username, 'password': password} + self.userpwd = {'username': username, + 'password': password} + + def _load_file(self, captcha): + if hasattr(captcha, 'read'): + raw_captcha = captcha.read() + elif isinstance(captcha, bytearray): + raw_captcha = captcha + elif os.path.isfile(captcha): + raw_captcha = '' + try: + f = open(captcha, 'rb') + except Exception as e: + raise e + else: + raw_captcha = f.read() + f.close() + else: + f_stream = urlopen(captcha) + raw_captcha = f_stream.read() + + if not len(raw_captcha): + raise ValueError('CAPTCHA image is empty') + elif imghdr.what(None, raw_captcha) is None: + raise TypeError('Unknown CAPTCHA image type') + else: + return raw_captcha def _log(self, cmd, msg=''): if self.is_verbose: - print '%d %s %s' % (time.time(), cmd, msg.rstrip()) + print('%d %s %s' % (time.time(), cmd, msg.rstrip())) return self def close(self): @@ -156,16 +95,16 @@ class Client(object): pass def get_user(self): - """Fetch user details -- ID, balance, rate and banned status.""" - raise NotImplementedError() + """Fetch the user's details dict -- balance, rate and banned status.""" + raise NotImplemented() def get_balance(self): - """Fetch user balance (in US cents).""" + """Fetch the user's balance (in US cents).""" return self.get_user().get('balance') def get_captcha(self, cid): - """Fetch a CAPTCHA details -- ID, text and correctness flag.""" - raise NotImplementedError() + """Fetch a CAPTCHA details dict -- its ID, text and correctness.""" + raise 
NotImplemented() def get_text(self, cid): """Fetch a CAPTCHA text.""" @@ -173,7 +112,11 @@ class Client(object): def report(self, cid): """Report a CAPTCHA as incorrectly solved.""" - raise NotImplementedError() + raise NotImplemented() + + def remove(self, cid): + """Remove an unsolved CAPTCHA.""" + raise NotImplemented() def upload(self, captcha): """Upload a CAPTCHA. @@ -182,56 +125,32 @@ class Client(object): dict on success. """ - raise NotImplementedError() + raise NotImplemented() - def decode(self, captcha=None, timeout=None, **kwargs): - """ - Try to solve a CAPTCHA. + def decode(self, captcha, timeout=DEFAULT_TIMEOUT): + """Try to solve a CAPTCHA. See Client.upload() for arguments details. Uploads a CAPTCHA, polls for its status periodically with arbitrary timeout (in seconds), returns CAPTCHA details if (correctly) solved. - """ - if not timeout: - if not captcha: - timeout = DEFAULT_TOKEN_TIMEOUT - else: - timeout = DEFAULT_TIMEOUT + """ deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT) - uploaded_captcha = self.upload(captcha, **kwargs) - if uploaded_captcha: - intvl_idx = 0 # POLL_INTERVAL index - while deadline > time.time() and not uploaded_captcha.get('text'): - intvl, intvl_idx = self._get_poll_interval(intvl_idx) - time.sleep(intvl) - pulled = self.get_captcha(uploaded_captcha['captcha']) - if pulled['captcha'] == uploaded_captcha['captcha']: - uploaded_captcha = pulled - if uploaded_captcha.get('text') and \ - uploaded_captcha.get('is_correct'): - return uploaded_captcha - - def _get_poll_interval(self, idx): - """Returns poll interval and next index depending on index provided""" - - if len(POLLS_INTERVAL) > idx: - intvl = POLLS_INTERVAL[idx] - else: - intvl = DFLT_POLL_INTERVAL - idx += 1 - - return intvl, idx - + c = self.upload(captcha) + if c: + while deadline > time.time() and not c.get('text'): + time.sleep(POLLS_INTERVAL) + c = self.get_captcha(c['captcha']) + if c.get('text') and c.get('is_correct'): + return c class HttpClient(Client): - """Death by Captcha HTTP API client.""" def __init__(self, *args): Client.__init__(self, *args) - self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler()) + self.opener = build_opener(HTTPRedirectHandler()) def _call(self, cmd, payload=None, headers=None): if headers is None: @@ -239,30 +158,22 @@ class HttpClient(Client): headers['Accept'] = HTTP_RESPONSE_TYPE headers['User-Agent'] = API_VERSION if hasattr(payload, 'items'): - payload = urllib.urlencode(payload) + payload = urlencode(payload) self._log('SEND', '%s %d %s' % (cmd, len(payload), payload)) - else: - self._log('SEND', '%s' % cmd) if payload is not None: headers['Content-Length'] = len(payload) try: - response = self.opener.open(urllib2.Request( + response = self.opener.open(Request( HTTP_BASE_URL + '/' + cmd.strip('/'), data=payload, headers=headers )).read() - except urllib2.HTTPError, err: - if 403 == err.code: - raise AccessDeniedException('Access denied, please check' - ' your credentials and/or balance') - elif 400 == err.code or 413 == err.code: - raise ValueError("CAPTCHA was rejected by the service, check" - " if it's a valid image") - elif 503 == err.code: - raise OverflowError("CAPTCHA was rejected due to service" - " overload, try again later") - else: - raise err + except HTTPError as e: + if 403 == e.code: + raise AccessDeniedException( + 'Access denied, please check your credentials and/or balance') + elif 400 == e.code or 413 == e.code: + raise ValueError("CAPTCHA was rejected by the service, check if it's a valid image") else: 
self._log('RECV', '%d %s' % (len(response), response)) try: @@ -281,53 +192,38 @@ class HttpClient(Client): return not self._call('captcha/%d/report' % cid, self.userpwd.copy()).get('is_correct') - def upload(self, captcha=None, **kwargs): - boundary = binascii.hexlify(os.urandom(16)) - banner = kwargs.get('banner', '') - if banner: - kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner)) - body = '\r\n'.join(('\r\n'.join(( - '--%s' % boundary, - 'Content-Disposition: form-data; name="%s"' % k, - 'Content-Type: text/plain', - 'Content-Length: %d' % len(str(v)), - '', - str(v) - ))) for k, v in self.userpwd.items()) - - body += '\r\n'.join(('\r\n'.join(( - '--%s' % boundary, - 'Content-Disposition: form-data; name="%s"' % k, - 'Content-Type: text/plain', - 'Content-Length: %d' % len(str(v)), - '', - str(v) - ))) for k, v in kwargs.items()) - - if captcha: - img = _load_image(captcha) - body += '\r\n'.join(( - '', - '--%s' % boundary, - 'Content-Disposition: form-data; name="captchafile"; ' - 'filename="captcha"', - 'Content-Type: application/octet-stream', - 'Content-Length: %d' % len(img), - '', - img, - '--%s--' % boundary, - '' - )) + def remove(self, cid): + return not self._call('captcha/%d/remove' % cid, + self.userpwd.copy()).get('captcha') + def upload(self, captcha): + boundary = binascii.hexlify(os.urandom(16)) + data = self.userpwd.copy() + data['swid'] = SOFTWARE_VENDOR_ID + body = '\r\n'.join(('\r\n'.join(('--%s' % boundary, + 'Content-Disposition: form-data; name="%s"' % k, + 'Content-Type: text/plain', + 'Content-Length: %d' % len(str(v)), + '', + str(v)))) + for k, v in data.items()) + captcha = self._load_file(captcha) + body += '\r\n'.join(('', + '--%s' % boundary, + 'Content-Disposition: form-data; name="captchafile"; filename="captcha"', + 'Content-Type: application/octet-stream', + 'Content-Length: %d' % len(captcha), + '', + captcha, + '--%s--' % boundary, + '')) response = self._call('captcha', body, { 'Content-Type': 'multipart/form-data; boundary="%s"' % boundary }) or {} if response.get('captcha'): return response - class SocketClient(Client): - """Death by Captcha socket API client.""" TERMINATOR = '\r\n' @@ -357,11 +253,12 @@ class SocketClient(Client): self.socket.settimeout(0) try: self.socket.connect(host) - except socket.error, err: - if (err.args[0] not in - (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): + except socket.error as e: + if errno.EINPROGRESS == e[0]: + pass + else: self.close() - raise err + raise e return self.socket def __del__(self): @@ -372,30 +269,27 @@ class SocketClient(Client): fds = [sock] buf += self.TERMINATOR response = '' - intvl_idx = 0 while True: - intvl, intvl_idx = self._get_poll_interval(intvl_idx) - rds, wrs, exs = select.select((not buf and fds) or [], - (buf and fds) or [], - fds, - intvl) - if exs: + rd, wr, ex = select.select((not buf and fds) or [], + (buf and fds) or [], + fds, + POLLS_INTERVAL) + if ex: raise IOError('select() failed') try: - if wrs: + if wr: while buf: - buf = buf[wrs[0].send(buf):] - elif rds: + buf = buf[wr[0].send(buf):] + elif rd: while True: - s = rds[0].recv(256) + s = rd[0].recv(256) if not s: raise IOError('recv(): connection lost') else: response += s - except socket.error, err: - if (err.args[0] not in - (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): - raise err + except socket.error as e: + if e[0] not in (errno.EAGAIN, errno.EINPROGRESS): + raise e if response.endswith(self.TERMINATOR): self._log('RECV', response) return response.rstrip(self.TERMINATOR) @@ 
-409,18 +303,16 @@ class SocketClient(Client): request = json_encode(data) response = None - for _ in range(2): - if not self.socket and cmd != 'login': - self._call('login', self.userpwd.copy()) + for i in range(2): self.socket_lock.acquire() try: sock = self.connect() response = self._sendrecv(sock, request) - except IOError, err: - sys.stderr.write(str(err) + "\n") + except IOError as e: + sys.stderr.write(str(e) + "\n") self.close() - except socket.error, err: - sys.stderr.write(str(err) + "\n") + except socket.error as e: + sys.stderr.write(str(e) + "\n") self.close() raise IOError('Connection refused') else: @@ -428,89 +320,84 @@ class SocketClient(Client): finally: self.socket_lock.release() - if response is None: - raise IOError('Connection lost or timed out during API request') - try: - response = json_decode(response) - except Exception: - raise RuntimeError('Invalid API response') - - if not response.get('error'): - return response - - error = response['error'] - if error in ('not-logged-in', 'invalid-credentials'): - raise AccessDeniedException('Access denied, check your credentials') - elif 'banned' == error: - raise AccessDeniedException('Access denied, account is suspended') - elif 'insufficient-funds' == error: - raise AccessDeniedException( - 'CAPTCHA was rejected due to low balance') - elif 'invalid-captcha' == error: - raise ValueError('CAPTCHA is not a valid image') - elif 'service-overload' == error: - raise OverflowError( - 'CAPTCHA was rejected due to service overload, try again later') - else: + if response is None: + raise IOError('Connection lost or timed out during API request') + try: + response = json_decode(response) + except Exception: + raise RuntimeError('Invalid API response') + if 'error' in response: + error = response['error'] + if 'not-logged-in' == error: + raise AccessDeniedException('Access denied, check your credentials') + elif 'banned' == error: + raise AccessDeniedException('Access denied, account is suspended') + elif 'insufficient-funds' == error: + raise AccessDeniedException('CAPTCHA was rejected due to low balance') + elif 'invalid-captcha' == error: + raise ValueError('CAPTCHA is not a valid image') + elif 'service-overload' == error: + raise ValueError( + 'CAPTCHA was rejected due to service overload, try again later') + else: + raise RuntimeError('API server error occurred: %s' % error) + except Exception as e: self.socket_lock.acquire() self.close() self.socket_lock.release() - raise RuntimeError('API server error occured: %s' % error) + raise e + else: + return response def get_user(self): - return self._call('user') or {'user': 0} + return self._call('user', self.userpwd.copy()) or {'user': 0} def get_captcha(self, cid): return self._call('captcha', {'captcha': cid}) or {'captcha': 0} - def upload(self, captcha=None, **kwargs): - data = {} - if captcha: - data['captcha'] = base64.b64encode(_load_image(captcha)) - if kwargs: - banner = kwargs.get('banner', '') - if banner: - kwargs['banner'] = base64.b64encode(_load_image(banner)) - data.update(kwargs) + def upload(self, captcha): + data = self.userpwd.copy() + data['captcha'] = base64.b64encode(self._load_file(captcha)) response = self._call('upload', data) if response.get('captcha'): - uploaded_captcha = dict( - (k, response.get(k)) - for k in ('captcha', 'text', 'is_correct') - ) - if not uploaded_captcha['text']: - uploaded_captcha['text'] = None - return uploaded_captcha + return dict((k, response.get(k)) for k in ('captcha', 'text', 'is_correct')) def report(self, cid): - return not 
self._call('report', {'captcha': cid}).get('is_correct') + data = self.userpwd.copy() + data['captcha'] = cid + return not self._call('report', data).get('is_correct') + def remove(self, cid): + data = self.userpwd.copy() + data['captcha'] = cid + return not self._call('remove', data).get('captcha') if '__main__' == __name__: + import sys + # Put your DBC username & password here: - # client = HttpClient(sys.argv[1], sys.argv[2]) + #client = HttpClient(sys.argv[1], sys.argv[2]) client = SocketClient(sys.argv[1], sys.argv[2]) client.is_verbose = True - print 'Your balance is %s US cents' % client.get_balance() + print('Your balance is %s US cents' % client.get_balance()) for fn in sys.argv[3:]: try: # Put your CAPTCHA image file name or file-like object, and optional # solving timeout (in seconds) here: captcha = client.decode(fn, DEFAULT_TIMEOUT) - except Exception, e: + except Exception as e: sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, )) captcha = None if captcha: - print 'CAPTCHA %d solved: %s' % \ - (captcha['captcha'], captcha['text']) + print('CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text'])) # Report as incorrectly solved if needed. Make sure the CAPTCHA was # in fact incorrectly solved! - # try: - # client.report(captcha['captcha']) - # except Exception, e: - # sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, )) + try: + client.report(captcha['captcha']) + except Exception as e: + sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, )) diff --git a/libs/decorator.py b/libs/decorator.py index 44303eedc..34fd527c9 100644 --- a/libs/decorator.py +++ b/libs/decorator.py @@ -40,7 +40,7 @@ import operator import itertools import collections -__version__ = '4.3.0' +__version__ = '4.4.0' if sys.version >= '3': from inspect import getfullargspec @@ -65,6 +65,12 @@ except AttributeError: # let's assume there are no coroutine functions in old Python def iscoroutinefunction(f): return False +try: + from inspect import isgeneratorfunction +except ImportError: + # assume no generator function in old Python versions + def isgeneratorfunction(caller): + return False DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(') @@ -173,7 +179,8 @@ class FunctionMaker(object): # Ensure each generated function has a unique filename for profilers # (such as cProfile) that depend on the tuple of (<filename>, # <definition line>, <function name>) being unique. - filename = '<decorator-gen-%d>' % (next(self._compile_count),) + filename = '<%s:decorator-gen-%d>' % ( + __file__, next(self._compile_count)) try: code = compile(src, filename, 'single') exec(code, evaldict) @@ -218,6 +225,8 @@ class FunctionMaker(object): def decorate(func, caller, extras=()): """ decorate(func, caller) decorates a function using a caller. + If the caller is a generator function, the resulting function + will be a generator function. 
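An illustrative sketch of the behaviour this new docstring describes (the tracing caller and counter are made up, not part of the patch): when the caller is itself a generator function, decorate() now emits a "for res in ...: yield res" wrapper, so the decorated function stays a generator function.

    from decorator import decorate

    def tracing_caller(func, *args, **kw):
        # a generator-function caller: the decorated function
        # will therefore be a generator function as well
        print('entering %s' % func.__name__)
        for res in func(*args, **kw):
            yield res
        print('leaving %s' % func.__name__)

    def count(n):
        for i in range(n):
            yield i

    count = decorate(count, tracing_caller)
    assert list(count(3)) == [0, 1, 2]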
""" evaldict = dict(_call_=caller, _func_=func) es = '' @@ -225,9 +234,23 @@ def decorate(func, caller, extras=()): ex = '_e%d_' % i evaldict[ex] = extra es += ex + ', ' - fun = FunctionMaker.create( - func, "return _call_(_func_, %s%%(shortsignature)s)" % es, - evaldict, __wrapped__=func) + + if '3.5' <= sys.version < '3.6': + # with Python 3.5 isgeneratorfunction returns True for all coroutines + # however we know that it is NOT possible to have a generator + # coroutine in python 3.5: PEP525 was not there yet + generatorcaller = isgeneratorfunction( + caller) and not iscoroutinefunction(caller) + else: + generatorcaller = isgeneratorfunction(caller) + if generatorcaller: + fun = FunctionMaker.create( + func, "for res in _call_(_func_, %s%%(shortsignature)s):\n" + " yield res" % es, evaldict, __wrapped__=func) + else: + fun = FunctionMaker.create( + func, "return _call_(_func_, %s%%(shortsignature)s)" % es, + evaldict, __wrapped__=func) if hasattr(func, '__qualname__'): fun.__qualname__ = func.__qualname__ return fun @@ -261,12 +284,12 @@ def decorator(caller, _func=None): doc = caller.__call__.__doc__ evaldict = dict(_call=caller, _decorate_=decorate) dec = FunctionMaker.create( - '%s(%s func)' % (name, defaultargs), + '%s(func, %s)' % (name, defaultargs), 'if func is None: return lambda func: _decorate_(func, _call, (%s))\n' 'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs), evaldict, doc=doc, module=caller.__module__, __wrapped__=caller) if defaults: - dec.__defaults__ = defaults + (None,) + dec.__defaults__ = (None,) + defaults return dec diff --git a/libs/dogpile/__init__.py b/libs/dogpile/__init__.py index 99c384364..fc8fd4524 100644 --- a/libs/dogpile/__init__.py +++ b/libs/dogpile/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.6.5' +__version__ = '0.7.1' from .lock import Lock # noqa from .lock import NeedRegenerationException # noqa diff --git a/libs/dogpile/cache/region.py b/libs/dogpile/cache/region.py index 1896cbd81..261a8db48 100644 --- a/libs/dogpile/cache/region.py +++ b/libs/dogpile/cache/region.py @@ -10,8 +10,9 @@ from ..util import compat import time import datetime from numbers import Number -from functools import wraps +from functools import wraps, partial import threading +from decorator import decorate _backend_loader = PluginLoader("dogpile.cache") register_backend = _backend_loader.register @@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy): class CacheRegion(object): - """A front end to a particular cache backend. + r"""A front end to a particular cache backend. :param name: Optional, a string name for the region. This isn't used internally @@ -484,6 +485,26 @@ class CacheRegion(object): else: return self._LockWrapper() + # cached value + _actual_backend = None + + @property + def actual_backend(self): + """Return the ultimate backend underneath any proxies. + + The backend might be the result of one or more ``proxy.wrap`` + applications. If so, derive the actual underlying backend. + + .. versionadded:: 0.6.6 + + """ + if self._actual_backend is None: + _backend = self.backend + while hasattr(_backend, 'proxied'): + _backend = _backend.proxied + self._actual_backend = _backend + return self._actual_backend + def invalidate(self, hard=True): """Invalidate this :class:`.CacheRegion`. 
@@ -723,7 +744,8 @@ class CacheRegion(object): ] def get_or_create( - self, key, creator, expiration_time=None, should_cache_fn=None): + self, key, creator, expiration_time=None, should_cache_fn=None, + creator_args=None): """Return a cached value based on the given key. If the value does not exist or is considered to be expired @@ -759,6 +781,11 @@ class CacheRegion(object): :param creator: function which creates a new value. + :param creator_args: optional tuple of (args, kwargs) that will be + passed to the creator function if present. + + .. versionadded:: 0.7.0 + :param expiration_time: optional expiration time which will overide the expiration time already configured on this :class:`.CacheRegion` if not None. To set no expiration, use the value -1. @@ -799,7 +826,7 @@ class CacheRegion(object): value = self.backend.get(key) if (value is NO_VALUE or value.metadata['v'] != value_version or self.region_invalidator.is_hard_invalidated( - value.metadata["ct"])): + value.metadata["ct"])): raise NeedRegenerationException() ct = value.metadata["ct"] if self.region_invalidator.is_soft_invalidated(ct): @@ -808,7 +835,10 @@ class CacheRegion(object): return value.payload, ct def gen_value(): - created_value = creator() + if creator_args: + created_value = creator(*creator_args[0], **creator_args[1]) + else: + created_value = creator() value = self._value(created_value) if not should_cache_fn or \ @@ -831,8 +861,13 @@ class CacheRegion(object): if self.async_creation_runner: def async_creator(mutex): - return self.async_creation_runner( - self, orig_key, creator, mutex) + if creator_args: + @wraps(creator) + def go(): + return creator(*creator_args[0], **creator_args[1]) + else: + go = creator + return self.async_creation_runner(self, orig_key, go, mutex) else: async_creator = None @@ -896,7 +931,7 @@ class CacheRegion(object): if (value is NO_VALUE or value.metadata['v'] != value_version or self.region_invalidator.is_hard_invalidated( - value.metadata['v'])): + value.metadata['ct'])): # dogpile.core understands a 0 here as # "the value is not available", e.g. # _has_value() will return False. 
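Sketch of the creator_args parameter added in the hunk above (the region, key and creator are made-up names): a (args, kwargs) tuple is forwarded to the creator, for both synchronous and async regeneration, instead of requiring a closure:

    from dogpile.cache import make_region

    region = make_region().configure('dogpile.cache.memory')

    def load_user(user_id, with_profile=False):
        return {'id': user_id, 'profile': with_profile}

    # creator_args is an (args, kwargs) tuple passed through to load_user
    user = region.get_or_create(
        'user:42', load_user,
        creator_args=((42,), {'with_profile': True}),
    )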
@@ -1228,26 +1263,31 @@ class CacheRegion(object): if function_key_generator is None: function_key_generator = self.function_key_generator - def decorator(fn): + def get_or_create_for_user_func(key_generator, user_func, *arg, **kw): + key = key_generator(*arg, **kw) + + timeout = expiration_time() if expiration_time_is_callable \ + else expiration_time + return self.get_or_create(key, user_func, timeout, + should_cache_fn, (arg, kw)) + + def cache_decorator(user_func): if to_str is compat.string_type: # backwards compatible - key_generator = function_key_generator(namespace, fn) + key_generator = function_key_generator(namespace, user_func) else: key_generator = function_key_generator( - namespace, fn, + namespace, user_func, to_str=to_str) - @wraps(fn) - def decorate(*arg, **kw): + def refresh(*arg, **kw): + """ + Like invalidate, but regenerates the value instead + """ key = key_generator(*arg, **kw) - - @wraps(fn) - def creator(): - return fn(*arg, **kw) - timeout = expiration_time() if expiration_time_is_callable \ - else expiration_time - return self.get_or_create(key, creator, timeout, - should_cache_fn) + value = user_func(*arg, **kw) + self.set(key, value) + return value def invalidate(*arg, **kw): key = key_generator(*arg, **kw) @@ -1261,20 +1301,18 @@ class CacheRegion(object): key = key_generator(*arg, **kw) return self.get(key) - def refresh(*arg, **kw): - key = key_generator(*arg, **kw) - value = fn(*arg, **kw) - self.set(key, value) - return value + user_func.set = set_ + user_func.invalidate = invalidate + user_func.get = get + user_func.refresh = refresh + user_func.original = user_func - decorate.set = set_ - decorate.invalidate = invalidate - decorate.refresh = refresh - decorate.get = get - decorate.original = fn + # Use `decorate` to preserve the signature of :param:`user_func`. 
- return decorate - return decorator + return decorate(user_func, partial( + get_or_create_for_user_func, key_generator)) + + return cache_decorator def cache_multi_on_arguments( self, namespace=None, expiration_time=None, @@ -1402,50 +1440,49 @@ class CacheRegion(object): if function_multi_key_generator is None: function_multi_key_generator = self.function_multi_key_generator - def decorator(fn): - key_generator = function_multi_key_generator( - namespace, fn, - to_str=to_str) - - @wraps(fn) - def decorate(*arg, **kw): - cache_keys = arg - keys = key_generator(*arg, **kw) - key_lookup = dict(zip(keys, cache_keys)) - - @wraps(fn) - def creator(*keys_to_create): - return fn(*[key_lookup[k] for k in keys_to_create]) - - timeout = expiration_time() if expiration_time_is_callable \ - else expiration_time + def get_or_create_for_user_func(key_generator, user_func, *arg, **kw): + cache_keys = arg + keys = key_generator(*arg, **kw) + key_lookup = dict(zip(keys, cache_keys)) + + @wraps(user_func) + def creator(*keys_to_create): + return user_func(*[key_lookup[k] for k in keys_to_create]) + + timeout = expiration_time() if expiration_time_is_callable \ + else expiration_time + + if asdict: + def dict_create(*keys): + d_values = creator(*keys) + return [ + d_values.get(key_lookup[k], NO_VALUE) + for k in keys] + + def wrap_cache_fn(value): + if value is NO_VALUE: + return False + elif not should_cache_fn: + return True + else: + return should_cache_fn(value) + + result = self.get_or_create_multi( + keys, dict_create, timeout, wrap_cache_fn) + result = dict( + (k, v) for k, v in zip(cache_keys, result) + if v is not NO_VALUE) + else: + result = self.get_or_create_multi( + keys, creator, timeout, + should_cache_fn) - if asdict: - def dict_create(*keys): - d_values = creator(*keys) - return [ - d_values.get(key_lookup[k], NO_VALUE) - for k in keys] - - def wrap_cache_fn(value): - if value is NO_VALUE: - return False - elif not should_cache_fn: - return True - else: - return should_cache_fn(value) - - result = self.get_or_create_multi( - keys, dict_create, timeout, wrap_cache_fn) - result = dict( - (k, v) for k, v in zip(cache_keys, result) - if v is not NO_VALUE) - else: - result = self.get_or_create_multi( - keys, creator, timeout, - should_cache_fn) + return result - return result + def cache_decorator(user_func): + key_generator = function_multi_key_generator( + namespace, user_func, + to_str=to_str) def invalidate(*arg): keys = key_generator(*arg) @@ -1466,7 +1503,7 @@ class CacheRegion(object): def refresh(*arg): keys = key_generator(*arg) - values = fn(*arg) + values = user_func(*arg) if asdict: self.set_multi( dict(zip(keys, [values[a] for a in arg])) @@ -1478,13 +1515,18 @@ class CacheRegion(object): ) return values - decorate.set = set_ - decorate.invalidate = invalidate - decorate.refresh = refresh - decorate.get = get + user_func.set = set_ + user_func.invalidate = invalidate + user_func.refresh = refresh + user_func.get = get + + # Use `decorate` to preserve the signature of :param:`user_func`. 
+ + return decorate(user_func, partial(get_or_create_for_user_func, key_generator)) + + return cache_decorator + - return decorate - return decorator def make_region(*arg, **kw): diff --git a/libs/dogpile/cache/util.py b/libs/dogpile/cache/util.py index 4b56f6429..16bcd1c97 100644 --- a/libs/dogpile/cache/util.py +++ b/libs/dogpile/cache/util.py @@ -1,5 +1,4 @@ from hashlib import sha1 -import inspect from ..util import compat from ..util import langhelpers @@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - args = inspect.getargspec(fn) + args = compat.inspect_getargspec(fn) has_self = args[0] and args[0][0] in ('self', 'cls') def generate_key(*args, **kw): @@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - args = inspect.getargspec(fn) + args = compat.inspect_getargspec(fn) has_self = args[0] and args[0][0] in ('self', 'cls') def generate_keys(*args, **kw): @@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - argspec = inspect.getargspec(fn) + argspec = compat.inspect_getargspec(fn) default_list = list(argspec.defaults or []) # Reverse the list, as we want to compare the argspec by negative index, # meaning default_list[0] should be args[-1], which works well with diff --git a/libs/dogpile/lock.py b/libs/dogpile/lock.py index 29f342d84..2ac22dcfe 100644 --- a/libs/dogpile/lock.py +++ b/libs/dogpile/lock.py @@ -69,11 +69,10 @@ class Lock(object): """Return true if the expiration time is reached, or no value is available.""" - return not self._has_value(createdtime) or \ - ( - self.expiretime is not None and - time.time() - createdtime > self.expiretime - ) + return not self._has_value(createdtime) or ( + self.expiretime is not None and + time.time() - createdtime > self.expiretime + ) def _has_value(self, createdtime): """Return true if the creation function has proceeded @@ -91,68 +90,100 @@ class Lock(object): value = NOT_REGENERATED createdtime = -1 - generated = self._enter_create(createdtime) + generated = self._enter_create(value, createdtime) if generated is not NOT_REGENERATED: generated, createdtime = generated return generated elif value is NOT_REGENERATED: + # we called upon the creator, and it said that it + # didn't regenerate. this typically means another + # thread is running the creation function, and that the + # cache should still have a value. However, + # we don't have a value at all, which is unusual since we just + # checked for it, so check again (TODO: is this a real codepath?) 
try: value, createdtime = value_fn() return value except NeedRegenerationException: - raise Exception("Generation function should " - "have just been called by a concurrent " - "thread.") + raise Exception( + "Generation function should " + "have just been called by a concurrent " + "thread.") else: return value - def _enter_create(self, createdtime): - + def _enter_create(self, value, createdtime): if not self._is_expired(createdtime): return NOT_REGENERATED - async = False + _async = False if self._has_value(createdtime): + has_value = True if not self.mutex.acquire(False): - log.debug("creation function in progress " - "elsewhere, returning") + log.debug( + "creation function in progress " + "elsewhere, returning") return NOT_REGENERATED else: + has_value = False log.debug("no value, waiting for create lock") self.mutex.acquire() try: log.debug("value creation lock %r acquired" % self.mutex) - # see if someone created the value already - try: - value, createdtime = self.value_and_created_fn() - except NeedRegenerationException: - pass - else: - if not self._is_expired(createdtime): - log.debug("value already present") - return value, createdtime - elif self.async_creator: - log.debug("Passing creation lock to async runner") - self.async_creator(self.mutex) - async = True - return value, createdtime - - log.debug("Calling creation function") - created = self.creator() - return created + if not has_value: + # we entered without a value, or at least with "creationtime == + # 0". Run the "getter" function again, to see if another + # thread has already generated the value while we waited on the + # mutex, or if the caller is otherwise telling us there is a + # value already which allows us to use async regeneration. (the + # latter is used by the multi-key routine). + try: + value, createdtime = self.value_and_created_fn() + except NeedRegenerationException: + # nope, nobody created the value, we're it. + # we must create it right now + pass + else: + has_value = True + # caller is telling us there is a value and that we can + # use async creation if it is expired. + if not self._is_expired(createdtime): + # it's not expired, return it + log.debug("Concurrent thread created the value") + return value, createdtime + + # otherwise it's expired, call creator again + + if has_value and self.async_creator: + # we have a value we can return, safe to use async_creator + log.debug("Passing creation lock to async runner") + + # so...run it! 
+ self.async_creator(self.mutex) + _async = True + + # and return the expired value for now + return value, createdtime + + # it's expired, and it's our turn to create it synchronously, *or*, + # there's no value at all, and we have to create it synchronously + log.debug( + "Calling creation function for %s value", + "not-yet-present" if not has_value else + "previously expired" + ) + return self.creator() finally: - if not async: + if not _async: self.mutex.release() log.debug("Released creation lock") - def __enter__(self): return self._enter() def __exit__(self, type, value, traceback): pass - diff --git a/libs/dogpile/util/compat.py b/libs/dogpile/util/compat.py index d29bb1dac..198c76276 100644 --- a/libs/dogpile/util/compat.py +++ b/libs/dogpile/util/compat.py @@ -51,11 +51,33 @@ else: import thread # noqa +if py3k: + import collections + ArgSpec = collections.namedtuple( + "ArgSpec", + ["args", "varargs", "keywords", "defaults"]) + + from inspect import getfullargspec as inspect_getfullargspec + + def inspect_getargspec(func): + return ArgSpec( + *inspect_getfullargspec(func)[0:4] + ) +else: + from inspect import getargspec as inspect_getargspec # noqa + if py3k or jython: import pickle else: import cPickle as pickle # noqa +if py3k: + def read_config_file(config, fileobj): + return config.read_file(fileobj) +else: + def read_config_file(config, fileobj): + return config.readfp(fileobj) + def timedelta_total_seconds(td): if py27: diff --git a/libs/dogpile/util/nameregistry.py b/libs/dogpile/util/nameregistry.py index a5102b238..7087f7cd6 100644 --- a/libs/dogpile/util/nameregistry.py +++ b/libs/dogpile/util/nameregistry.py @@ -50,7 +50,7 @@ class NameRegistry(object): self.creator = creator def get(self, identifier, *args, **kw): - """Get and possibly create the value. + r"""Get and possibly create the value. :param identifier: Hash key for the value. 
If the creation function is called, this identifier @@ -75,10 +75,12 @@ class NameRegistry(object): if identifier in self._values: return self._values[identifier] else: - self._values[identifier] = value = self.creator(identifier, *args, **kw) + self._values[identifier] = value = self.creator( + identifier, *args, **kw) return value except KeyError: - self._values[identifier] = value = self.creator(identifier, *args, **kw) + self._values[identifier] = value = self.creator( + identifier, *args, **kw) return value finally: self._mutex.release() diff --git a/libs/dogpile/util/readwrite_lock.py b/libs/dogpile/util/readwrite_lock.py index 2196ed7d7..9b953edb8 100644 --- a/libs/dogpile/util/readwrite_lock.py +++ b/libs/dogpile/util/readwrite_lock.py @@ -23,7 +23,7 @@ class ReadWriteMutex(object): def __init__(self): # counts how many asynchronous methods are executing - self.async = 0 + self.async_ = 0 # pointer to thread that is the current sync operation self.current_sync_operation = None @@ -31,7 +31,7 @@ class ReadWriteMutex(object): # condition object to lock on self.condition = threading.Condition(threading.Lock()) - def acquire_read_lock(self, wait = True): + def acquire_read_lock(self, wait=True): """Acquire the 'read' lock.""" self.condition.acquire() try: @@ -45,7 +45,7 @@ class ReadWriteMutex(object): if self.current_sync_operation is not None: return False - self.async += 1 + self.async_ += 1 log.debug("%s acquired read lock", self) finally: self.condition.release() @@ -57,23 +57,23 @@ class ReadWriteMutex(object): """Release the 'read' lock.""" self.condition.acquire() try: - self.async -= 1 + self.async_ -= 1 # check if we are the last asynchronous reader thread # out the door. - if self.async == 0: + if self.async_ == 0: # yes. so if a sync operation is waiting, notifyAll to wake # it up if self.current_sync_operation is not None: self.condition.notifyAll() - elif self.async < 0: + elif self.async_ < 0: raise LockError("Synchronizer error - too many " "release_read_locks called") log.debug("%s released read lock", self) finally: self.condition.release() - def acquire_write_lock(self, wait = True): + def acquire_write_lock(self, wait=True): """Acquire the 'write' lock.""" self.condition.acquire() try: @@ -96,7 +96,7 @@ class ReadWriteMutex(object): self.current_sync_operation = threading.currentThread() # now wait again for asyncs to finish - if self.async > 0: + if self.async_ > 0: if wait: # wait self.condition.wait() diff --git a/libs/enum/LICENSE b/libs/enum2.7/LICENSE index 9003b8850..9003b8850 100644 --- a/libs/enum/LICENSE +++ b/libs/enum2.7/LICENSE diff --git a/libs/enum/README b/libs/enum2.7/README index aa2333d8d..aa2333d8d 100644 --- a/libs/enum/README +++ b/libs/enum2.7/README diff --git a/libs/enum/__init__.py b/libs/enum2.7/__init__.py index d6ffb3a40..d6ffb3a40 100644 --- a/libs/enum/__init__.py +++ b/libs/enum2.7/__init__.py diff --git a/libs/enum/doc/enum.pdf b/libs/enum2.7/doc/enum.pdf index 3fb6ec264..3fb6ec264 100644 --- a/libs/enum/doc/enum.pdf +++ b/libs/enum2.7/doc/enum.pdf diff --git a/libs/enum/doc/enum.rst b/libs/enum2.7/doc/enum.rst index 3afc23821..3afc23821 100644 --- a/libs/enum/doc/enum.rst +++ b/libs/enum2.7/doc/enum.rst diff --git a/libs/enum/test.py b/libs/enum2.7/test.py index d9edfaee4..d9edfaee4 100644 --- a/libs/enum/test.py +++ b/libs/enum2.7/test.py diff --git a/libs/ipaddress.py b/libs/ipaddress.py index 26d84ed76..b024bf73c 100644 --- a/libs/ipaddress.py +++ b/libs/ipaddress.py @@ -6,8 +6,16 @@ # s/class \(\w\+\):/class \1(object):/ # Use 
iterator versions of map and range: -from itertools import imap as map -range = xrange +try: + from itertools import imap as map +except ImportError: + imap = map + +try: + import xrange + range = xrange +except ImportError: + pass # Except that xrange only supports machine integers, not longs, so... def long_range(start, end): diff --git a/libs/simpleconfigparser/__init__.py b/libs/simpleconfigparser/__init__.py index 1a515024c..959ebda6a 100644 --- a/libs/simpleconfigparser/__init__.py +++ b/libs/simpleconfigparser/__init__.py @@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -# Bazarr patch to use custom ConfigParser2: -from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError -#try: -# from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError -#except ImportError: -# from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError +try: + from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError +except ImportError: + from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError class simpleconfigparser(configparser): diff --git a/libs/six.py b/libs/six.py index 6bf4fd381..89b2188fd 100644 --- a/libs/six.py +++ b/libs/six.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2017 Benjamin Peterson +# Copyright (c) 2010-2018 Benjamin Peterson # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ import sys import types __author__ = "Benjamin Peterson <[email protected]>" -__version__ = "1.11.0" +__version__ = "1.12.0" # Useful for very coarse version differentiation. @@ -844,10 +844,71 @@ def add_metaclass(metaclass): orig_vars.pop(slots_var) orig_vars.pop('__dict__', None) orig_vars.pop('__weakref__', None) + if hasattr(cls, '__qualname__'): + orig_vars['__qualname__'] = cls.__qualname__ return metaclass(cls.__name__, cls.__bases__, orig_vars) return wrapper +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, text_type): + return s.encode(encoding, errors) + elif isinstance(s, binary_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + if PY2 and isinstance(s, text_type): + s = s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + s = s.decode(encoding, errors) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. 
+ + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + + def python_2_unicode_compatible(klass): """ A decorator that defines __unicode__ and __str__ methods under Python 2. diff --git a/libs/subliminal/__init__.py b/libs/subliminal/__init__.py index 73b137e98..7ff8ac348 100644 --- a/libs/subliminal/__init__.py +++ b/libs/subliminal/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- __title__ = 'subliminal' -__version__ = '2.1.0.dev' +__version__ = '2.0.5' __short_version__ = '.'.join(__version__.split('.')[:2]) __author__ = 'Antoine Bertin' __license__ = 'MIT' diff --git a/libs/subliminal/cli.py b/libs/subliminal/cli.py index 2eb502fbc..cc24853c2 100644 --- a/libs/subliminal/cli.py +++ b/libs/subliminal/cli.py @@ -219,12 +219,13 @@ config_file = 'config.ini' @click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.') @click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='OpenSubtitles configuration.') [email protected]('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.') @click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir, show_default=True, expose_value=True, help='Path to the cache directory.') @click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.') @click.version_option(__version__) @click.pass_context -def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug): +def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug): """Subtitles, faster than your thoughts.""" # create cache directory try: @@ -252,6 +253,8 @@ def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug): ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]} if opensubtitles: ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]} + if subscenter: + ctx.obj['provider_configs']['subscenter'] = {'username': subscenter[0], 'password': subscenter[1]} @subliminal.command() diff --git a/libs/subliminal/core.py b/libs/subliminal/core.py index e61502525..c516c49d3 100644 --- a/libs/subliminal/core.py +++ b/libs/subliminal/core.py @@ -1,38 +1,19 @@ # -*- coding: utf-8 -*- from collections import defaultdict - -import platform -is_windows_special_path = False - -if platform.system() == "Windows": - try: - __file__.decode("ascii") - except UnicodeDecodeError: - is_windows_special_path = True - -if not is_windows_special_path: - from concurrent.futures import ThreadPoolExecutor -else: - ThreadPoolExecutor = object - - +from concurrent.futures import ThreadPoolExecutor from datetime import datetime import io import itertools import logging import operator -import os +import os.path import socket from babelfish import Language, LanguageReverseError from guessit import guessit -from six.moves.xmlrpc_client import ProtocolError -from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile -from zipfile import BadZipfile -from ssl import SSLError +from rarfile import NotRarFile, RarCannotExec, RarFile import requests -from .exceptions import 
ServiceUnavailable from .extensions import provider_manager, refiner_manager from .score import compute_score as default_compute_score from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path @@ -98,18 +79,6 @@ class ProviderPool(object): self.initialized_providers[name].terminate() except (requests.Timeout, socket.timeout): logger.error('Provider %r timed out, improperly terminated', name) - except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable - logger.error('Provider %r unavailable, improperly terminated', name) - except requests.exceptions.HTTPError as e: - if e.response.status_code in range(500, 600): - logger.error('Provider %r unavailable, improperly terminated', name) - else: - logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code) - except SSLError as e: - if e.args[0] == 'The read operation timed out': - logger.error('Provider %r unavailable, improperly terminated', name) - else: - logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0]) except: logger.exception('Provider %r terminated unexpectedly', name) @@ -149,18 +118,6 @@ class ProviderPool(object): return self[provider].list_subtitles(video, provider_languages) except (requests.Timeout, socket.timeout): logger.error('Provider %r timed out', provider) - except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable - logger.error('Provider %r unavailable', provider) - except requests.exceptions.HTTPError as e: - if e.response.status_code in range(500, 600): - logger.error('Provider %r unavailable', provider) - else: - logger.exception('Provider %r http error %r', provider, e.response.status_code) - except SSLError as e: - if e.args[0] == 'The read operation timed out': - logger.error('Provider %r unavailable', provider) - else: - logger.exception('Provider %r SSL error %r', provider, e.args[0]) except: logger.exception('Unexpected error in provider %r', provider) @@ -216,28 +173,6 @@ class ProviderPool(object): logger.error('Provider %r timed out, discarding it', subtitle.provider_name) self.discarded_providers.add(subtitle.provider_name) return False - except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable - logger.error('Provider %r unavailable, discarding it', subtitle.provider_name) - self.discarded_providers.add(subtitle.provider_name) - return False - except requests.exceptions.HTTPError as e: - if e.response.status_code in range(500, 600): - logger.error('Provider %r unavailable, discarding it', subtitle.provider_name) - else: - logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name, - e.response.status_code) - self.discarded_providers.add(subtitle.provider_name) - return False - except SSLError as e: - if e.args[0] == 'The read operation timed out': - logger.error('Provider %r unavailable, discarding it', subtitle.provider_name) - else: - logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0]) - self.discarded_providers.add(subtitle.provider_name) - return False - except (BadRarFile, BadZipfile): - logger.error('Bad archive for %r', subtitle) - return False except: logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name) self.discarded_providers.add(subtitle.provider_name) @@ -557,15 +492,9 @@ def scan_videos(path, age=None, archives=True): continue # skip old files - try: - file_age = 
datetime.utcfromtimestamp(os.path.getmtime(filepath)) - except ValueError: - logger.warning('Could not get age of file %r in %r', filename, dirpath) + if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age: + logger.debug('Skipping old file %r in %r', filename, dirpath) continue - else: - if age and datetime.utcnow() - file_age > age: - logger.debug('Skipping old file %r in %r', filename, dirpath) - continue # scan if filename.endswith(VIDEO_EXTENSIONS): # video @@ -612,8 +541,7 @@ def refine(video, episode_refiners=None, movie_refiners=None, **kwargs): try: refiner_manager[refiner].plugin(video, **kwargs) except: - logger.error('Failed to refine video %r', video.name) - logger.debug('Refiner exception:', exc_info=True) + logger.exception('Failed to refine video') def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs): diff --git a/libs/subliminal/exceptions.py b/libs/subliminal/exceptions.py index 14d4f6412..5f5c7a773 100644 --- a/libs/subliminal/exceptions.py +++ b/libs/subliminal/exceptions.py @@ -19,8 +19,8 @@ class AuthenticationError(ProviderError): pass -class ServiceUnavailable(ProviderError): - """Exception raised when status is '503 Service Unavailable'.""" +class TooManyRequests(ProviderError): + """Exception raised by providers when too many requests are made.""" pass diff --git a/libs/subliminal/extensions.py b/libs/subliminal/extensions.py index 495b68efc..1f378b7f7 100644 --- a/libs/subliminal/extensions.py +++ b/libs/subliminal/extensions.py @@ -29,9 +29,9 @@ class RegistrableExtensionManager(ExtensionManager): super(RegistrableExtensionManager, self).__init__(namespace, **kwargs) - def list_entry_points(self): + def _find_entry_points(self, namespace): # copy of default extensions - eps = list(super(RegistrableExtensionManager, self).list_entry_points()) + eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace)) # internal extensions for iep in self.internal_extensions: @@ -93,6 +93,7 @@ provider_manager = RegistrableExtensionManager('subliminal.providers', [ 'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider', 'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider', 'shooter = subliminal.providers.shooter:ShooterProvider', + 'subscenter = subliminal.providers.subscenter:SubsCenterProvider', 'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider', 'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider' ]) diff --git a/libs/subliminal/providers/__init__.py b/libs/subliminal/providers/__init__.py index 3ac78665f..9d2fd6d2c 100644 --- a/libs/subliminal/providers/__init__.py +++ b/libs/subliminal/providers/__init__.py @@ -68,9 +68,6 @@ class Provider(object): #: Required hash, if any required_hash = None - #: Subtitle class to use - subtitle_class = None - def __enter__(self): self.initialize() return self diff --git a/libs/subliminal/providers/addic7ed.py b/libs/subliminal/providers/addic7ed.py index 2926081e0..0d4a58fda 100644 --- a/libs/subliminal/providers/addic7ed.py +++ b/libs/subliminal/providers/addic7ed.py @@ -9,7 +9,7 @@ from requests import Session from . import ParserBeautifulSoup, Provider from .. 
import __short_version__ from ..cache import SHOW_EXPIRATION_TIME, region -from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded +from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests from ..score import get_equivalent_release_groups from ..subtitle import Subtitle, fix_line_ending, guess_matches from ..utils import sanitize, sanitize_release_group @@ -19,11 +19,8 @@ logger = logging.getLogger(__name__) language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter') -# Series cell matching regex -show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL) - #: Series header parsing regex -series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$') +series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$') class Addic7edSubtitle(Subtitle): @@ -32,7 +29,7 @@ class Addic7edSubtitle(Subtitle): def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version, download_link): - super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link) + super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link) self.series = series self.season = season self.episode = episode @@ -48,9 +45,8 @@ class Addic7edSubtitle(Subtitle): def get_matches(self, video): matches = set() - # series name - if video.series and sanitize(self.series) in ( - sanitize(name) for name in [video.series] + video.alternative_series): + # series + if video.series and sanitize(self.series) == sanitize(video.series): matches.add('series') # season if video.season and self.season == video.season: @@ -58,7 +54,7 @@ class Addic7edSubtitle(Subtitle): # episode if video.episode and self.episode == video.episode: matches.add('episode') - # title of the episode + # title if video.title and sanitize(self.title) == sanitize(video.title): matches.add('title') # year @@ -90,23 +86,21 @@ class Addic7edProvider(Provider): ]} video_types = (Episode,) server_url = 'http://www.addic7ed.com/' - subtitle_class = Addic7edSubtitle def __init__(self, username=None, password=None): - if any((username, password)) and not all((username, password)): + if username is not None and password is None or username is None and password is not None: raise ConfigurationError('Username and password must be specified') self.username = username self.password = password self.logged_in = False - self.session = None def initialize(self): self.session = Session() self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ # login - if self.username and self.password: + if self.username is not None and self.password is not None: logger.info('Logging in') data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10) @@ -140,16 +134,7 @@ class Addic7edProvider(Provider): logger.info('Getting show ids') r = self.session.get(self.server_url + 'shows.php', timeout=10) r.raise_for_status() - - # LXML parser seems to fail when parsing Addic7ed.com HTML markup. - # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails) - # Assuming the site's markup is bad, and stripping it down to only contain what's needed. 
- show_cells = re.findall(show_cells_re, r.content) - if show_cells: - soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser']) - else: - # If RegEx fails, fall back to original r.content and use 'html.parser' - soup = ParserBeautifulSoup(r.content, ['html.parser']) + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) # populate the show ids show_ids = {} @@ -181,6 +166,8 @@ class Addic7edProvider(Provider): logger.info('Searching show ids with %r', params) r = self.session.get(self.server_url + 'search.php', params=params, timeout=10) r.raise_for_status() + if r.status_code == 304: + raise TooManyRequests() soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) # get the suggestion @@ -231,23 +218,24 @@ class Addic7edProvider(Provider): # search as last resort if not show_id: - logger.warning('Series %s not found in show ids', series) + logger.warning('Series not found in show ids') show_id = self._search_show_id(series) return show_id - def query(self, show_id, series, season, year=None, country=None): + def query(self, series, season, year=None, country=None): + # get the show id + show_id = self.get_show_id(series, year, country) + if show_id is None: + logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country}) + return [] + # get the page of the season of the show logger.info('Getting the page of show id %d, season %d', show_id, season) r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10) r.raise_for_status() - - if not r.content: - # Provider returns a status of 304 Not Modified with an empty content - # raise_for_status won't raise exception for that status code - logger.debug('No data returned from provider') - return [] - + if r.status_code == 304: + raise TooManyRequests() soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) # loop over subtitle rows @@ -274,32 +262,16 @@ class Addic7edProvider(Provider): version = cells[4].text download_link = cells[9].a['href'][1:] - subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year, - version, download_link) + subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year, + version, download_link) logger.debug('Found subtitle %r', subtitle) subtitles.append(subtitle) return subtitles def list_subtitles(self, video, languages): - # lookup show_id - titles = [video.series] + video.alternative_series - show_id = None - for title in titles: - show_id = self.get_show_id(title, video.year) - if show_id is not None: - break - - # query for subtitles with the show_id - if show_id is not None: - subtitles = [s for s in self.query(show_id, title, video.season, video.year) - if s.language in languages and s.episode == video.episode] - if subtitles: - return subtitles - else: - logger.error('No show id found for %r (%r)', video.series, {'year': video.year}) - - return [] + return [s for s in self.query(video.series, video.season, video.year) + if s.language in languages and s.episode == video.episode] def download_subtitle(self, subtitle): # download the subtitle @@ -308,12 +280,6 @@ class Addic7edProvider(Provider): timeout=10) r.raise_for_status() - if not r.content: - # Provider returns a status of 304 Not Modified with an empty content - # raise_for_status won't raise exception for that status code - logger.debug('Unable to download subtitle. 
No data returned from provider') - return - # detect download limit exceeded if r.headers['Content-Type'] == 'text/html': raise DownloadLimitExceeded diff --git a/libs/subliminal/providers/legendastv.py b/libs/subliminal/providers/legendastv.py index 925bb9ead..cdd16aca2 100644 --- a/libs/subliminal/providers/legendastv.py +++ b/libs/subliminal/providers/legendastv.py @@ -18,7 +18,7 @@ from zipfile import ZipFile, is_zipfile from . import ParserBeautifulSoup, Provider from .. import __short_version__ from ..cache import SHOW_EXPIRATION_TIME, region -from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable +from ..exceptions import AuthenticationError, ConfigurationError, ProviderError from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize from ..video import Episode, Movie @@ -44,11 +44,8 @@ rating_re = re.compile(r'nota (?P<rating>\d+)') #: Timestamp parsing regex timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)') -#: Title with year/country regex -title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$') - #: Cache key for releases -releases_key = __name__ + ':releases|{archive_id}|{archive_name}' +releases_key = __name__ + ':releases|{archive_id}' class LegendasTVArchive(object): @@ -63,8 +60,8 @@ class LegendasTVArchive(object): :param int rating: rating (0-10). :param timestamp: timestamp. :type timestamp: datetime.datetime - """ + """ def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None): #: Identifier self.id = id @@ -99,11 +96,10 @@ class LegendasTVArchive(object): class LegendasTVSubtitle(Subtitle): """LegendasTV Subtitle.""" - provider_name = 'legendastv' def __init__(self, language, type, title, year, imdb_id, season, archive, name): - super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link) + super(LegendasTVSubtitle, self).__init__(language, archive.link) self.type = type self.title = title self.year = year @@ -122,12 +118,11 @@ class LegendasTVSubtitle(Subtitle): # episode if isinstance(video, Episode) and self.type == 'episode': # series - if video.series and (sanitize(self.title) in ( - sanitize(name) for name in [video.series] + video.alternative_series)): + if video.series and sanitize(self.title) == sanitize(video.series): matches.add('series') - # year - if video.original_series and self.year is None or video.year and video.year == self.year: + # year (year is based on season air date hence the adjustment) + if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1: matches.add('year') # imdb_id @@ -137,8 +132,7 @@ class LegendasTVSubtitle(Subtitle): # movie elif isinstance(video, Movie) and self.type == 'movie': # title - if video.title and (sanitize(self.title) in ( - sanitize(name) for name in [video.title] + video.alternative_titles)): + if video.title and sanitize(self.title) == sanitize(video.title): matches.add('title') # year @@ -149,6 +143,9 @@ class LegendasTVSubtitle(Subtitle): if video.imdb_id and self.imdb_id == video.imdb_id: matches.add('imdb_id') + # archive name + matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type})) + # name matches |= guess_matches(video, guessit(self.name, {'type': self.type})) @@ -160,38 +157,29 @@ class LegendasTVProvider(Provider): :param str username: username. :param str password: password. 
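As a hedged aside (the video path, language and credentials are placeholders): in subliminal, provider credentials like these are normally supplied through the provider_configs mapping rather than by instantiating the provider directly:

    from babelfish import Language
    from subliminal import list_subtitles, scan_video

    # placeholder path; scan_video expects an existing file
    video = scan_video('/data/Movie.2016.1080p.mkv')
    subtitles = list_subtitles(
        [video], {Language('por', 'BR')},
        provider_configs={'legendastv': {'username': 'user',
                                         'password': 'pass'}},
    )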
- """ + """ languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes} server_url = 'http://legendas.tv/' - subtitle_class = LegendasTVSubtitle def __init__(self, username=None, password=None): - - # Provider needs UNRAR installed. If not available raise ConfigurationError - try: - rarfile.custom_check(rarfile.UNRAR_TOOL) - except rarfile.RarExecError: - raise ConfigurationError('UNRAR tool not available') - - if any((username, password)) and not all((username, password)): + if username and not password or not username and password: raise ConfigurationError('Username and password must be specified') self.username = username self.password = password self.logged_in = False - self.session = None def initialize(self): self.session = Session() self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ # login - if self.username and self.password: + if self.username is not None and self.password is not None: logger.info('Logging in') data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password} r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10) - raise_for_status(r) + r.raise_for_status() soup = ParserBeautifulSoup(r.content, ['html.parser']) if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')): @@ -205,174 +193,94 @@ class LegendasTVProvider(Provider): if self.logged_in: logger.info('Logging out') r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10) - raise_for_status(r) + r.raise_for_status() logger.debug('Logged out') self.logged_in = False self.session.close() - @staticmethod - def is_valid_title(title, title_id, sanitized_title, season, year): - """Check if is a valid title.""" - sanitized_result = sanitize(title['title']) - if sanitized_result != sanitized_title: - logger.debug("Mismatched title, discarding title %d (%s)", - title_id, sanitized_result) - return - - # episode type - if season: - # discard mismatches on type - if title['type'] != 'episode': - logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result) - return - - # discard mismatches on season - if 'season' not in title or title['season'] != season: - logger.debug('Mismatched season %s, discarding title %d (%s)', - title.get('season'), title_id, sanitized_result) - return - # movie type - else: - # discard mismatches on type - if title['type'] != 'movie': - logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result) - return - - # discard mismatches on year - if year is not None and 'year' in title and title['year'] != year: - logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result) - return - return True - - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value) - def search_titles(self, title, season, title_year): + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) + def search_titles(self, title): """Search for titles matching the `title`. - For episodes, each season has it own title :param str title: the title to search for. - :param int season: season of the title - :param int title_year: year of the title :return: found titles. 
:rtype: dict - """ - titles = {} - sanitized_titles = [sanitize(title)] - ignore_characters = {'\'', '.'} - if any(c in title for c in ignore_characters): - sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters)) - - for sanitized_title in sanitized_titles: - # make the query - if season: - logger.info('Searching episode title %r for season %r', sanitized_title, season) - else: - logger.info('Searching movie title %r', sanitized_title) - - r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10) - raise_for_status(r) - results = json.loads(r.text) - - # loop over results - for result in results: - source = result['_source'] - - # extract id - title_id = int(source['id_filme']) - - # extract type - title = {'type': type_map[source['tipo']]} - # extract title, year and country - name, year, country = title_re.match(source['dsc_nome']).groups() - title['title'] = name + """ + # make the query + logger.info('Searching title %r', title) + r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10) + r.raise_for_status() + results = json.loads(r.text) - # extract imdb_id - if source['id_imdb'] != '0': - if not source['id_imdb'].startswith('tt'): - title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7) + # loop over results + titles = {} + for result in results: + source = result['_source'] + + # extract id + title_id = int(source['id_filme']) + + # extract type and title + title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']} + + # extract year + if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit(): + title['year'] = int(source['dsc_data_lancamento']) + + # extract imdb_id + if source['id_imdb'] != '0': + if not source['id_imdb'].startswith('tt'): + title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7) + else: + title['imdb_id'] = source['id_imdb'] + + # extract season + if title['type'] == 'episode': + if source['temporada'] and source['temporada'].isdigit(): + title['season'] = int(source['temporada']) + else: + match = season_re.search(source['dsc_nome_br']) + if match: + title['season'] = int(match.group('season')) else: - title['imdb_id'] = source['id_imdb'] + logger.warning('No season detected for title %d', title_id) - # extract season - if title['type'] == 'episode': - if source['temporada'] and source['temporada'].isdigit(): - title['season'] = int(source['temporada']) - else: - match = season_re.search(source['dsc_nome_br']) - if match: - title['season'] = int(match.group('season')) - else: - logger.debug('No season detected for title %d (%s)', title_id, name) - - # extract year - if year: - title['year'] = int(year) - elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit(): - # year is based on season air date hence the adjustment - title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1 - - # add title only if is valid - # Check against title without ignored chars - if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year): - titles[title_id] = title - - logger.debug('Found %d titles', len(titles)) + # add title + titles[title_id] = title + + logger.debug('Found %d titles', len(titles)) return titles @region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds()) - def get_archives(self, title_id, language_code, title_type, season, episode): - """Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`. 
+ def get_archives(self, title_id, language_code): + """Get the archive list from a given `title_id` and `language_code`. :param int title_id: title id. :param int language_code: language code. - :param str title_type: episode or movie - :param int season: season - :param int episode: episode :return: the archives. :rtype: list of :class:`LegendasTVArchive` """ + logger.info('Getting archives for title %d and language %d', title_id, language_code) archives = [] - page = 0 + page = 1 while True: # get the archive page - url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format( - language=language_code, page=page, title=title_id) + url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format( + title=title_id, language=language_code, page=page) r = self.session.get(url) - raise_for_status(r) + r.raise_for_status() # parse the results soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) - for archive_soup in soup.select('div.list_element > article > div > div.f_left'): + for archive_soup in soup.select('div.list_element > article > div'): # create archive - archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], - archive_soup.a.text, - 'pack' in archive_soup.parent['class'], - 'destaque' in archive_soup.parent['class'], + archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text, + 'pack' in archive_soup['class'], 'destaque' in archive_soup['class'], self.server_url + archive_soup.a['href'][1:]) - # clean name of path separators and pack flags - clean_name = archive.name.replace('/', '-') - if archive.pack and clean_name.startswith('(p)'): - clean_name = clean_name[3:] - - # guess from name - guess = guessit(clean_name, {'type': title_type}) - - # episode - if season and episode: - # discard mismatches on episode in non-pack archives - - # Guessit may return int for single episode or list for multi-episode - # Check if archive name has multiple episodes releases on it - if not archive.pack and 'episode' in guess: - wanted_episode = set(episode) if isinstance(episode, list) else {episode} - archive_episode = guess['episode'] if isinstance(guess['episode'], list) else {guess['episode']} - - if not wanted_episode.intersection(archive_episode): - logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name) - continue # extract text containing downloads, rating and timestamp data_text = archive_soup.find('p', class_='data').text @@ -392,8 +300,6 @@ class LegendasTVProvider(Provider): raise ProviderError('Archive timestamp is in the future') # add archive - logger.info('Found archive for title %d and language %d at page %s: %s', - title_id, language_code, page, archive) archives.append(archive) # stop on last page @@ -416,7 +322,7 @@ class LegendasTVProvider(Provider): """ logger.info('Downloading archive %s', archive.id) r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id)) - raise_for_status(r) + r.raise_for_status() # open the archive archive_stream = io.BytesIO(r.content) @@ -431,26 +337,60 @@ class LegendasTVProvider(Provider): def query(self, language, title, season=None, episode=None, year=None): # search for titles - titles = self.search_titles(title, season, year) + titles = self.search_titles(sanitize(title)) + + # search for titles with the quote or dot character + ignore_characters = {'\'', '.'} + if any(c in title for c in ignore_characters): + titles.update(self.search_titles(sanitize(title, 
ignore_characters=ignore_characters))) subtitles = [] # iterate over titles for title_id, t in titles.items(): + # discard mismatches on title + if sanitize(t['title']) != sanitize(title): + continue + + # episode + if season and episode: + # discard mismatches on type + if t['type'] != 'episode': + continue + + # discard mismatches on season + if 'season' not in t or t['season'] != season: + continue + # movie + else: + # discard mismatches on type + if t['type'] != 'movie': + continue - logger.info('Getting archives for title %d and language %d', title_id, language.legendastv) - archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode) - if not archives: - logger.info('No archives found for title %d and language %d', title_id, language.legendastv) + # discard mismatches on year + if year is not None and 'year' in t and t['year'] != year: + continue # iterate over title's archives - for a in archives: + for a in self.get_archives(title_id, language.legendastv): + # clean name of path separators and pack flags + clean_name = a.name.replace('/', '-') + if a.pack and clean_name.startswith('(p)'): + clean_name = clean_name[3:] + + # guess from name + guess = guessit(clean_name, {'type': t['type']}) + + # episode + if season and episode: + # discard mismatches on episode in non-pack archives + if not a.pack and 'episode' in guess and guess['episode'] != episode: + continue # compute an expiration time based on the archive timestamp expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds() # attempt to get the releases from the cache - cache_key = releases_key.format(archive_id=a.id, archive_name=a.name) - releases = region.get(cache_key, expiration_time=expiration_time) + releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time) # the releases are not in cache or cache is expired if releases == NO_VALUE: @@ -477,12 +417,12 @@ class LegendasTVProvider(Provider): releases.append(name) # cache the releases - region.set(cache_key, releases) + region.set(releases_key.format(archive_id=a.id), releases) # iterate over releases for r in releases: - subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'), - t.get('season'), a, r) + subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'), + t.get('season'), a, r) logger.debug('Found subtitle %r', subtitle) subtitles.append(subtitle) @@ -491,19 +431,13 @@ class LegendasTVProvider(Provider): def list_subtitles(self, video, languages): season = episode = None if isinstance(video, Episode): - titles = [video.series] + video.alternative_series + title = video.series season = video.season episode = video.episode else: - titles = [video.title] + video.alternative_titles - - for title in titles: - subtitles = [s for l in languages for s in - self.query(l, title, season=season, episode=episode, year=video.year)] - if subtitles: - return subtitles + title = video.title - return [] + return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)] def download_subtitle(self, subtitle): # download archive in case we previously hit the releases cache and didn't download it @@ -512,11 +446,3 @@ class LegendasTVProvider(Provider): # extract subtitle's content subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name)) - - -def raise_for_status(r): - # When site is under maintaince and http status code 200. 
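# [annotation, not part of the commit] the helper being deleted here existed
# because legendas.tv serves its maintenance page with HTTP 200, which the
# plain requests check cannot detect. A minimal illustration, assuming only
# requests; the URL mirrors the provider's server_url:
import requests

r = requests.get('http://legendas.tv/', timeout=10)
r.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx only,
                      # so a 200 maintenance page needs the body-text check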
- if 'Em breve estaremos de volta' in r.text: - raise ServiceUnavailable - else: - r.raise_for_status() diff --git a/libs/subliminal/providers/napiprojekt.py b/libs/subliminal/providers/napiprojekt.py index 0a1a3aa37..f44f85d95 100644 --- a/libs/subliminal/providers/napiprojekt.py +++ b/libs/subliminal/providers/napiprojekt.py @@ -42,7 +42,6 @@ class NapiProjektSubtitle(Subtitle): def __init__(self, language, hash): super(NapiProjektSubtitle, self).__init__(language) self.hash = hash - self.content = None @property def id(self): @@ -63,10 +62,6 @@ class NapiProjektProvider(Provider): languages = {Language.fromalpha2(l) for l in ['pl']} required_hash = 'napiprojekt' server_url = 'http://napiprojekt.pl/unit_napisy/dl.php' - subtitle_class = NapiProjektSubtitle - - def __init__(self): - self.session = None def initialize(self): self.session = Session() @@ -86,16 +81,16 @@ class NapiProjektProvider(Provider): 'f': hash, 't': get_subhash(hash)} logger.info('Searching subtitle %r', params) - r = self.session.get(self.server_url, params=params, timeout=10) - r.raise_for_status() + response = self.session.get(self.server_url, params=params, timeout=10) + response.raise_for_status() # handle subtitles not found and errors - if r.content[:4] == b'NPc0': + if response.content[:4] == b'NPc0': logger.debug('No subtitles found') return None - subtitle = self.subtitle_class(language, hash) - subtitle.content = r.content + subtitle = NapiProjektSubtitle(language, hash) + subtitle.content = response.content logger.debug('Found subtitle %r', subtitle) return subtitle diff --git a/libs/subliminal/providers/opensubtitles.py b/libs/subliminal/providers/opensubtitles.py index f7a87fcad..5ab09da48 100644 --- a/libs/subliminal/providers/opensubtitles.py +++ b/libs/subliminal/providers/opensubtitles.py @@ -11,8 +11,7 @@ from six.moves.xmlrpc_client import ServerProxy from . import Provider, TimeoutSafeTransport from .. 
import __short_version__ -from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError, - ServiceUnavailable) +from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError from ..subtitle import Subtitle, fix_line_ending, guess_matches from ..utils import sanitize from ..video import Episode, Movie @@ -27,8 +26,7 @@ class OpenSubtitlesSubtitle(Subtitle): def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name, movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding): - super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, - page_link=page_link, encoding=encoding) + super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding) self.subtitle_id = subtitle_id self.matched_by = matched_by self.movie_kind = movie_kind @@ -60,8 +58,7 @@ class OpenSubtitlesSubtitle(Subtitle): if isinstance(video, Episode) and self.movie_kind == 'episode': # tag match, assume series, year, season and episode matches if self.matched_by == 'tag': - if not video.imdb_id or self.movie_imdb_id == video.imdb_id: - matches |= {'series', 'year', 'season', 'episode'} + matches |= {'series', 'year', 'season', 'episode'} # series if video.series and sanitize(self.series_name) == sanitize(video.series): matches.add('series') @@ -90,8 +87,7 @@ class OpenSubtitlesSubtitle(Subtitle): elif isinstance(video, Movie) and self.movie_kind == 'movie': # tag match, assume title and year matches if self.matched_by == 'tag': - if not video.imdb_id or self.movie_imdb_id == video.imdb_id: - matches |= {'title', 'year'} + matches |= {'title', 'year'} # title if video.title and sanitize(self.movie_name) == sanitize(video.title): matches.add('title') @@ -126,11 +122,10 @@ class OpenSubtitlesProvider(Provider): """ languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes} - subtitle_class = OpenSubtitlesSubtitle def __init__(self, username=None, password=None): self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10)) - if any((username, password)) and not all((username, password)): + if username and not password or not username and password: raise ConfigurationError('Username and password must be specified') # None values not allowed for logging in, so replace it by '' self.username = username or '' @@ -161,10 +156,7 @@ class OpenSubtitlesProvider(Provider): if hash and size: criteria.append({'moviehash': hash, 'moviebytesize': str(size)}) if imdb_id: - if season and episode: - criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode}) - else: - criteria.append({'imdbid': imdb_id[2:]}) + criteria.append({'imdbid': imdb_id[2:]}) if tag: criteria.append({'tag': tag}) if query and season and episode: @@ -207,9 +199,9 @@ class OpenSubtitlesProvider(Provider): filename = subtitle_item['SubFileName'] encoding = subtitle_item.get('SubEncoding') or None - subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, - hash, movie_name, movie_release_name, movie_year, movie_imdb_id, - series_season, series_episode, filename, encoding) + subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, + hash, movie_name, movie_release_name, movie_year, movie_imdb_id, + series_season, series_episode, filename, encoding) 
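# [annotation, not part of the commit] the provider above drives the
# OpenSubtitles XML-RPC endpoint through ServerProxy; TimeoutSafeTransport
# merely adds a socket timeout on top. A minimal sketch of the handshake,
# assuming the classic OpenSubtitles XML-RPC API (a registered user agent
# string is normally expected; this one is illustrative):
from six.moves.xmlrpc_client import ServerProxy  # xmlrpc.client on Python 3

server = ServerProxy('https://api.opensubtitles.org/xml-rpc')
resp = server.LogIn('', '', 'eng', 'subliminal v1')  # anonymous login
token = resp['token']  # session token reused by SearchSubtitles/LogOut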
logger.debug('Found subtitle %r by %s', subtitle, matched_by) subtitles.append(subtitle) @@ -268,6 +260,11 @@ class DisabledUserAgent(OpenSubtitlesError, AuthenticationError): pass +class ServiceUnavailable(OpenSubtitlesError): + """Exception raised when status is '503 Service Unavailable'.""" + pass + + def checked(response): """Check a response status before returning it. diff --git a/libs/subliminal/providers/podnapisi.py b/libs/subliminal/providers/podnapisi.py index 5dcf25583..f643682b9 100644 --- a/libs/subliminal/providers/podnapisi.py +++ b/libs/subliminal/providers/podnapisi.py @@ -31,7 +31,7 @@ class PodnapisiSubtitle(Subtitle): def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None, year=None): - super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link) + super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link) self.pid = pid self.releases = releases self.title = title @@ -49,8 +49,7 @@ class PodnapisiSubtitle(Subtitle): # episode if isinstance(video, Episode): # series - if video.series and (sanitize(self.title) in ( - sanitize(name) for name in [video.series] + video.alternative_series)): + if video.series and sanitize(self.title) == sanitize(video.series): matches.add('series') # year if video.original_series and self.year is None or video.year and video.year == self.year: @@ -67,8 +66,7 @@ class PodnapisiSubtitle(Subtitle): # movie elif isinstance(video, Movie): # title - if video.title and (sanitize(self.title) in ( - sanitize(name) for name in [video.title] + video.alternative_titles)): + if video.title and sanitize(self.title) == sanitize(video.title): matches.add('title') # year if video.year and self.year == video.year: @@ -84,11 +82,7 @@ class PodnapisiProvider(Provider): """Podnapisi Provider.""" languages = ({Language('por', 'BR'), Language('srp', script='Latn')} | {Language.fromalpha2(l) for l in language_converters['alpha2'].codes}) - server_url = 'https://www.podnapisi.net/subtitles/' - subtitle_class = PodnapisiSubtitle - - def __init__(self): - self.session = None + server_url = 'http://podnapisi.net/subtitles/' def initialize(self): self.session = Session() @@ -114,9 +108,7 @@ class PodnapisiProvider(Provider): pids = set() while True: # query the server - r = self.session.get(self.server_url + 'search/old', params=params, timeout=10) - r.raise_for_status() - xml = etree.fromstring(r.content) + xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content) # exit if no results if not int(xml.find('pagination/results').text): @@ -126,14 +118,10 @@ class PodnapisiProvider(Provider): # loop over subtitles for subtitle_xml in xml.findall('subtitle'): # read xml elements - pid = subtitle_xml.find('pid').text - # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321 - if pid in pids: - continue - language = Language.fromietf(subtitle_xml.find('language').text) hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '') page_link = subtitle_xml.find('url').text + pid = subtitle_xml.find('pid').text releases = [] if subtitle_xml.find('release').text: for release in subtitle_xml.find('release').text.split(): @@ -146,11 +134,15 @@ class PodnapisiProvider(Provider): year = int(subtitle_xml.find('year').text) if is_episode: - subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, - season=season, episode=episode, year=year) 
+ subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title, + season=season, episode=episode, year=year) else: - subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, - year=year) + subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title, + year=year) + + # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321 + if pid in pids: + continue logger.debug('Found subtitle %r', subtitle) subtitles.append(subtitle) @@ -167,21 +159,11 @@ class PodnapisiProvider(Provider): return subtitles def list_subtitles(self, video, languages): - season = episode = None if isinstance(video, Episode): - titles = [video.series] + video.alternative_series - season = video.season - episode = video.episode - else: - titles = [video.title] + video.alternative_titles - - for title in titles: - subtitles = [s for l in languages for s in - self.query(l, title, season=season, episode=episode, year=video.year)] - if subtitles: - return subtitles - - return [] + return [s for l in languages for s in self.query(l, video.series, season=video.season, + episode=video.episode, year=video.year)] + elif isinstance(video, Movie): + return [s for l in languages for s in self.query(l, video.title, year=video.year)] def download_subtitle(self, subtitle): # download as a zip diff --git a/libs/subliminal/providers/shooter.py b/libs/subliminal/providers/shooter.py index 62f11bd93..fc79faf7a 100644 --- a/libs/subliminal/providers/shooter.py +++ b/libs/subliminal/providers/shooter.py @@ -42,10 +42,6 @@ class ShooterProvider(Provider): """Shooter Provider.""" languages = {Language(l) for l in ['eng', 'zho']} server_url = 'https://www.shooter.cn/api/subapi.php' - subtitle_class = ShooterSubtitle - - def __init__(self): - self.session = None def initialize(self): self.session = Session() @@ -68,7 +64,7 @@ class ShooterProvider(Provider): # parse the subtitles results = json.loads(r.text) - subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']] + subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']] return subtitles diff --git a/libs/subliminal/providers/subscenter.py b/libs/subliminal/providers/subscenter.py index f9bf3c8cb..1e25e5e1d 100644 --- a/libs/subliminal/providers/subscenter.py +++ b/libs/subliminal/providers/subscenter.py @@ -26,7 +26,7 @@ class SubsCenterSubtitle(Subtitle): provider_name = 'subscenter' def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key, - subtitle_version, downloaded, releases): + downloaded, releases): super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link) self.series = series self.season = season @@ -34,7 +34,6 @@ class SubsCenterSubtitle(Subtitle): self.title = title self.subtitle_id = subtitle_id self.subtitle_key = subtitle_key - self.subtitle_version = subtitle_version self.downloaded = downloaded self.releases = releases @@ -75,8 +74,7 @@ class SubsCenterSubtitle(Subtitle): class SubsCenterProvider(Provider): """SubsCenter Provider.""" languages = {Language.fromalpha2(l) for l in ['he']} - server_url = 'http://www.subscenter.org/he/' - subtitle_class = SubsCenterSubtitle + server_url = 'http://www.subscenter.co/he/' def __init__(self, username=None, password=None): if username is not None and password is None or username is None and password is not None: @@ -191,7 +189,6 @@ class 
SubsCenterProvider(Provider): hearing_impaired = bool(subtitle_item['hearing_impaired']) subtitle_id = subtitle_item['id'] subtitle_key = subtitle_item['key'] - subtitle_version = subtitle_item['h_version'] downloaded = subtitle_item['downloaded'] release = subtitle_item['subtitle_version'] @@ -203,9 +200,8 @@ class SubsCenterProvider(Provider): continue # otherwise create it - subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode, - title, subtitle_id, subtitle_key, subtitle_version, downloaded, - [release]) + subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode, + title, subtitle_id, subtitle_key, downloaded, [release]) logger.debug('Found subtitle %r', subtitle) subtitles[subtitle_id] = subtitle @@ -225,19 +221,15 @@ class SubsCenterProvider(Provider): def download_subtitle(self, subtitle): # download url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id) - params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key} + params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key} r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10) r.raise_for_status() # open the zip - try: - with zipfile.ZipFile(io.BytesIO(r.content)) as zf: - # remove some filenames from the namelist - namelist = [n for n in zf.namelist() if not n.endswith('.txt')] - if len(namelist) > 1: - raise ProviderError('More than one file to unzip') - - subtitle.content = fix_line_ending(zf.read(namelist[0])) - except zipfile.BadZipfile: - # if no zip file was retrieved, daily downloads limit has exceeded - raise ProviderError('Daily limit exceeded') + with zipfile.ZipFile(io.BytesIO(r.content)) as zf: + # remove some filenames from the namelist + namelist = [n for n in zf.namelist() if not n.endswith('.txt')] + if len(namelist) > 1: + raise ProviderError('More than one file to unzip') + + subtitle.content = fix_line_ending(zf.read(namelist[0])) diff --git a/libs/subliminal/providers/thesubdb.py b/libs/subliminal/providers/thesubdb.py index 87c1e7b79..6bf4a0ebd 100644 --- a/libs/subliminal/providers/thesubdb.py +++ b/libs/subliminal/providers/thesubdb.py @@ -40,10 +40,6 @@ class TheSubDBProvider(Provider): languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes} required_hash = 'thesubdb' server_url = 'http://api.thesubdb.com/' - subtitle_class = TheSubDBSubtitle - - def __init__(self): - self.session = None def initialize(self): self.session = Session() @@ -70,7 +66,7 @@ class TheSubDBProvider(Provider): for language_code in r.text.split(','): language = Language.fromthesubdb(language_code) - subtitle = self.subtitle_class(language, hash) + subtitle = TheSubDBSubtitle(language, hash) logger.debug('Found subtitle %r', subtitle) subtitles.append(subtitle) diff --git a/libs/subliminal/providers/tvsubtitles.py b/libs/subliminal/providers/tvsubtitles.py index 56b692c6e..ec033ee72 100644 --- a/libs/subliminal/providers/tvsubtitles.py +++ b/libs/subliminal/providers/tvsubtitles.py @@ -47,8 +47,7 @@ class TVsubtitlesSubtitle(Subtitle): matches = set() # series - if video.series and (sanitize(self.series) in ( - sanitize(name) for name in [video.series] + video.alternative_series)): + if video.series and sanitize(self.series) == sanitize(video.series): matches.add('series') # season if video.season and self.season == video.season: @@ -81,10 +80,6 @@ class TVsubtitlesProvider(Provider): ]} video_types = (Episode,) server_url 
= 'http://www.tvsubtitles.net/' - subtitle_class = TVsubtitlesSubtitle - - def __init__(self): - self.session = None def initialize(self): self.session = Session() @@ -163,7 +158,13 @@ class TVsubtitlesProvider(Provider): return episode_ids - def query(self, show_id, series, season, episode, year=None): + def query(self, series, season, episode, year=None): + # search the show id + show_id = self.search_show_id(series, year) + if show_id is None: + logger.error('No show id found for %r (%r)', series, {'year': year}) + return [] + # get the episode ids episode_ids = self.get_episode_ids(show_id, season) if episode not in episode_ids: @@ -183,9 +184,9 @@ class TVsubtitlesProvider(Provider): subtitle_id = int(row.parent['href'][10:-5]) page_link = self.server_url + 'subtitle-%d.html' % subtitle_id rip = row.find('p', title='rip').text.strip() or None - release = row.find('h5').text.strip() or None + release = row.find('p', title='release').text.strip() or None - subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip, + subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip, release) logger.debug('Found subtitle %s', subtitle) subtitles.append(subtitle) @@ -193,24 +194,7 @@ class TVsubtitlesProvider(Provider): return subtitles def list_subtitles(self, video, languages): - # lookup show_id - titles = [video.series] + video.alternative_series - show_id = None - for title in titles: - show_id = self.search_show_id(title, video.year) - if show_id is not None: - break - - # query for subtitles with the show_id - if show_id is not None: - subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year) - if s.language in languages and s.episode == video.episode] - if subtitles: - return subtitles - else: - logger.error('No show id found for %r (%r)', video.series, {'year': video.year}) - - return [] + return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages] def download_subtitle(self, subtitle): # download as a zip diff --git a/libs/subliminal/refiners/tvdb.py b/libs/subliminal/refiners/tvdb.py index 575840455..1828e5cf0 100644 --- a/libs/subliminal/refiners/tvdb.py +++ b/libs/subliminal/refiners/tvdb.py @@ -3,7 +3,7 @@ from datetime import datetime, timedelta from functools import wraps import logging import re -import _strptime + import requests from .. 
import __short_version__ @@ -331,7 +331,6 @@ def refine(video, **kwargs): # add series information logger.debug('Found series %r', series) video.series = matching_result['match']['series'] - video.alternative_series.extend(series['aliases']) video.year = matching_result['match']['year'] video.original_series = matching_result['match']['original_series'] video.series_tvdb_id = series['id'] diff --git a/libs/subliminal/score.py b/libs/subliminal/score.py index 61dd22c25..31ccb3433 100644 --- a/libs/subliminal/score.py +++ b/libs/subliminal/score.py @@ -44,7 +44,7 @@ movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15, 'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1} #: Equivalent release groups -equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'}) +equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}) def get_equivalent_release_groups(release_group): diff --git a/libs/subliminal/subtitle.py b/libs/subliminal/subtitle.py index 5c2c789b2..60cdf3d6e 100644 --- a/libs/subliminal/subtitle.py +++ b/libs/subliminal/subtitle.py @@ -208,14 +208,8 @@ def guess_matches(video, guess, partial=False): if video.season and 'season' in guess and guess['season'] == video.season: matches.add('season') # episode - # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values) - # Most providers only support single-ep, so make sure it contains only 1 episode - # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number) - if video.episode and 'episode' in guess: - episode_guess = guess['episode'] - episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess - if episode == video.episode: - matches.add('episode') + if video.episode and 'episode' in guess and guess['episode'] == video.episode: + matches.add('episode') # year if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') @@ -258,4 +252,4 @@ def fix_line_ending(content): :rtype: bytes """ - return content.replace(b'\r\n', b'\n') + return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n') diff --git a/libs/subliminal/video.py b/libs/subliminal/video.py index 0db6c65c4..00304e911 100644 --- a/libs/subliminal/video.py +++ b/libs/subliminal/video.py @@ -13,9 +13,9 @@ VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '. '.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli', '.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e', '.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4', - '.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf', + '.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm' '.ogv', '.omf', '.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo', - '.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid') + '.vob', '.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid') class Video(object): @@ -123,12 +123,11 @@ class Episode(Video): :param int year: year of the series. :param bool original_series: whether the series is the first with this name. :param int tvdb_id: TVDB id of the episode. 
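# [annotation, not part of the commit] effect of the fix_line_ending change
# above: besides CRLF, lone carriage returns (old Mac-style subtitle files)
# are now normalized as well. Self-contained check:
def fix_line_ending(content):
    return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

assert fix_line_ending(b'line1\r\nline2\rline3\n') == b'line1\nline2\nline3\n'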
- :param list alternative_series: alternative names of the series :param \*\*kwargs: additional parameters for the :class:`Video` constructor. """ def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None, - series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs): + series_tvdb_id=None, series_imdb_id=None, **kwargs): super(Episode, self).__init__(name, **kwargs) #: Series of the episode @@ -158,9 +157,6 @@ class Episode(Video): #: IMDb id of the series self.series_imdb_id = series_imdb_id - #: Alternative names of the series - self.alternative_series = alternative_series or [] - @classmethod def fromguess(cls, name, guess): if guess['type'] != 'episode': @@ -169,13 +165,7 @@ class Episode(Video): if 'title' not in guess or 'episode' not in guess: raise ValueError('Insufficient data to process the guess') - # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values) - # Most providers only support single-ep, so make sure it contains only 1 episode - # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number) - episode_guess = guess.get('episode') - episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess - - return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'), + return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'), year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess, release_group=guess.get('release_group'), resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec')) @@ -196,11 +186,10 @@ class Movie(Video): :param str title: title of the movie. :param int year: year of the movie. - :param list alternative_titles: alternative titles of the movie :param \*\*kwargs: additional parameters for the :class:`Video` constructor. 
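# [annotation, not part of the commit] the Episode.fromguess hunk above
# removes the multi-episode normalization: guessit returns a list for
# multi-episode names and the deleted code kept the lowest number. Sketch:
episode_guess = [1, 2]  # e.g. what guessit yields for 'Show.S01E01E02.mkv'
episode = min(episode_guess) if isinstance(episode_guess, list) else episode_guess
assert episode == 1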
""" - def __init__(self, name, title, year=None, alternative_titles=None, **kwargs): + def __init__(self, name, title, year=None, **kwargs): super(Movie, self).__init__(name, **kwargs) #: Title of the movie @@ -209,9 +198,6 @@ class Movie(Video): #: Year of the movie self.year = year - #: Alternative titles of the movie - self.alternative_titles = alternative_titles or [] - @classmethod def fromguess(cls, name, guess): if guess['type'] != 'movie': @@ -220,13 +206,9 @@ class Movie(Video): if 'title' not in guess: raise ValueError('Insufficient data to process the guess') - alternative_titles = [] - if 'alternative_title' in guess: - alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title'])) - return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'), resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'), - audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles) + audio_codec=guess.get('audio_codec'), year=guess.get('year')) @classmethod def fromname(cls, name): diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 363477e1f..d3f2b1504 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -10,7 +10,7 @@ import time import operator import itertools -from httplib import ResponseNotReady +from http.client import ResponseNotReady import rarfile import requests @@ -21,14 +21,13 @@ from babelfish import LanguageReverseError from guessit.jsonutils import GuessitEncoder from subliminal import ProviderError, refiner_manager -from extensions import provider_registry -from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded +from subliminal_patch.extensions import provider_registry from subliminal.score import compute_score as default_compute_score from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \ ThreadPoolExecutor, check_video -from subliminal_patch.exceptions import TooManyRequests, APIThrottled +from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ServiceUnavailable, DownloadLimitExceeded from subzero.language import Language from scandir import scandir, scandir_generic as _scandir_generic @@ -186,7 +185,7 @@ class SZProviderPool(ProviderPool): except (requests.Timeout, socket.timeout): logger.error('Provider %r timed out', provider) - except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e: + except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e: self.throttle_callback(provider, e) return @@ -283,7 +282,7 @@ class SZProviderPool(ProviderPool): logger.debug("RAR Traceback: %s", traceback.format_exc()) return False - except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e: + except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e: self.throttle_callback(subtitle.provider_name, e) self.discarded_providers.add(subtitle.provider_name) return False @@ -648,7 +647,7 @@ def search_external_subtitles(path, languages=None, only_one=False): abspath = unicode(os.path.abspath( os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder, video_filename]))) - except Exception, e: + except Exception as e: logger.error("skipping path %s because 
of %s", repr(folder_or_subfolder), e) continue logger.debug("external subs: scanning path %s", abspath) diff --git a/libs/subliminal_patch/exceptions.py b/libs/subliminal_patch/exceptions.py index e336a10af..946d3d9b6 100644 --- a/libs/subliminal_patch/exceptions.py +++ b/libs/subliminal_patch/exceptions.py @@ -9,3 +9,13 @@ class TooManyRequests(ProviderError): class APIThrottled(ProviderError): pass + + +class ServiceUnavailable(ProviderError): + """Exception raised when status is '503 Service Unavailable'.""" + pass + + +class DownloadLimitExceeded(ProviderError): + """Exception raised by providers when download limit is exceeded.""" + pass diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index db313578e..af79c61a4 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -8,7 +8,7 @@ import os import socket import logging import requests -import xmlrpclib +import xmlrpc.client import dns.resolver import ipaddress import re @@ -16,7 +16,7 @@ import re from requests import exceptions from urllib3.util import connection from retry.api import retry_call -from exceptions import APIThrottled +from .exceptions import APIThrottled from dogpile.cache.api import NO_VALUE from subliminal.cache import region from subliminal_patch.pitcher import pitchers @@ -32,10 +32,8 @@ try: except ImportError: from urllib.parse import urlparse -from subzero.lib.io import get_viable_encoding - logger = logging.getLogger(__name__) -pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where())) +pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where())) try: default_ssl_context = ssl.create_default_context(cafile=pem_file) except AttributeError: @@ -99,7 +97,7 @@ class CFSession(CloudScraper): # Solve Challenge resp = self.sendChallengeResponse(resp, **kwargs) - except ValueError, e: + except ValueError as e: if e.message == "Captcha": parsed_url = urlparse(url) domain = parsed_url.netloc @@ -231,7 +229,7 @@ class RetryingCFSession(RetryingSession, CFSession): pass -class SubZeroRequestsTransport(xmlrpclib.SafeTransport): +class SubZeroRequestsTransport(xmlrpc.client.SafeTransport): """ Drop in Transport for xmlrpclib that uses Requests instead of httplib diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py index b2cef63b3..cdf47cb17 100644 --- a/libs/subliminal_patch/pitcher.py +++ b/libs/subliminal_patch/pitcher.py @@ -8,7 +8,7 @@ from subliminal.cache import region from dogpile.cache.api import NO_VALUE from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\ Proxy -from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT +from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TIMEOUT logger = logging.getLogger(__name__) @@ -185,7 +185,7 @@ class DBCProxyLessPitcher(Pitcher): password = None def __init__(self, website_name, website_url, website_key, - timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs): + timeout=DEFAULT_TIMEOUT, tries=3, *args, **kwargs): super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries) self.username, self.password = self.client_key.split(":", 1) diff --git a/libs/subliminal_patch/refiners/tvdb.py b/libs/subliminal_patch/refiners/tvdb.py index 808c8ef90..13cd2131a 100644 --- a/libs/subliminal_patch/refiners/tvdb.py +++ 
b/libs/subliminal_patch/refiners/tvdb.py
@@ -5,7 +5,7 @@ import datetime
 from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
     get_series_episode, region, tvdb_client
-from util import fix_session_bases
+from .util import fix_session_bases
 TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()
diff --git a/libs/subliminal_patch/subtitle.py b/libs/subliminal_patch/subtitle.py
index 057be546a..1dbd481fe 100644
--- a/libs/subliminal_patch/subtitle.py
+++ b/libs/subliminal_patch/subtitle.py
@@ -272,9 +272,9 @@ class Subtitle(Subtitle_):
         def prepare_text(text, style):
             body = []
             for fragment, sty in parse_tags(text, style, sub.styles):
-                fragment = fragment.replace(ur"\h", u" ")
-                fragment = fragment.replace(ur"\n", u"\n")
-                fragment = fragment.replace(ur"\N", u"\n")
+                fragment = fragment.replace(r"\h", u" ")
+                fragment = fragment.replace(r"\n", u"\n")
+                fragment = fragment.replace(r"\N", u"\n")
                 if format == "srt":
                     if sty.italic:
                         fragment = u"<i>%s</i>" % fragment
diff --git a/libs/subzero/lib/__init__.py b/libs/subzero/lib/__init__.py
index 2c1fe6987..9ffc2cdb3 100644
--- a/libs/subzero/lib/__init__.py
+++ b/libs/subzero/lib/__init__.py
@@ -1,2 +1,8 @@
-import dict, geezip, httpfake, io, json, rar, which
\ No newline at end of file
+from .dict import *
+from .geezip import *
+from .httpfake import *
+from .io import *
+from .json import *
+from .rar import *
+from .which import *
\ No newline at end of file
diff --git a/libs/subzero/lib/geezip.py b/libs/subzero/lib/geezip.py
index 89657834f..7609ef6be 100644
--- a/libs/subzero/lib/geezip.py
+++ b/libs/subzero/lib/geezip.py
@@ -28,7 +28,7 @@ class GeezipFile(gzip.GzipFile):
                 fileobj.write(self.compress.flush(Z_FINISH))
                 gzip.write32u(fileobj, self.crc)
                 # self.size may exceed 2GB, or even 4GB
-                gzip.write32u(fileobj, self.size & 0xffffffffL)
+                gzip.write32u(fileobj, self.size & 0xffffffff)
                 fileobj.flush()
         finally:
             myfileobj = self.myfileobj
diff --git a/libs/subzero/modification/__init__.py b/libs/subzero/modification/__init__.py
index d6008b17b..b4d3dd5dc 100644
--- a/libs/subzero/modification/__init__.py
+++ b/libs/subzero/modification/__init__.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-from registry import registry
-from mods import hearing_impaired, ocr_fixes, fps, offset, common, color
-from main import SubtitleModifications, SubMod
+from .registry import registry
+from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color
+from .main import SubtitleModifications, SubMod
diff --git a/libs/subzero/modification/dictionaries/__init__.py b/libs/subzero/modification/dictionaries/__init__.py
index 201b7409a..903b99f0d 100644
--- a/libs/subzero/modification/dictionaries/__init__.py
+++ b/libs/subzero/modification/dictionaries/__init__.py
@@ -1,3 +1,3 @@
 # coding=utf-8
-from data import data
\ No newline at end of file
+from .data import data
\ No newline at end of file diff --git a/libs/subzero/modification/dictionaries/data.py b/libs/subzero/modification/dictionaries/data.py index c56a0c809..a7ae3ee44 100644 --- a/libs/subzero/modification/dictionaries/data.py +++ b/libs/subzero/modification/dictionaries/data.py @@ -192,7 +192,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'WholeWords': {'data': OrderedDict([(u'l\xe2rt', u'l\xe4rt'), (u'hederv\xe5rda', u'hederv\xe4rda'), (u'storm\xe2stare', u'storm\xe4stare'), (u'Avf\xe2rd', u'Avf\xe4rd'), (u't\xe2lten', u't\xe4lten'), (u'\xe2rjag', u'\xe4r jag'), (u'\xe4rjag', u'\xe4r jag'), (u'j\xe2mlikar', u'j\xe4mlikar'), (u'Riskako\ufb02', u'Riskakor'), (u'Karamellen/', u'Karamellen'), (u'Lngen\xfcng', u'Ingenting'), (u'\xe4rju', u'\xe4r ju'), (u'S\xe1', u'S\xe5'), (u'n\xe4rjag', u'n\xe4r jag'), (u'alltjag', u'allt jag'), (u'g\xf6rjag', u'g\xf6r jag'), (u'trorjag', u'tror jag'), (u'varju', u'var ju'), (u'g\xf6rju', u'g\xf6r ju'), (u'kanju', u'kan ju'), (u'blirjag', u'blir jag'), (u's\xe4gerjag', u's\xe4ger jag'), (u'beh\xe5llerjag', u'beh\xe5ller jag'), (u'pr\xf8blem', u'problem'), (u'r\xe4ddadeju', u'r\xe4ddade ju'), (u'hon\xf8m', u'honom'), (u'Ln', u'In'), (u'sv\xe5r\ufb02\xf6rtad', u'sv\xe5rfl\xf6rtad'), (u'\xf8ch', u'och'), (u'\ufb02\xf6rtar', u'fl\xf6rtar'), (u'k\xe4nnerjag', u'k\xe4nner jag'), (u'\ufb02ickan', u'flickan'), (u'sn\xf8', u'sn\xf6'), (u'gerju', u'ger ju'), (u'k\xf8ntakter', u'kontakter'), (u'\xf8lycka', u'olycka'), (u'n\xf8lla', u'nolla'), (u'sinnenajublar', u'sinnena jublar'), (u'ijobbet', u'i jobbet'), (u'F\xe5rjag', u'F\xe5r jag'), (u'Ar', u'\xc4r'), (u'liggerju', u'ligger ju'), (u'um', u'om'), (u'lbland', u'Ibland'), (u'skjuterjag', u'skjuter jag'), (u'Vadd\xe5', u'Vad d\xe5'), (u'pratarj\xe4mt', u'pratar j\xe4mt'), (u'harju', u'har ju'), (u'sitterjag', u'sitter jag'), (u'h\xe4\ufb02a', u'h\xe4rja'), (u's\ufb01\xe4l', u'stj\xe4l'), (u'F\xd6U', u'F\xf6lj'), (u'varf\xf6rjag', u'varf\xf6r jag'), (u's\ufb01\xe4rna', u'stj\xe4rna'), (u'b\xf6\ufb02ar', u'b\xf6rjar'), (u'b\xf6\ufb02an', u'b\xf6rjan'), (u'st\xe4ri', u'st\xe5r'), (u'p\xe4', u'p\xe5'), (u'harjag', u'har jag'), (u'attjag', u'att jag'), (u'Verkarjag', u'Verkar jag'), (u'K\xe4nnerjag', u'K\xe4nner jag'), (u'd\xe4rjag', u'd\xe4r jag'), (u'tu\ufb01', u'tuff'), (u'lurarjag', u'lurar jag'), (u'varj\xe4ttebra', u'var j\xe4ttebra'), (u'allvan', u'allvar'), (u'deth\xe4r', u'det h\xe4r'), (u'va\ufb02e', u'varje'), (u'F\xf6Uer', u'F\xf6ljer'), (u'personalm\xf6tetl', u'personalm\xf6tet!'), (u'harjust', u'har just'), (u'\xe4rj\xe4tteduktig', u'\xe4r j\xe4tteduktig'), (u'd\xe4rja', u'd\xe4r ja'), (u'lngen\xfcng', u'lngenting'), (u'iluften', u'i luften'), (u'\xf6sen', u'\xf6ser'), (u'tv\xe2', u'tv\xe5'), (u'Uejerna', u'Tjejerna'), (u'h\xe5n*', u'h\xe5rt'), (u'\xc4rjag', u'\xc4r jag'), (u'keL', u'Okej'), (u'F\xf6rjag', u'F\xf6r jag'), (u'varj\xe4ttekul', u'var j\xe4ttekul'), (u'k\xe4mpan', u'k\xe4mpar'), (u'mycketjobb', u'mycket jobb'), (u'Uus', u'ljus'), (u'serjag', u'ser jag'), (u'vetjag', u'vet jag'), (u'f\xe5rjag', u'f\xe5r jag'), (u'hurjag', u'hur jag'), (u'f\xf6rs\xf6kerjag', u'f\xf6rs\xf6ker jag'), (u't\xe1nagel', u't\xe5nagel'), (u'va\xfce', u'varje'), (u'Uudet', u'ljudet'), (u'amhopa', u'allihopa'), (u'V\xe4\xfc', u'V\xe4lj'), (u'g\xe4ri', u'g\xe5r'), (u'r\xf6d\xfcus', u'r\xf6dljus'), (u'Uuset', u'ljuset'), (u'Rid\xe0n', u'Rid\xe5n'), (u'vi\xfca', u'vilja'), (u'g\xe5ri', u'g\xe5r i'), (u'Hurd\xe5', u'Hur d\xe5'), (u'inter\\/juar', u'intervjuar'), (u'menarjag', u'menar jag'), 
(u'spyrjag', u'spyr jag'), (u'bri\xfcera', u'briljera'), (u'N\xe4rjag', u'N\xe4r jag'), (u'ner\\/\xf6s', u'nerv\xf6s'), (u'ilivets', u'i livets'), (u'n\xe4got', u'n\xe5got'), (u'p\xe0', u'p\xe5'), (u'Lnnan', u'Innan'), (u'Uf', u'Ut'), (u'lnnan', u'Innan'), (u'D\xe0ren', u'D\xe5ren'), (u'F\xe0rjag', u'F\xe5r jag'), (u'Vad\xe4rdetd\xe4L', u'Vad \xe4r det d\xe4r'), (u'sm\xe0tjuv', u'sm\xe5tjuv'), (u't\xe0gr\xe5nare', u't\xe5gr\xe5nare'), (u'dit\xe0t', u'dit\xe5t'), (u's\xe4', u's\xe5'), (u'v\xe0rdsl\xf6sa', u'v\xe5rdsl\xf6sa'), (u'n\xe0n', u'n\xe5n'), (u'kommerjag', u'kommer jag'), (u'\xe4rj\xe4ttebra', u'\xe4r j\xe4ttebra'), (u'\xe4rj\xe4vligt', u'\xe4r j\xe4vligt'), (u'\xe0kerjag', u'\xe5ker jag'), (u'ellerjapaner', u'eller japaner'), (u'attjaga', u'att jaga'), (u'eften', u'efter'), (u'h\xe4stan', u'h\xe4star'), (u'Lntensivare', u'Intensivare'), (u'fr\xe0garjag', u'fr\xe5gar jag'), (u'pen/ers', u'pervers'), (u'r\xe0barkade', u'r\xe5barkade'), (u'styrkon', u'styrkor'), (u'Dif\xe5f', u'Dit\xe5t'), (u'h\xe4nden', u'h\xe4nder'), (u'f\xf6\ufb01a', u'f\xf6lja'), (u'Idioten/', u'Idioter!'), (u'Varf\xf6rjagade', u'Varf\xf6r jagade'), (u'd\xe4rf\xf6rjag', u'd\xe4rf\xf6r jag'), (u'forjag', u'for jag'), (u'Iivsgladje', u'livsgl\xe4dje'), (u'narjag', u'n\xe4r jag'), (u'sajag', u'sa jag'), (u'genastja', u'genast ja'), (u'rockument\xe0ren', u'rockument\xe4ren'), (u'turne', u'turn\xe9'), (u'fickjag', u'fick jag'), (u'sager', u's\xe4ger'), (u'Ijush\xe5rig', u'ljush\xe5rig'), (u'tradg\xe5rdsolycka', u'tr\xe4dg\xe5rdsolycka'), (u'kvavdes', u'kv\xe4vdes'), (u'd\xe0rja', u'd\xe4r ja'), (u'hedersgaster', u'hedersg\xe4ster'), (u'Nar', u'N\xe4r'), (u'smaki\xf6sa', u'smakl\xf6sa'), (u'lan', u'Ian'), (u'Lan', u'Ian'), (u'eri', u'er i'), (u'universitetsamne', u'universitets\xe4mne'), (u'garna', u'g\xe4rna'), (u'ar', u'\xe4r'), (u'baltdjur', u'b\xe4ltdjur'), (u'varjag', u'var jag'), (u'\xe0r', u'\xe4r'), (u'f\xf6rf\xf6rst\xe0rkare', u'f\xf6rf\xf6rst\xe4rkare'), (u'arjattespeciell', u'\xe4r j\xe4ttespeciell'), (u'h\xe0rg\xe5r', u'h\xe4r g\xe5r'), (u'Ia', u'la'), (u'Iimousinen', u'limousinen'), (u'krickettra', u'krickettr\xe4'), (u'h\xe5rdrockv\xe0rlden', u'h\xe5rdrockv\xe4rlden'), (u'tr\xe0bit', u'tr\xe4bit'), (u'Mellanvastern', u'Mellanv\xe4stern'), (u'arju', u'\xe4r ju'), (u'turnen', u'turn\xe9n'), (u'kanns', u'k\xe4nns'), (u'battre', u'b\xe4ttre'), (u'v\xe0rldsturne', u'v\xe4rldsturne'), (u'dar', u'd\xe4r'), (u'sj\xe0lvant\xe0nder', u'sj\xe4lvant\xe4nder'), (u'jattelange', u'j\xe4ttel\xe4nge'), (u'berattade', u'ber\xe4ttade'), (u'S\xe4', u'S\xe5'), (u'vandpunkten', u'v\xe4ndpunkten'), (u'N\xe0rjag', u'N\xe4r jag'), (u'lasa', u'l\xe4sa'), (u'skitl\xe0skigt', u'skitl\xe4skigt'), (u'sambandsv\xe0g', u'sambandsv\xe4g'), (u'valdigt', u'v\xe4ldigt'), (u'Stamga\ufb01el', u'St\xe4mgaffel'), (u'\xe0rjag', u'\xe4r jag'), (u'tajming', u'tajmning'), (u'utg\xe4ng', u'utg\xe5ng'), (u'H\xe0r\xe5t', u'H\xe4r\xe5t'), (u'h\xe0r\xe5t', u'h\xe4r\xe5t'), (u'anvander', u'anv\xe4nder'), (u'harjobbat', u'har jobbat'), (u'imageide', u'imageid\xe9'), (u'kla\ufb01en', u'klaffen'), (u'sjalv', u'sj\xe4lv'), (u'dvarg', u'dv\xe4rg'), (u'detjag', u'det jag'), (u'dvargarna', u'dv\xe4rgarna'), (u'fantasiv\xe0rld', u'fantasiv\xe4rld'), (u'\ufb01olliga', u'Fjolliga'), (u'mandoiinstr\xe0ngar', u'mandollnstr\xe4ngar'), (u'mittjobb', u'mitt jobb'), (u'Skajag', u'Ska jag'), (u'landari', u'landar i'), (u'gang', u'g\xe4ng'), (u'Detjag', u'Det jag'), (u'Narmre', u'N\xe4rmre'), (u'I\xe5tjavelni', u'l\xe5tj\xe4veln'), (u'H\xe5llerjag', u'H\xe5ller jag'), 
(u'visionarer', u'vision\xe4rer'), (u'T\xfclvad', u'Till vad'), (u'milit\xe0rbas', u'milit\xe4rbas'), (u'jattegiada', u'j\xe4tteglada'), (u'Fastjag', u'Fast jag'), (u's\xe5jag', u's\xe5 jag'), (u'rockvarlden', u'rockv\xe4rlden'), (u'saknarjag', u'saknar jag'), (u'allafall', u'alla fall'), (u'\ufb01anta', u'fjanta'), (u'Kr\xe0ma', u'Kr\xe4ma'), (u'stammer', u'st\xe4mmer'), (u'budb\xe0rare', u'budb\xe4rare'), (u'Iivsfiiosofi', u'livsfiiosofi'), (u'f\xf6rj\xe4mnan', u'f\xf6r j\xe4mnan'), (u'gillarjag', u'gillar jag'), (u'Iarvat', u'larvat'), (u'klararjag', u'klarar jag'), (u"hatta\ufb01'\xe0r", u'hattaff\xe4r'), (u'D\xe0', u'D\xe5'), (u'upp\ufb01nna', u'uppfinna'), (u'R\xe0ttf\xe5glar', u'R\xe5ttf\xe5glar'), (u'Sv\xe4\xfcboda', u'Sv\xe4ljboda'), (u'P\xe5b\xf6\ufb02ar', u'P\xe5b\xf6rjar'), (u'slutarju', u'slutar ju'), (u'ni\ufb01skebu\xfcken', u'i fiskebutiken'), (u'h\xe4rj\xe4keln', u'h\xe4r j\xe4keln'), (u'H\xdfppa', u'Hoppa'), (u'f\xf6rst\xf6rds', u'f\xf6rst\xf6rdes'), (u'varj\xe4ttegoda', u'var j\xe4ttegoda'), (u'Kor\\/', u'Korv'), (u'br\xfcl\xe9el', u'br\xfcl\xe9e!'), (u'Hei', u'Hej'), (u'\xe4lskarjordgubbsglass', u'\xe4lskar jordgubbsglass'), (u'Sn\xf6bom', u'Sn\xf6boll'), (u'Sn\xf6boH', u'Sn\xf6boll'), (u'Sn\xf6bol', u'Sn\xf6boll'), (u'sn\xf6boH', u'sn\xf6boll'), (u'L\xe4ggerp\xe5', u'L\xe4gger p\xe5'), (u'lnge\ufb02', u'lnget!'), (u'S\xe4gerj\xe4ttesmarta', u'S\xe4ger j\xe4ttesmarta'), (u'dopplen/\xe4derradar', u'dopplerv\xe4derradar'), (u's\xe4kertj\xe4ttefin', u's\xe4kert j\xe4ttefin'), (u'\xe4rj\xe4ttefin', u'\xe4r j\xe4ttefin'), (u'verkarju', u'verkar ju'), (u'blirju', u'blir ju'), (u'kor\\/', u'korv'), (u'naturkatastro\ufb01', u'naturkatastrof!'), (u'stickerjag', u'stickerj ag'), (u'j\xe4ttebu\ufb01\xe9', u'j\xe4ttebuff\xe9'), (u'be\ufb01nner', u'befinner'), (u'Sp\ufb02ng', u'Spring'), (u'trec\ufb01e', u'tredje'), (u'ryckerjag', u'rycker jag'), (u'skullejag', u'skulle jag'), (u'vetju', u'vet ju'), (u'a\ufb02jag', u'att jag'), (u'\ufb02nns', u'finns'), (u'\xe4rl\xe5ng', u'\xe4r l\xe5ng'), (u'k\xe5ra', u'k\xe4ra'), (u'\xe4r\ufb01na', u'\xe4r \ufb01na'), (u'\xe4ri', u'\xe4r i'), (u'h\xf6rden', u'h\xf6r den'), (u'\xe4ttj\xe4g', u'att j\xe4g'), (u'g\xe4r', u'g\xe5r'), (u'f\xf6ri', u'f\xf6r i'), (u'Hurvisste', u'Hur visste'), (u'\ufb01ck', u'fick'), (u'\ufb01nns', u'finns'), (u'\ufb01n', u'fin'), (u'Fa', u'Bra.'), (u'bori', u'bor i'), (u'fiendeplanl', u'fiendeplan!'), (u'if\xf6rnamn', u'i f\xf6rnamn'), (u'detju', u'det ju'), (u'N\xfcd', u'Niki'), (u'hatarjag', u'hatar jag'), (u'Klararjag', u'Klarar jag'), (u'deta\ufb01er', u'detaljer'), (u'v\xe4/', u'v\xe4l'), (u'smakarju', u'smakar ju'), (u'Teache\ufb02', u'Teacher!'), (u'imorse', u'i morse'), (u'drickerjag', u'dricker jag'), (u'st\xe5ri', u'st\xe5r i'), (u'Harjag', u'Har jag'), (u'Talarjag', u'Talar jag'), (u'undrarjag', u'undrar jag'), (u'\xe5lderjag', u'\xe5lder jag'), (u'va\ufb01e', u'varje'), (u'f\xf6rfalskningl', u'f\xf6rfalskning!'), (u'Vi\ufb01iiiam', u'William'), (u'V\\\ufb01lliams', u'Williams'), (u'attjobba', u'att jobba'), (u'intei', u'inte i'), (u'n\xe4rV\\\ufb01lliam', u'n\xe4r William'), (u'V\\\ufb01lliam', u'William'), (u'E\ufb01ersom', u'Eftersom'), (u'Vl\ufb01lliam', u'William'), (u'I\xe4ngejag', u'l\xe4nge jag'), (u"'\ufb01digare", u'Tidigare'), (u'b\xf6rjadei', u'b\xf6rjade i'), (u'merjust', u'mer just'), (u'e\ufb01er\xe5t', u'efter\xe5t'), (u'gjordejag', u'gjorde jag'), (u'hadeju', u'hade ju'), (u'g\xe5rvi', u'g\xe5r vi'), (u'k\xf6perjag', u'k\xf6per jag'), (u'M\xe5stejag', u'M\xe5ste jag'), (u'k\xe4nnerju', 
u'k\xe4nner ju'), (u'\ufb02n', u'fin'), (u'treviig', u'trevlig'), (u'Grattisl', u'Grattis!'), (u'kande', u'k\xe4nde'), (u"'llden", u'Tiden'), (u'sakjag', u'sak jag'), (u'klartjag', u'klart jag'), (u'h\xe4\ufb01igt', u'h\xe4ftigt'), (u'I\xe4mnarjag', u'l\xe4mnar jag'), (u'gickju', u'gick ju'), (u'skajag', u'ska jag'), (u'G\xf6rjag', u'G\xf6r jag'), (u'm\xe5stejag', u'm\xe5ste jag'), (u'gra\\/iditet', u'graviditet'), (u'hittadqdin', u'hittade din'), (u'\xe4rjobbigt', u'\xe4r jobbigt'), (u'Overdrivet', u'\xd6verdrivet'), (u'hOgtidlig', u'h\xf6gtidlig'), (u'Overtyga', u'\xd6vertyga'), (u'SKILSMASSA', u'SKILSM\xc4SSA'), (u'brukarju', u'brukar ju'), (u'lsabel', u'Isabel'), (u'kundejag', u'kunde jag'), (u'\xe4rl\xe4get', u'\xe4r l\xe4get'), (u'blirinte', u'blir inte'), (u'ijakt', u'i jakt'), (u'avjordens', u'av jordens'), (u'90000O', u'900000'), (u'9O0', u'900'), (u'\xe4rp\xe5', u'\xe4r p\xe5'), (u'\xe4rproteserna', u'\xe4r proteserna'), (u'\xe4rytterst', u'\xe4r ytterst'), (u'beborjorden', u'bebor jorden'), (u'filmjag', u'film jag'), (u'fokuserarp\xe5', u'fokuserar p\xe5'), (u'folkjag', u'folk jag'), (u'f\xf6rest\xe4lldejag', u'f\xf6rest\xe4llde jag'), (u'f\xf6rpubliken', u'f\xf6r publiken'), (u'gilladejag', u'gillade jag'), (u'h\xe5llerp\xe5', u'h\xe5ller p\xe5'), (u'harp\xe5', u'har p\xe5'), (u'harplaner', u'har planer'), (u'harprylar', u'har prylar'), (u'kommerpubliken', u'kommer publiken'), (u'kostymerp\xe5', u'kostymer p\xe5'), (u'litarp\xe5', u'litar p\xe5'), (u'lngen', u'Ingen'), (u'lnom', u'Inom'), (u'lnte', u'Inte'), (u'ochjag', u'och jag'), (u'Ochjag', u'Och jag'), (u'ochjorden', u'och jorden'), (u'omjag', u'om jag'), (u'Omjag', u'Om jag'), (u'passarperfekt', u'passar perfekt'), (u's\xe4ttetjag', u's\xe4ttet jag'), (u'silverp\xe5', u'silver p\xe5'), (u'skruvarjag', u'skruvar jag'), (u'somjag', u'som jag'), (u'Somjag', u'Som jag'), (u'talarp\xe5', u'talar p\xe5'), (u't\xe4nktejag', u't\xe4nkte jag'), (u'tapparjag', u'tappar jag'), (u'tittarp\xe5', u'tittar p\xe5'), (u'visstejag', u'visste jag'), (u'medjetpacks', u'med jetpacks'), (u's\xe4tterp\xe5', u's\xe4tter p\xe5'), (u'st\xe5rp\xe5', u'st\xe5r p\xe5'), (u'tillh\xf6rp\xe5', u'tillh\xf6r p\xe5')]), 'pattern': 
u"(?um)(\\b|^)(?:l\\\xe2rt|hederv\\\xe5rda|storm\\\xe2stare|Avf\\\xe2rd|t\\\xe2lten|\\\xe2rjag|\\\xe4rjag|j\\\xe2mlikar|Riskako\\\ufb02|Karamellen\\/|Lngen\\\xfcng|\\\xe4rju|S\\\xe1|n\\\xe4rjag|alltjag|g\\\xf6rjag|trorjag|varju|g\\\xf6rju|kanju|blirjag|s\\\xe4gerjag|beh\\\xe5llerjag|pr\\\xf8blem|r\\\xe4ddadeju|hon\\\xf8m|Ln|sv\\\xe5r\\\ufb02\\\xf6rtad|\\\xf8ch|\\\ufb02\\\xf6rtar|k\\\xe4nnerjag|\\\ufb02ickan|sn\\\xf8|gerju|k\\\xf8ntakter|\\\xf8lycka|n\\\xf8lla|sinnenajublar|ijobbet|F\\\xe5rjag|Ar|liggerju|um|lbland|skjuterjag|Vadd\\\xe5|pratarj\\\xe4mt|harju|sitterjag|h\\\xe4\\\ufb02a|s\\\ufb01\\\xe4l|F\\\xd6U|varf\\\xf6rjag|s\\\ufb01\\\xe4rna|b\\\xf6\\\ufb02ar|b\\\xf6\\\ufb02an|st\\\xe4ri|p\\\xe4|harjag|attjag|Verkarjag|K\\\xe4nnerjag|d\\\xe4rjag|tu\\\ufb01|lurarjag|varj\\\xe4ttebra|allvan|deth\\\xe4r|va\\\ufb02e|F\\\xf6Uer|personalm\\\xf6tetl|harjust|\\\xe4rj\\\xe4tteduktig|d\\\xe4rja|lngen\\\xfcng|iluften|\\\xf6sen|tv\\\xe2|Uejerna|h\\\xe5n\\*|\\\xc4rjag|keL|F\\\xf6rjag|varj\\\xe4ttekul|k\\\xe4mpan|mycketjobb|Uus|serjag|vetjag|f\\\xe5rjag|hurjag|f\\\xf6rs\\\xf6kerjag|t\\\xe1nagel|va\\\xfce|Uudet|amhopa|V\\\xe4\\\xfc|g\\\xe4ri|r\\\xf6d\\\xfcus|Uuset|Rid\\\xe0n|vi\\\xfca|g\\\xe5ri|Hurd\\\xe5|inter\\\\\\/juar|menarjag|spyrjag|bri\\\xfcera|N\\\xe4rjag|ner\\\\\\/\\\xf6s|ilivets|n\\\xe4got|p\\\xe0|Lnnan|Uf|lnnan|D\\\xe0ren|F\\\xe0rjag|Vad\\\xe4rdetd\\\xe4L|sm\\\xe0tjuv|t\\\xe0gr\\\xe5nare|dit\\\xe0t|s\\\xe4|v\\\xe0rdsl\\\xf6sa|n\\\xe0n|kommerjag|\\\xe4rj\\\xe4ttebra|\\\xe4rj\\\xe4vligt|\\\xe0kerjag|ellerjapaner|attjaga|eften|h\\\xe4stan|Lntensivare|fr\\\xe0garjag|pen\\/ers|r\\\xe0barkade|styrkon|Dif\\\xe5f|h\\\xe4nden|f\\\xf6\\\ufb01a|Idioten\\/|Varf\\\xf6rjagade|d\\\xe4rf\\\xf6rjag|forjag|Iivsgladje|narjag|sajag|genastja|rockument\\\xe0ren|turne|fickjag|sager|Ijush\\\xe5rig|tradg\\\xe5rdsolycka|kvavdes|d\\\xe0rja|hedersgaster|Nar|smaki\\\xf6sa|lan|Lan|eri|universitetsamne|garna|ar|baltdjur|varjag|\\\xe0r|f\\\xf6rf\\\xf6rst\\\xe0rkare|arjattespeciell|h\\\xe0rg\\\xe5r|Ia|Iimousinen|krickettra|h\\\xe5rdrockv\\\xe0rlden|tr\\\xe0bit|Mellanvastern|arju|turnen|kanns|battre|v\\\xe0rldsturne|dar|sj\\\xe0lvant\\\xe0nder|jattelange|berattade|S\\\xe4|vandpunkten|N\\\xe0rjag|lasa|skitl\\\xe0skigt|sambandsv\\\xe0g|valdigt|Stamga\\\ufb01el|\\\xe0rjag|tajming|utg\\\xe4ng|H\\\xe0r\\\xe5t|h\\\xe0r\\\xe5t|anvander|harjobbat|imageide|kla\\\ufb01en|sjalv|dvarg|detjag|dvargarna|fantasiv\\\xe0rld|\\\ufb01olliga|mandoiinstr\\\xe0ngar|mittjobb|Skajag|landari|gang|Detjag|Narmre|I\\\xe5tjavelni|H\\\xe5llerjag|visionarer|T\\\xfclvad|milit\\\xe0rbas|jattegiada|Fastjag|s\\\xe5jag|rockvarlden|saknarjag|allafall|\\\ufb01anta|Kr\\\xe0ma|stammer|budb\\\xe0rare|Iivsfiiosofi|f\\\xf6rj\\\xe4mnan|gillarjag|Iarvat|klararjag|hatta\\\ufb01\\'\\\xe0r|D\\\xe0|upp\\\ufb01nna|R\\\xe0ttf\\\xe5glar|Sv\\\xe4\\\xfcboda|P\\\xe5b\\\xf6\\\ufb02ar|slutarju|ni\\\ufb01skebu\\\xfcken|h\\\xe4rj\\\xe4keln|H\\\xdfppa|f\\\xf6rst\\\xf6rds|varj\\\xe4ttegoda|Kor\\\\\\/|br\\\xfcl\\\xe9el|Hei|\\\xe4lskarjordgubbsglass|Sn\\\xf6bom|Sn\\\xf6boH|Sn\\\xf6bol|sn\\\xf6boH|L\\\xe4ggerp\\\xe5|lnge\\\ufb02|S\\\xe4gerj\\\xe4ttesmarta|dopplen\\/\\\xe4derradar|s\\\xe4kertj\\\xe4ttefin|\\\xe4rj\\\xe4ttefin|verkarju|blirju|kor\\\\\\/|naturkatastro\\\ufb01|stickerjag|j\\\xe4ttebu\\\ufb01\\\xe9|be\\\ufb01nner|Sp\\\ufb02ng|trec\\\ufb01e|ryckerjag|skullejag|vetju|a\\\ufb02jag|\\\ufb02nns|\\\xe4rl\\\xe5ng|k\\\xe5ra|\\\xe4r\\\ufb01na|\\\xe4ri|h\\\xf6rden|\\\xe4ttj\\\xe4g|g\\\xe4r|f\\\xf6ri|Hurvisste|\\\ufb01ck|\\\ufb01nns|\\\ufb01n|Fa|bori|fiendeplanl|if\\\xf6rnamn|detju|N\\\xf
cd|hatarjag|Klararjag|deta\\\ufb01er|v\\\xe4\\/|smakarju|Teache\\\ufb02|imorse|drickerjag|st\\\xe5ri|Harjag|Talarjag|undrarjag|\\\xe5lderjag|va\\\ufb01e|f\\\xf6rfalskningl|Vi\\\ufb01iiiam|V\\\\\\\ufb01lliams|attjobba|intei|n\\\xe4rV\\\\\\\ufb01lliam|V\\\\\\\ufb01lliam|E\\\ufb01ersom|Vl\\\ufb01lliam|I\\\xe4ngejag|\\'\\\ufb01digare|b\\\xf6rjadei|merjust|e\\\ufb01er\\\xe5t|gjordejag|hadeju|g\\\xe5rvi|k\\\xf6perjag|M\\\xe5stejag|k\\\xe4nnerju|\\\ufb02n|treviig|Grattisl|kande|\\'llden|sakjag|klartjag|h\\\xe4\\\ufb01igt|I\\\xe4mnarjag|gickju|skajag|G\\\xf6rjag|m\\\xe5stejag|gra\\\\\\/iditet|hittadqdin|\\\xe4rjobbigt|Overdrivet|hOgtidlig|Overtyga|SKILSMASSA|brukarju|lsabel|kundejag|\\\xe4rl\\\xe4get|blirinte|ijakt|avjordens|90000O|9O0|\\\xe4rp\\\xe5|\\\xe4rproteserna|\\\xe4rytterst|beborjorden|filmjag|fokuserarp\\\xe5|folkjag|f\\\xf6rest\\\xe4lldejag|f\\\xf6rpubliken|gilladejag|h\\\xe5llerp\\\xe5|harp\\\xe5|harplaner|harprylar|kommerpubliken|kostymerp\\\xe5|litarp\\\xe5|lngen|lnom|lnte|ochjag|Ochjag|ochjorden|omjag|Omjag|passarperfekt|s\\\xe4ttetjag|silverp\\\xe5|skruvarjag|somjag|Somjag|talarp\\\xe5|t\\\xe4nktejag|tapparjag|tittarp\\\xe5|visstejag|medjetpacks|s\\\xe4tterp\\\xe5|st\\\xe5rp\\\xe5|tillh\\\xf6rp\\\xe5)(\\b|$)"}}}
-for lang, grps in data.iteritems():
-    for grp in grps.iterkeys():
+for lang, grps in data.items():
+    for grp in grps.keys():
         if data[lang][grp]["pattern"]:
             data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"])
diff --git a/libs/subzero/modification/main.py b/libs/subzero/modification/main.py
index 7d35c2e27..f1fa3fbfd 100644
--- a/libs/subzero/modification/main.py
+++ b/libs/subzero/modification/main.py
@@ -6,14 +6,14 @@ import pysubs2
 import logging
 import time
-from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
-from registry import registry
+from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
+from .registry import registry
 from subzero.language import Language
 logger = logging.getLogger(__name__)
-lowercase_re = re.compile(ur'(?sux)[a-zà-ž]')
+lowercase_re = re.compile(r'(?sux)[a-zà-ž]')
 class SubtitleModifications(object):
@@ -143,7 +143,7 @@ class SubtitleModifications(object):
                 continue
             # clear empty args
-            final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))
+            final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.items()))
             _data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
             if _data == mods_merged_log[identifier]["final_identifier"]:
@@ -180,7 +180,7 @@ class SubtitleModifications(object):
         entries_used = 0
         for entry in self.f:
             entry_used = False
-            for sub in entry.text.strip().split("\N"):
+            for sub in entry.text.strip().split(r"\N"):
                 # skip HI bracket entries, those might actually be lowercase
                 sub = sub.strip()
                 for processor in registry.mods["remove_HI"].processors[:4]:
@@ -272,7 +272,7 @@ class SubtitleModifications(object):
                 continue
             skip_entry = False
-            for line in t.split(ur"\N"):
+            for line in t.split(r"\N"):
                 # don't bother the mods with surrounding tags
                 old_line = line
                 line = line.strip()
@@ -377,7 +377,7 @@ class SubtitleModifications(object):
                 logger.debug(u"%d: %r -> ''", index, entry.text)
                 continue
-            new_text = ur"\N".join(lines)
+            new_text = r"\N".join(lines)
             # cheap man's approach to avoid open tags
             add_start_tags = []
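Two Python 2-isms recur throughout the main.py hunk above: tuple-unpacking lambda signatures, which PEP 3113 removed from Python 3, and dict.iteritems(), which Python 3 drops in favour of items(). (The "\N" split fix is related: in a Python 3 string literal, \N introduces a named escape, so the SSA line-break marker has to be spelled as a raw string.) A minimal sketch of the portable pattern, runnable on 2.7 and 3.x alike; the args dict here is a made-up stand-in for the real mod arguments:

    # Python 2 only -- a syntax error on Python 3:
    #     final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))

    args = {"h": 0, "m": 2, "s": 0}  # hypothetical mod arguments

    # Portable: index into the (key, value) tuple instead of unpacking it,
    # and use items(), which exists on both Python 2 and 3 dicts.
    final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.items()))

    # A dict comprehension says the same thing more idiomatically:
    assert final_mod_args == {k: v for k, v in args.items() if v}  # {'m': 2}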
diff --git a/libs/subzero/modification/mods/__init__.py b/libs/subzero/modification/mods/__init__.py
index aaf4c37e4..5766dccc0 100644
--- a/libs/subzero/modification/mods/__init__.py
+++ b/libs/subzero/modification/mods/__init__.py
@@ -95,7 +95,7 @@ class SubtitleTextModification(SubtitleModification):
     pass
-TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*"
+TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"
 EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")
 empty_line_post_processors = [
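The only substantive change in this hunk is dropping the u prefix: Python 3 rejects the combined ur"..." literal outright, since raw literals are already unicode there, so plain r"..." is the spelling that parses on both interpreters. A quick standalone check that the TAG pattern still behaves the same; the sample line is invented:

    import re

    # ur"..." would be a SyntaxError on Python 3; r"..." parses everywhere.
    TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"  # runs of {\i1}/{\b0}-style override tags

    # the tag run at the start of the line is consumed, the text is kept
    assert re.match(TAG + r"(\w+)", r"{\i1} hello").group(1) == "hello"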
diff --git a/libs/subzero/modification/mods/common.py b/libs/subzero/modification/mods/common.py
index 14c360937..e57e53cd7 100644
--- a/libs/subzero/modification/mods/common.py
+++ b/libs/subzero/modification/mods/common.py
@@ -22,10 +22,10 @@ class CommonFixes(SubtitleTextModification):
     processors = [
         # normalize hyphens
-        NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
+        NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
         # -- = em dash
-        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),
+        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"),
         # line = _/-/\s
         NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
@@ -37,23 +37,23 @@ class CommonFixes(SubtitleTextModification):
         NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
         # fix music symbols
-        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
+        NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
                      lambda x: u"♪ " if x.group(1) else u" ♪", name="CM_music_symbols"),
         # '' = "
-        NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
+        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
         # double quotes instead of single quotes inside words
-        NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"),
+        NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),
         # normalize quotes
-        NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
+        NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
                      lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
                      name="CM_normalize_quotes"),
         # normalize single quotes
-        NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
+        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
         # remove leading ...
         NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
@@ -89,8 +89,8 @@ class CommonFixes(SubtitleTextModification):
         # space before ending doublequote?
         # replace uppercase I with lowercase L in words
-        NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
-                     lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
+        NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
+                     lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
                      name="CM_uppercase_i_in_word"),
         # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
@@ -101,11 +101,11 @@ class CommonFixes(SubtitleTextModification):
                      name="CM_spaces_in_numbers"),
         # uppercase after dot
-        NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
-                     lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
+        NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
+                     lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
         # remove double interpunction
-        NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
+        NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
                      lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
                      name="CM_double_interpunct"),
@@ -149,14 +149,14 @@ class ReverseRTL(SubtitleModification):
     processors = [
         # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
-        #NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
+        #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
        #              name="CM_RTL_reverse")
-        NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
+        NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
                      name="CM_RTL_reverse")
     ]
-split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)")
+split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")
 class FixUppercase(SubtitleModification):
diff --git a/libs/subzero/modification/mods/hearing_impaired.py b/libs/subzero/modification/mods/hearing_impaired.py
index cb72d898c..c615b5679 100644
--- a/libs/subzero/modification/mods/hearing_impaired.py
+++ b/libs/subzero/modification/mods/hearing_impaired.py
@@ -26,71 +26,71 @@ class HearingImpaired(SubtitleTextModification):
     processors = [
         # full bracket entry, single or multiline; starting with brackets and ending with brackets
-        FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
+        FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
                                   "", name="HI_brackets_full"),
         # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
         # possibly with a dash in front; ignore anything ending with a quote
-        NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
-                                ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
+        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
+                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
                      name="HI_before_colon_caps"),
         # any text before colon (at least 3 chars); at start or after a sentence,
         # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
         # a space is inside the text; ignore anything ending with a quote
-        NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
-                                ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
+        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
+                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
                      lambda match: match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
                      else "" if not match.group(1).startswith(" ") else " ",
                      name="HI_before_colon_noncaps"),
         # brackets (only remove if at least 3 chars in brackets)
-        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
+        NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
                                 {"t": TAG}), "", name="HI_brackets"),
-        #NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
+        #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
        #              "", name="HI_bracket_open_start"),
-        #NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
+        #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
        #              name="HI_bracket_open_end"),
         # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
         # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
         # starting text before colon (at least 3 chars)
-        #NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
+        #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
        #              name="HI_before_colon"),
         # text in brackets at start, after optional dash, before colon or at end of line
         # fixme: may be too aggressive
-        #NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
+        #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
        #              name="HI_brackets_special"),
         # all caps line (at least 4 consecutive uppercase chars)
-        NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
+        NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
                      supported=lambda p: not p.only_uppercase),
         # remove MAN:
-        NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
+        NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
         # dash in front
         # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
         # all caps at start before new sentence
-        NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
+        NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                      name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
     ]
     post_processors = empty_line_post_processors
     last_processors = [
         # remove music symbols
-        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
+        NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
                      "", name="HI_music_symbols_only"),
         # remove music entries
-        NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
+        NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
                      "", name="HI_music"),
     ]
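Nothing in the hearing-impaired mod changes behaviour; every edit is the same mechanical ur'' to r'' conversion, and the patterns compile exactly as before. A standalone sanity check with plain re (the NReProcessor wrapper adds bookkeeping that isn't shown here), using the HI_brackets pattern from the hunk above; the sample subtitle line is invented:

    import re

    TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"  # from mods/__init__.py above

    # the HI_brackets pattern, now a plain raw literal
    hi_brackets = re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s'
                             % {"t": TAG})

    # the bracketed sound cue is stripped, the dialogue is kept
    print(hi_brackets.sub("", u"[DOOR CREAKS] Who's there?"))  # Who's there?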
index ec57ca006..ebf485bb7 100644
--- a/libs/subzero/modification/mods/ocr_fixes.py
+++ b/libs/subzero/modification/mods/ocr_fixes.py
@@ -38,11 +38,11 @@ class FixOCR(SubtitleTextModification):
         return [
             # remove broken HI tag colons (ANNOUNCER'., ". instead of :) after at least 3 uppercase chars
             # don't modify stuff inside quotes
-            NReProcessor(re.compile(ur'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)'
-                                    ur'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'),
+            NReProcessor(re.compile(r'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)'
+                                    r'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'),
                          r"\1:\3", name="OCR_fix_HI_colons", supported=lambda p: not p.only_uppercase),
             # fix F'bla
-            NReProcessor(re.compile(ur'(?u)(\bF)(\')([A-zÀ-ž]*\b)'), r"\1\3", name="OCR_fix_F"),
+            NReProcessor(re.compile(r'(?u)(\bF)(\')([A-zÀ-ž]*\b)'), r"\1\3", name="OCR_fix_F"),
             WholeLineProcessor(self.data_dict["WholeLines"], name="OCR_replace_line"),
             MultipleWordReProcessor(self.data_dict["WholeWords"], name="OCR_replace_word"),
             MultipleWordReProcessor(self.data_dict["BeginLines"], name="OCR_replace_beginline"),
diff --git a/libs/subzero/modification/mods/offset.py b/libs/subzero/modification/mods/offset.py
index 2e342c0a1..ca2ae30eb 100644
--- a/libs/subzero/modification/mods/offset.py
+++ b/libs/subzero/modification/mods/offset.py
@@ -31,7 +31,7 @@ class ShiftOffset(SubtitleModification):
             else:
                 new_args[key] = int(value)
-        return dict(filter(lambda (k, v): bool(v), new_args.iteritems()))
+        return dict(filter(lambda kv: bool(kv[1]), new_args.items()))
     def modify(self, content, debug=False, parent=None, **kwargs):
         parent.f.shift(h=int(kwargs.get("h", 0)), m=int(kwargs.get("m", 0)), s=int(kwargs.get("s", 0)),
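offset.py gets the same treatment as main.py: the tuple-unpacking lambda goes away and iteritems() becomes items(). The surviving helper keeps only the non-zero shift components, which is easy to sanity-check in isolation (clean_shift_args is a hypothetical name for this sketch; the real method appears to special-case at least one key before the int() coercion):

    def clean_shift_args(**kwargs):
        # coerce to int, then drop zero components, like the hunk above
        new_args = {key: int(value) for key, value in kwargs.items()}
        return dict(filter(lambda kv: bool(kv[1]), new_args.items()))

    print(clean_shift_args(h="0", m="2", s="0"))  # {'m': 2}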
diff --git a/libs/yaml/__init__.py b/libs/yaml2.7/__init__.py
index e7a419dd2..e7a419dd2 100644
--- a/libs/yaml/__init__.py
+++ b/libs/yaml2.7/__init__.py
diff --git a/libs/yaml/composer.py b/libs/yaml2.7/composer.py
index df85ef653..df85ef653 100644
--- a/libs/yaml/composer.py
+++ b/libs/yaml2.7/composer.py
diff --git a/libs/yaml/constructor.py b/libs/yaml2.7/constructor.py
index 516dad1ce..516dad1ce 100644
--- a/libs/yaml/constructor.py
+++ b/libs/yaml2.7/constructor.py
diff --git a/libs/yaml/cyaml.py b/libs/yaml2.7/cyaml.py
index ebb895935..ebb895935 100644
--- a/libs/yaml/cyaml.py
+++ b/libs/yaml2.7/cyaml.py
diff --git a/libs/yaml/dumper.py b/libs/yaml2.7/dumper.py
index f9cd49fda..f9cd49fda 100644
--- a/libs/yaml/dumper.py
+++ b/libs/yaml2.7/dumper.py
diff --git a/libs/yaml/emitter.py b/libs/yaml2.7/emitter.py
index 9561a8274..9561a8274 100644
--- a/libs/yaml/emitter.py
+++ b/libs/yaml2.7/emitter.py
diff --git a/libs/yaml/error.py b/libs/yaml2.7/error.py
index 577686db5..577686db5 100644
--- a/libs/yaml/error.py
+++ b/libs/yaml2.7/error.py
diff --git a/libs/yaml/events.py b/libs/yaml2.7/events.py
index f79ad389c..f79ad389c 100644
--- a/libs/yaml/events.py
+++ b/libs/yaml2.7/events.py
diff --git a/libs/yaml/loader.py b/libs/yaml2.7/loader.py
index a79182eaf..a79182eaf 100644
--- a/libs/yaml/loader.py
+++ b/libs/yaml2.7/loader.py
diff --git a/libs/yaml/nodes.py b/libs/yaml2.7/nodes.py
index c4f070c41..c4f070c41 100644
--- a/libs/yaml/nodes.py
+++ b/libs/yaml2.7/nodes.py
diff --git a/libs/yaml/parser.py b/libs/yaml2.7/parser.py
index f9e3057f3..f9e3057f3 100644
--- a/libs/yaml/parser.py
+++ b/libs/yaml2.7/parser.py
diff --git a/libs/yaml/reader.py b/libs/yaml2.7/reader.py
index b2f10b091..b2f10b091 100644
--- a/libs/yaml/reader.py
+++ b/libs/yaml2.7/reader.py
diff --git a/libs/yaml/representer.py b/libs/yaml2.7/representer.py
index 9dca41af7..9dca41af7 100644
--- a/libs/yaml/representer.py
+++ b/libs/yaml2.7/representer.py
diff --git a/libs/yaml/resolver.py b/libs/yaml2.7/resolver.py
index 528fbc0ea..528fbc0ea 100644
--- a/libs/yaml/resolver.py
+++ b/libs/yaml2.7/resolver.py
diff --git a/libs/yaml/scanner.py b/libs/yaml2.7/scanner.py
index 5126cf07b..5126cf07b 100644
--- a/libs/yaml/scanner.py
+++ b/libs/yaml2.7/scanner.py
diff --git a/libs/yaml/serializer.py b/libs/yaml2.7/serializer.py
index 0bf1e96dc..0bf1e96dc 100644
--- a/libs/yaml/serializer.py
+++ b/libs/yaml2.7/serializer.py
diff --git a/libs/yaml/tokens.py b/libs/yaml2.7/tokens.py
index 4d0b48a39..4d0b48a39 100644
--- a/libs/yaml/tokens.py
+++ b/libs/yaml2.7/tokens.py