| author    | Louis Vézina <[email protected]> | 2019-11-25 19:50:16 -0500 |
|-----------|------------------------------------------|---------------------------|
| committer | Louis Vézina <[email protected]> | 2019-11-25 19:50:16 -0500 |
| commit    | 4a2ed8830d0fc05e92b9ccc5b3694d05e3a3ea98 (patch) | |
| tree      | e768b9eee6ea0163c2d5c2e5a94bbd47df8cad56 | |
| parent    | 672e69e88780d0a0d7e58d1f127662e8f7a9a311 (diff) | |
| parent    | 4016ed1d746cf21bedecd13aeb770abf4bc53b39 (diff) | |
| download  | bazarr-4a2ed8830d0fc05e92b9ccc5b3694d05e3a3ea98.tar.gz, bazarr-4a2ed8830d0fc05e92b9ccc5b3694d05e3a3ea98.zip | |
Merge branch 'python3' into development
313 files changed, 25785 insertions(+), 3497 deletions(-)
@@ -1,5 +1,10 @@
 # coding=utf-8
+from __future__ import absolute_import
+from __future__ import print_function
+
+import bazarr.libs
+from six import PY3
 import subprocess as sp
 import time
 import os
@@ -13,14 +18,19 @@ from bazarr.get_args import args
 
 def check_python_version():
     python_version = platform.python_version_tuple()
     minimum_python_version_tuple = (2, 7, 13)
+    minimum_python3_version_tuple = (3, 6, 0)
     minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
+    minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
 
-    if int(python_version[0]) > minimum_python_version_tuple[0]:
-        print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
-        os._exit(0)
+    if int(python_version[0]) == minimum_python3_version_tuple[0]:
+        if int(python_version[1]) >= minimum_python3_version_tuple[1]:
+            pass
+        else:
+            print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
+            os._exit(0)
     elif int(python_version[1]) < minimum_python_version_tuple[1] or int(re.search(r'\d+', python_version[2]).group()) < minimum_python_version_tuple[2]:
-        print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
+        print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)
@@ -33,10 +43,13 @@ def start_bazarr():
     script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
     ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
-    print "Bazarr starting..."
+    print("Bazarr starting...")
     try:
         for line in iter(ep.stdout.readline, ''):
-            sys.stdout.write(line)
+            if PY3:
+                sys.stdout.buffer.write(line)
+            else:
+                sys.stdout.write(line)
     except KeyboardInterrupt:
         pass
@@ -61,16 +74,16 @@ if __name__ == '__main__':
         try:
             os.remove(stopfile)
         except:
-            print 'Unable to delete stop file.'
+            print('Unable to delete stop file.')
         else:
-            print 'Bazarr exited.'
+            print('Bazarr exited.')
             os._exit(0)
 
     if os.path.exists(restartfile):
         try:
             os.remove(restartfile)
         except:
-            print 'Unable to delete restart file.'
+            print('Unable to delete restart file.')
         else:
             start_bazarr()
diff --git a/bazarr/analytics.py b/bazarr/analytics.py
index 264a9c91c..66157ece8 100644
--- a/bazarr/analytics.py
+++ b/bazarr/analytics.py
@@ -1,6 +1,7 @@
 # coding=utf-8
 
-import cPickle as pickle
+from __future__ import absolute_import
+import six.moves.cPickle as pickle
 import base64
 import random
 import platform
@@ -28,14 +29,14 @@ def track_event(category=None, action=None, label=None):
 
     try:
         if settings.analytics.visitor:
-            visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
+            visitor = pickle.loads(base64.b64decode(settings.analytics.visitor), encoding='utf-8')
             if visitor.user_agent is None:
                 visitor.user_agent = os.environ.get("SZ_USER_AGENT")
             if visitor.unique_id > int(0x7fffffff):
                 visitor.unique_id = random.randint(0, 0x7fffffff)
     except:
         visitor = Visitor()
-        visitor.unique_id = long(random.randint(0, 0x7fffffff))
+        visitor.unique_id = random.randint(0, 0x7fffffff)
 
     session = Session()
     event = Event(category=category, action=action, label=label, value=1)
diff --git a/bazarr/check_update.py b/bazarr/check_update.py
index d9dd3faa9..965973828 100644
--- a/bazarr/check_update.py
+++ b/bazarr/check_update.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import json
diff --git a/bazarr/config.py b/bazarr/config.py
index c123c1987..deea48731 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 
 from simpleconfigparser import simpleconfigparser
diff --git a/bazarr/embedded_subs_reader.py b/bazarr/embedded_subs_reader.py
index 9938a5f4a..3cc5c98b4 100644
--- a/bazarr/embedded_subs_reader.py
+++ b/bazarr/embedded_subs_reader.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import enzyme
 from enzyme.exceptions import MalformedMKVError
 import logging
diff --git a/bazarr/get_args.py b/bazarr/get_args.py
index a92052729..3eb99c2a9 100644
--- a/bazarr/get_args.py
+++ b/bazarr/get_args.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import argparse
diff --git a/bazarr/get_episodes.py b/bazarr/get_episodes.py
index 018a4560f..6bdf32202 100644
--- a/bazarr/get_episodes.py
+++ b/bazarr/get_episodes.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging
@@ -164,7 +165,7 @@ def sync_episodes():
     for i, altered_episode in enumerate(altered_episodes, 1):
         notifications.write(msg='Indexing episodes embedded subtitles...', queue='get_episodes', item=i,
                             length=len(altered_episodes))
-        store_subtitles(path_replace(altered_episode[1]))
+        store_subtitles(altered_episode[1], path_replace(altered_episode[1]))
 
     logging.debug('BAZARR All episodes synced from Sonarr into database.')
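
The new version gate above accepts Python 2 at 2.7.13 or newer, or Python 3 at 3.6.0 or newer, instead of rejecting Python 3 outright. A minimal standalone sketch of the same check (not the exact Bazarr code; names are illustrative):

import platform
import re
import sys

MINIMUM = {2: (2, 7, 13), 3: (3, 6, 0)}

def check_python_version():
    # python_version_tuple() returns strings, and the patch part may carry
    # a suffix (e.g. '13+'), hence the regex before int().
    major, minor, patch = platform.python_version_tuple()
    current = (int(major), int(minor), int(re.search(r'\d+', patch).group()))
    minimum = MINIMUM.get(current[0], MINIMUM[3])
    if current < minimum:
        print("Python %s or greater required. Current version is %s. Please upgrade Python."
              % (".".join(str(i) for i in minimum), platform.python_version()))
        sys.exit(1)

check_python_version()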
diff --git a/bazarr/get_languages.py b/bazarr/get_languages.py
index 83aed02e1..4854a43c3 100644
--- a/bazarr/get_languages.py
+++ b/bazarr/get_languages.py
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import pycountry
 
@@ -10,25 +11,23 @@ from database import database
 
 def load_language_in_db():
     # Get languages list in langs tuple
-    langs = [{'code3': lang.alpha_3, 'code2': lang.alpha_2, 'name': lang.name}
+    langs = [[lang.alpha_3, lang.alpha_2, lang.name]
              for lang in pycountry.languages
              if hasattr(lang, 'alpha_2')]
 
     # Insert languages in database table
-    for lang in langs:
-        database.execute("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) VALUES (?, ?, ?)",
-                         (lang['code3'], lang['code2'], lang['name']))
+    database.execute("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) VALUES (?, ?, ?)",
+                     langs, execute_many=True)
 
     database.execute("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) "
                      "VALUES ('pob', 'pb', 'Brazilian Portuguese')")
 
-    langs = [{'code3b': lang.bibliographic, 'code3': lang.alpha_3}
+    langs = [[lang.bibliographic, lang.alpha_3]
              for lang in pycountry.languages
              if hasattr(lang, 'alpha_2') and hasattr(lang, 'bibliographic')]
 
     # Update languages in database table
-    for lang in langs:
-        database.execute("UPDATE table_settings_languages SET code3b=? WHERE code3=?", (lang['code3b'], lang['code3']))
+    database.execute("UPDATE table_settings_languages SET code3b=? WHERE code3=?", langs, execute_many=True)
 
 
 def language_from_alpha2(lang):
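
The execute_many=True flag belongs to Bazarr's own database wrapper; with the standard sqlite3 module the equivalent batched write looks roughly like this:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE table_settings_languages (code3 TEXT PRIMARY KEY, code2 TEXT, name TEXT)")

# One executemany() call replaces the per-row loop the old code used.
langs = [["fra", "fr", "French"], ["deu", "de", "German"]]
conn.executemany("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) VALUES (?, ?, ?)",
                 langs)
conn.commit()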
diff --git a/bazarr/get_movies.py b/bazarr/get_movies.py
index b8f59a12d..6ce8dfd31 100644
--- a/bazarr/get_movies.py
+++ b/bazarr/get_movies.py
@@ -1,6 +1,8 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
+import six
 import requests
 import logging
 from queueconfig import notifications
@@ -75,7 +77,7 @@ def update_movies():
 
                     if movie["path"] != None and movie['movieFile']['relativePath'] != None:
                         try:
-                            overview = unicode(movie['overview'])
+                            overview = six.text_type(movie['overview'])
                         except:
                             overview = ""
                         try:
@@ -129,27 +131,27 @@ def update_movies():
                             audioCodec = None
 
                         # Add movies in radarr to current movies list
-                        current_movies_radarr.append(unicode(movie['tmdbId']))
+                        current_movies_radarr.append(six.text_type(movie['tmdbId']))
 
-                        if unicode(movie['tmdbId']) in current_movies_db_list:
+                        if six.text_type(movie['tmdbId']) in current_movies_db_list:
                             movies_to_update.append({'radarrId': movie["id"],
-                                                     'title': unicode(movie["title"]),
-                                                     'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
-                                                     'tmdbId': unicode(movie["tmdbId"]),
-                                                     'poster': unicode(poster),
-                                                     'fanart': unicode(fanart),
-                                                     'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
+                                                     'title': six.text_type(movie["title"]),
+                                                     'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
+                                                     'tmdbId': six.text_type(movie["tmdbId"]),
+                                                     'poster': six.text_type(poster),
+                                                     'fanart': six.text_type(fanart),
+                                                     'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
                                                      'sceneName': sceneName,
-                                                     'monitored': unicode(bool(movie['monitored'])),
-                                                     'year': unicode(movie['year']),
-                                                     'sortTitle': unicode(movie['sortTitle']),
-                                                     'alternativeTitles': unicode(alternativeTitles),
-                                                     'format': unicode(format),
-                                                     'resolution': unicode(resolution),
-                                                     'video_codec': unicode(videoCodec),
-                                                     'audio_codec': unicode(audioCodec),
-                                                     'overview': unicode(overview),
-                                                     'imdbId': unicode(imdbId),
+                                                     'monitored': six.text_type(bool(movie['monitored'])),
+                                                     'year': six.text_type(movie['year']),
+                                                     'sortTitle': six.text_type(movie['sortTitle']),
+                                                     'alternativeTitles': six.text_type(alternativeTitles),
+                                                     'format': six.text_type(format),
+                                                     'resolution': six.text_type(resolution),
+                                                     'video_codec': six.text_type(videoCodec),
+                                                     'audio_codec': six.text_type(audioCodec),
+                                                     'overview': six.text_type(overview),
+                                                     'imdbId': six.text_type(imdbId),
                                                      'movie_file_id': movie['movieFile']['id']})
                         else:
                             if movie_default_enabled is True:
                                 movies_to_add.append({'radarrId': movie["id"],
@@ -165,7 +167,7 @@ def update_movies():
                                                       'fanart': fanart,
                                                       'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                                       'sceneName': sceneName,
-                                                      'monitored': unicode(bool(movie['monitored'])),
+                                                      'monitored': six.text_type(bool(movie['monitored'])),
                                                       'sortTitle': movie['sortTitle'],
                                                       'year': movie['year'],
                                                       'alternativeTitles': alternativeTitles,
@@ -189,7 +191,7 @@ def update_movies():
                                                       'fanart': fanart,
                                                       'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                                       'sceneName': sceneName,
-                                                      'monitored': unicode(bool(movie['monitored'])),
+                                                      'monitored': six.text_type(bool(movie['monitored'])),
                                                       'sortTitle': movie['sortTitle'],
                                                       'year': movie['year'],
                                                       'alternativeTitles': alternativeTitles,
@@ -251,7 +253,7 @@ def update_movies():
     for i, altered_movie in enumerate(altered_movies, 1):
         notifications.write(msg='Indexing movies embedded subtitles...', queue='get_movies', item=i,
                             length=len(altered_movies))
-        store_subtitles_movie(path_replace_movie(altered_movie[1]))
+        store_subtitles_movie(altered_movie[1], path_replace_movie(altered_movie[1]))
 
     logging.debug('BAZARR All movies synced from Radarr into database.')
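
The unicode() builtin is gone in Python 3; six.text_type, used throughout the hunks above, aliases unicode on Python 2 and str on Python 3, so the same call site produces text under both interpreters:

import six

# six.text_type is unicode on Python 2 and str on Python 3.
monitored = six.text_type(bool(1))   # u'True' / 'True'
tmdb_id = six.text_type(12345)       # u'12345' / '12345'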
Exception info: %r", name, throttle_description, + throttle_until.strftime("%y/%m/%d %H:%M"), cls_name, exception.args[0]) def throttled_count(name): global throttle_count - if name in throttle_count.keys(): - if 'count' in throttle_count[name].keys(): + if name in list(throttle_count.keys()): + if 'count' in list(throttle_count[name].keys()): for key, value in throttle_count[name].items(): if key == 'count': value += 1 diff --git a/bazarr/get_series.py b/bazarr/get_series.py index daf7b6cf7..8d826c977 100644 --- a/bazarr/get_series.py +++ b/bazarr/get_series.py @@ -1,5 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import print_function import os import requests import logging @@ -12,6 +14,7 @@ from config import settings, url_sonarr from list_subtitles import list_missing_subtitles from database import database, dict_converter from utils import get_sonarr_version +import six from helper import path_replace @@ -60,7 +63,7 @@ def update_series(): for i, show in enumerate(r.json(), 1): notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength) try: - overview = unicode(show['overview']) + overview = six.text_type(show['overview']) except: overview = "" try: @@ -82,17 +85,17 @@ def update_series(): current_shows_sonarr.append(show['tvdbId']) if show['tvdbId'] in current_shows_db_list: - series_to_update.append({'title': unicode(show["title"]), - 'path': unicode(show["path"]), + series_to_update.append({'title': six.text_type(show["title"]), + 'path': six.text_type(show["path"]), 'tvdbId': int(show["tvdbId"]), 'sonarrSeriesId': int(show["id"]), - 'overview': unicode(overview), - 'poster': unicode(poster), - 'fanart': unicode(fanart), - 'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if sonarr_version.startswith('2') else show['languageProfileId']), audio_profiles)), - 'sortTitle': unicode(show['sortTitle']), - 'year': unicode(show['year']), - 'alternateTitles': unicode(alternateTitles)}) + 'overview': six.text_type(overview), + 'poster': six.text_type(poster), + 'fanart': six.text_type(fanart), + 'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)), + 'sortTitle': six.text_type(show['sortTitle']), + 'year': six.text_type(show['year']), + 'alternateTitles': six.text_type(alternateTitles)}) else: if serie_default_enabled is True: series_to_add.append({'title': show["title"], @@ -127,7 +130,7 @@ def update_series(): for series in removed_series: database.execute("DELETE FROM table_shows WHERE tvdbId=?",(series,)) - + # Update existing series in DB series_in_db_list = [] series_in_db = database.execute("SELECT title, path, tvdbId, sonarrSeriesId, overview, poster, fanart, " diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index b75a8949e..81d2e8a6e 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -1,12 +1,13 @@ # coding=utf-8 +from __future__ import absolute_import import os import sys import ast import logging import subprocess import time -import cPickle as pickle +import six.moves.cPickle as pickle import codecs import types import re @@ -36,6 +37,9 @@ from pyprobe.pyprobe import VideoFileParser from database import database, dict_mapper from analytics import track_event +import six +from six.moves import range +from functools import reduce def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"): @@ 
diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py
index b75a8949e..81d2e8a6e 100644
--- a/bazarr/get_subtitle.py
+++ b/bazarr/get_subtitle.py
@@ -1,12 +1,13 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import sys
 import ast
 import logging
 import subprocess
 import time
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import codecs
 import types
 import re
@@ -36,6 +37,9 @@ from pyprobe.pyprobe import VideoFileParser
 from database import database, dict_mapper
 
 from analytics import track_event
+import six
+from six.moves import range
+from functools import reduce
 
 
 def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
@@ -90,11 +94,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
     """
     max_score = 120.0
     min_score = max_score * min_score_movie_perc / 100.0
-    scores = subliminal_scores.movie_scores.keys()
+    scores = list(subliminal_scores.movie_scores.keys())
    if media_type == "series":
         max_score = 360.0
         min_score = max_score * min_score_series_perc / 100.0
-        scores = subliminal_scores.episode_scores.keys()
+        scores = list(subliminal_scores.episode_scores.keys())
         if video.is_special:
             min_score = max_score * min_score_special_ep / 100.0
 
@@ -118,7 +122,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         hi = "force non-HI"
     language_set = set()
 
-    if not isinstance(language, types.ListType):
+    if not isinstance(language, list):
         language = [language]
 
     if forced == "True":
@@ -184,7 +188,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
     saved_any = False
     if downloaded_subtitles:
-        for video, subtitles in downloaded_subtitles.iteritems():
+        for video, subtitles in six.iteritems(downloaded_subtitles):
             if not subtitles:
                 continue
 
@@ -220,10 +224,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
                     else:
                         action = "downloaded"
                     if video.used_scene_name:
-                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
-                            round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
+                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
+                            round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
                     else:
-                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
-                            round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
+                        message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
+                            round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
 
                     if use_postprocessing is True:
@@ -420,7 +424,6 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
     if not subtitle.is_valid():
         logging.exception('BAZARR No valid Subtitles file found for this file: ' + path)
         return
-    logging.debug('BAZARR Subtitles file downloaded for this file:' + path)
     try:
         score = round(subtitle.score / max_score * 100, 2)
         fld = get_target_folder(path)
@@ -449,7 +452,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
         downloaded_path = saved_subtitle.storage_path
         logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
         is_forced_string = " forced" if subtitle.language.forced else ""
-        message = downloaded_language + is_forced_string + " Subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
-            score) + "% using manual search."
+        message = downloaded_language + is_forced_string + " Subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
+            score) + "% using manual search."
 
         if use_postprocessing is True:
@@ -595,7 +598,7 @@ def series_download_subtitles(no):
                 language_code = result[2] + ":forced" if forced else result[2]
                 provider = result[3]
                 score = result[4]
-                store_subtitles(path_replace(episode['path']))
+                store_subtitles(episode['path'], path_replace(episode['path']))
                 history_log(1, no, episode['sonarrEpisodeId'], message, path, language_code, provider, score)
                 send_notifications(no, episode['sonarrEpisodeId'], message)
             else:
@@ -651,7 +654,7 @@ def episode_download_subtitles(no):
                 language_code = result[2] + ":forced" if forced else result[2]
                 provider = result[3]
                 score = result[4]
-                store_subtitles(path_replace(episode['path']))
+                store_subtitles(episode['path'], path_replace(episode['path']))
                 history_log(1, episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message, path,
                             language_code, provider, score)
                 send_notifications(episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message)
             else:
@@ -702,7 +705,7 @@ def movies_download_subtitles(no):
                 language_code = result[2] + ":forced" if forced else result[2]
                 provider = result[3]
                 score = result[4]
-                store_subtitles_movie(path_replace_movie(movie['path']))
+                store_subtitles_movie(movie['path'], path_replace_movie(movie['path']))
                 history_log_movie(1, no, message, path, language_code, provider, score)
                 send_notifications_movie(no, message)
             else:
@@ -730,19 +733,19 @@ def wanted_download_subtitles(path, l, count_episodes):
 
     for episode in episodes_details:
         attempt = episode['failedAttempts']
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(episode['missing_subtitles']):
             if attempt is None:
                 attempt = []
                 attempt.append([language, time.time()])
             else:
-                att = zip(*attempt)[0]
+                att = list(zip(*attempt))[0]
                 if language not in att:
                     attempt.append([language, time.time()])
 
         database.execute("UPDATE table_episodes SET failedAttempts=? WHERE sonarrEpisodeId=?",
-                         (unicode(attempt), episode['sonarrEpisodeId']))
+                         (six.text_type(attempt), episode['sonarrEpisodeId']))
 
         for i in range(len(attempt)):
             if attempt[i][0] == language:
@@ -765,7 +768,7 @@ def wanted_download_subtitles(path, l, count_episodes):
                         language_code = result[2] + ":forced" if forced else result[2]
                         provider = result[3]
                         score = result[4]
-                        store_subtitles(path_replace(episode['path']))
+                        store_subtitles(episode['path'], path_replace(episode['path']))
                         history_log(1, episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message, path,
                                     language_code, provider, score)
                         send_notifications(episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message)
                     else:
@@ -783,19 +786,19 @@ def wanted_download_subtitles_movie(path, l, count_movies):
 
     for movie in movies_details:
         attempt = movie['failedAttempts']
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(movie['missing_subtitles']):
             if attempt is None:
                 attempt = []
                 attempt.append([language, time.time()])
             else:
-                att = zip(*attempt)[0]
+                att = list(zip(*attempt))[0]
                 if language not in att:
                     attempt.append([language, time.time()])
 
         database.execute("UPDATE table_movies SET failedAttempts=? WHERE radarrId=?",
-                         (unicode(attempt), movie['radarrId']))
+                         (six.text_type(attempt), movie['radarrId']))
 
         for i in range(len(attempt)):
             if attempt[i][0] == language:
@@ -818,7 +821,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
                         language_code = result[2] + ":forced" if forced else result[2]
                         provider = result[3]
                         score = result[4]
-                        store_subtitles_movie(path_replace_movie(movie['path']))
+                        store_subtitles_movie(movie['path'], path_replace_movie(movie['path']))
                         history_log_movie(1, movie['radarrId'], message, path, language_code, provider, score)
                         send_notifications_movie(movie['radarrId'], message)
                     else:
@@ -902,7 +905,7 @@ def refine_from_db(path, video):
                                 "table_episodes.video_codec, table_episodes.audio_codec, table_episodes.path "
                                 "FROM table_episodes INNER JOIN table_shows on "
                                 "table_shows.sonarrSeriesId = table_episodes.sonarrSeriesId "
-                                "WHERE table_episodes.path = ?", (unicode(path_replace_reverse(path)),), only_one=True)
+                                "WHERE table_episodes.path = ?", (path_replace_reverse(path),), only_one=True)
 
         if data:
             video.series, year, country = series_re.match(data['seriesTitle']).groups()
@@ -924,7 +927,7 @@ def refine_from_db(path, video):
     elif isinstance(video, Movie):
         data = database.execute("SELECT title, year, alternativeTitles, format, resolution, video_codec, audio_codec, "
                                 "imdbId FROM table_movies WHERE path = ?",
-                                (unicode(path_replace_reverse_movie(path)),), only_one=True)
+                                (path_replace_reverse_movie(path),), only_one=True)
 
         if data:
             video.title = re.sub(r'(\(\d\d\d\d\))', '', data['title'])
@@ -1118,7 +1121,7 @@ def upgrade_subtitles():
                     language_code = result[2] + ":forced" if forced else result[2]
                     provider = result[3]
                     score = result[4]
-                    store_subtitles(path_replace(episode['video_path']))
+                    store_subtitles(episode['video_path'], path_replace(episode['video_path']))
                     history_log(3, episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message, path,
                                 language_code, provider, score)
                     send_notifications(episode['sonarrSeriesId'], episode['sonarrEpisodeId'], message)
@@ -1167,6 +1170,6 @@ def upgrade_subtitles():
                     language_code = result[2] + ":forced" if forced else result[2]
                     provider = result[3]
                     score = result[4]
-                    store_subtitles_movie(path_replace_movie(movie['video_path']))
+                    store_subtitles_movie(movie['video_path'], path_replace_movie(movie['video_path']))
                     history_log_movie(3, movie['radarrId'], message, path, language_code, provider, score)
                     send_notifications_movie(movie['radarrId'], message)
diff --git a/bazarr/helper.py b/bazarr/helper.py
index 9519651bf..a16645127 100644
--- a/bazarr/helper.py
+++ b/bazarr/helper.py
@@ -1,73 +1,140 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import ast
 import os
 import re
 import types
 import logging
+import operator
+
+import libs
+
 import chardet
 from bs4 import UnicodeDammit
+from collections import OrderedDict
 
 from config import settings
+from utils import get_sonarr_platform, get_radarr_platform
+
+
+def sonarr_path_mapping_regex():
+    global path_mapping
+    global sonarr_regex
+    global sonarr_use_path_mapping
+
+    path_mapping = ast.literal_eval(settings.general.path_mappings)
+    path_mapping = sorted(path_mapping, key=operator.itemgetter(0), reverse=True)
+    path_mapping = OrderedDict((mapping[0], mapping[1]) for mapping in path_mapping if mapping[0] != '')
+    if any(item for sublist in path_mapping for item in sublist):
+        sonarr_use_path_mapping = True
+        sonarr_regex = re.compile("|".join(path_mapping.keys()))
+    else:
+        sonarr_use_path_mapping = False
+
+
+def sonarr_path_mapping_reverse_regex():
+    global sonarr_platform
+    global path_mapping_reverse
+    global sonarr_reverse_regex
+    global sonarr_use_path_mapping
+
+    sonarr_platform = get_sonarr_platform()
+
+    path_mapping_reverse = ast.literal_eval(settings.general.path_mappings)
+    path_mapping_reverse = sorted(path_mapping_reverse, key=operator.itemgetter(0), reverse=True)
+    path_mapping_reverse = OrderedDict((mapping[1], mapping[0]) for mapping in path_mapping_reverse if mapping[0] != '')
+    if any(item for sublist in path_mapping_reverse for item in sublist):
+        sonarr_use_path_mapping = True
+        sonarr_reverse_regex = re.compile("|".join(map(re.escape, path_mapping_reverse.keys())))
+    else:
+        sonarr_use_path_mapping = False
+
+
+def radarr_path_mapping_regex():
+    global path_mapping_movie
+    global radarr_regex
+    global radarr_use_path_mapping
+
+    path_mapping_movie = ast.literal_eval(settings.general.path_mappings_movie)
+    path_mapping_movie = sorted(path_mapping_movie, key=operator.itemgetter(0), reverse=True)
+    path_mapping_movie = OrderedDict((mapping[0], mapping[1]) for mapping in path_mapping_movie if mapping[0] != '')
+    if any(item for sublist in path_mapping_movie for item in sublist):
+        radarr_use_path_mapping = True
+        radarr_regex = re.compile("|".join(path_mapping_movie.keys()))
+    else:
+        radarr_use_path_mapping = False
+
+
+def radarr_path_mapping_reverse_regex():
+    global radarr_platform
+    global path_mapping_reverse_movie
+    global radarr_reverse_regex
+    global radarr_use_path_mapping
+
+    radarr_platform = get_sonarr_platform()
+
+    path_mapping_reverse_movie = ast.literal_eval(settings.general.path_mappings)
+    path_mapping_reverse_movie = sorted(path_mapping_reverse_movie, key=operator.itemgetter(0), reverse=True)
+    path_mapping_reverse_movie = OrderedDict((mapping[1], mapping[0]) for mapping in path_mapping_reverse_movie if mapping[0] != '')
+    if any(item for sublist in path_mapping_reverse_movie for item in sublist):
+        radarr_use_path_mapping = True
+        radarr_reverse_regex = re.compile("|".join(map(re.escape, path_mapping_reverse_movie.keys())))
+    else:
+        radarr_use_path_mapping = False
 
 
 def path_replace(path):
-    if path is None:
-        return None
+    if path is None or sonarr_use_path_mapping is False:
+        return path
+
+    reverted_path = sonarr_regex.sub(lambda match: path_mapping[match.group(0)], path, count=1)
 
-    for path_mapping in ast.literal_eval(settings.general.path_mappings):
-        if path_mapping[0] in path:
-            path = path.replace(path_mapping[0], path_mapping[1])
-            if path.startswith('\\\\') or re.match(r'^[a-zA-Z]:\\', path):
-                path = path.replace('/', '\\')
-            elif path.startswith('/'):
-                path = path.replace('\\', '/')
-            break
-    return path
+    from os.path import normpath
+
+    return normpath(reverted_path)
 
 
 def path_replace_reverse(path):
-    if path is None:
-        return None
+    if path is None or sonarr_use_path_mapping is False:
+        return path
+
+    reverted_path_temp = sonarr_reverse_regex.sub(lambda match: path_mapping_reverse[match.group(0)], path, count=1)
 
-    for path_mapping in ast.literal_eval(settings.general.path_mappings):
-        if path_mapping[1] in path:
-            path = path.replace(path_mapping[1], path_mapping[0])
-            if path.startswith('\\\\') or re.match(r'^[a-zA-Z]:\\', path):
-                path = path.replace('/', '\\')
-            elif path.startswith('/'):
-                path = path.replace('\\', '/')
-            break
-    return path
+    if sonarr_platform == 'posix':
+        from posixpath import normpath
+        reverted_path = reverted_path_temp.replace('\\', '/')
+    elif sonarr_platform == 'nt':
+        from ntpath import normpath
+        reverted_path = reverted_path_temp.replace('/', '\\')
+
+    return normpath(reverted_path)
 
 
 def path_replace_movie(path):
-    if path is None:
-        return None
+    if path is None or radarr_use_path_mapping is False:
+        return path
+
+    reverted_path = radarr_regex.sub(lambda match: path_mapping_movie[match.group(0)], path, count=1)
 
-    for path_mapping in ast.literal_eval(settings.general.path_mappings_movie):
-        if path_mapping[0] in path:
-            path = path.replace(path_mapping[0], path_mapping[1])
-            if path.startswith('\\\\') or re.match(r'^[a-zA-Z]:\\', path):
-                path = path.replace('/', '\\')
-            elif path.startswith('/'):
-                path = path.replace('\\', '/')
-            break
-    return path
+    from os.path import normpath
+
+    return normpath(reverted_path)
 
 
 def path_replace_reverse_movie(path):
-    if path is None:
-        return None
+    if path is None or radarr_use_path_mapping is False:
+        return path
+
+    reverted_path_temp = radarr_reverse_regex.sub(lambda match: path_mapping_reverse_movie[match.group(0)], path, count=1)
 
-    for path_mapping in ast.literal_eval(settings.general.path_mappings_movie):
-        if path_mapping[1] in path:
-            path = path.replace(path_mapping[1], path_mapping[0])
-            if path.startswith('\\\\') or re.match(r'^[a-zA-Z]:\\', path):
-                path = path.replace('/', '\\')
-            elif path.startswith('/'):
-                path = path.replace('\\', '/')
-            break
-    return path
+    if radarr_platform == 'posix':
+        from posixpath import normpath
+        reverted_path = reverted_path_temp.replace('\\', '/')
+    elif radarr_platform == 'nt':
+        from ntpath import normpath
+        reverted_path = reverted_path_temp.replace('/', '\\')
+
+    return normpath(reverted_path)
 
 
 def pp_replace(pp_command, episode, subtitles, language, language_code2, language_code3, forced):
@@ -126,7 +193,7 @@ def force_unicode(s):
     :param s: string
     :return: unicode string
     """
-    if not isinstance(s, types.UnicodeType):
+    if not isinstance(s, str):
         try:
             s = s.decode("utf-8")
         except UnicodeDecodeError:
@@ -136,3 +203,9 @@ def force_unicode(s):
             except UnicodeDecodeError:
                 s = UnicodeDammit(s).unicode_markup
     return s
+
+
+sonarr_path_mapping_regex()
+sonarr_path_mapping_reverse_regex()
+radarr_path_mapping_regex()
+radarr_path_mapping_reverse_regex()
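
The rewritten helper.py trades a per-call loop over every mapping for regexes compiled once at import time: the mappings are sorted longest-first so the most specific prefix wins, and the matched prefix indexes into an OrderedDict of replacements. A condensed sketch of the forward direction, with illustrative mappings (Bazarr builds its tables from settings.general.path_mappings; the keys are escaped here for safety):

import re
from collections import OrderedDict

# Longest prefixes first, so '/tv/kids/' wins over '/tv/'.
mappings = OrderedDict([("/tv/kids/", "/mnt/kids/"), ("/tv/", "/mnt/tv/")])
mapping_regex = re.compile("|".join(map(re.escape, mappings.keys())))

def path_replace(path):
    # A single regex pass; the matched prefix selects its replacement.
    return mapping_regex.sub(lambda m: mappings[m.group(0)], path, count=1)

print(path_replace("/tv/Show/S01E01.mkv"))   # -> /mnt/tv/Show/S01E01.mkv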
+ print("BAZARR The configuration directory doesn't exist and Bazarr cannot create it (permission issue?).") exit(2) if not os.path.exists(os.path.join(args.config_dir, 'config')): @@ -83,7 +84,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')): config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini')) -cfg = ConfigParser() +cfg = configparser2.ConfigParser() if not os.path.exists(os.path.normpath(os.path.join(args.config_dir, 'config', 'users.json'))): cork = Cork(os.path.normpath(os.path.join(args.config_dir, 'config')), initialize=True) diff --git a/bazarr/libs.py b/bazarr/libs.py index 37b8809ac..078fbec3e 100644 --- a/bazarr/libs.py +++ b/bazarr/libs.py @@ -2,10 +2,28 @@ import os import sys +from shutil import rmtree + + +def clean_libs(): + libs_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libs') + + # Delete the old module almost empty directory compatible only with Python 2.7.x that cause bad magic number error + # if they are present in Python 3.x. + module_list = ['enum', 'concurrent'] + for module in module_list: + module_path = os.path.join(libs_dir, module) + rmtree(module_path, ignore_errors=True) def set_libs(): sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs/')) + from six import PY3 + if PY3: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs3/')) + else: + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs2/')) +clean_libs() set_libs() diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py index c89bdd963..59ab79225 100644 --- a/bazarr/list_subtitles.py +++ b/bazarr/list_subtitles.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import gc import os import babelfish @@ -13,6 +14,7 @@ from subliminal import core from subliminal_patch import search_external_subtitles from subzero.language import Language from bs4 import UnicodeDammit +import six from get_args import args from database import database @@ -23,18 +25,19 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \ from queueconfig import notifications from embedded_subs_reader import embedded_subs_reader +import six gc.enable() -def store_subtitles(file): - logging.debug('BAZARR started subtitles indexing for this file: ' + file) +def store_subtitles(original_path, reversed_path): + logging.debug('BAZARR started subtitles indexing for this file: ' + reversed_path) actual_subtitles = [] - if os.path.exists(file): + if os.path.exists(reversed_path): if settings.general.getboolean('use_embedded_subs'): logging.debug("BAZARR is trying to index embedded subtitles.") try: - subtitle_languages = embedded_subs_reader.list_languages(file) + subtitle_languages = embedded_subs_reader.list_languages(reversed_path) for subtitle_language, subtitle_forced, subtitle_codec in subtitle_languages: try: if settings.general.getboolean("ignore_pgs_subs") and subtitle_codec == "hdmv_pgs_subtitle": @@ -52,23 +55,23 @@ def store_subtitles(file): pass except Exception as e: logging.exception( - "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(file)[1], file)) + "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(reversed_path)[1], reversed_path)) pass - + brazilian_portuguese = [".pt-br", ".pob", "pb"] brazilian_portuguese_forced = [".pt-br.forced", ".pob.forced", "pb.forced"] try: dest_folder = get_subtitle_destination_folder() subliminal_patch.core.CUSTOM_PATHS = [dest_folder] if dest_folder else [] 
diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py
index c89bdd963..59ab79225 100644
--- a/bazarr/list_subtitles.py
+++ b/bazarr/list_subtitles.py
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import gc
 import os
 import babelfish
@@ -13,6 +14,7 @@
 from subliminal import core
 from subliminal_patch import search_external_subtitles
 from subzero.language import Language
 from bs4 import UnicodeDammit
+import six
 
 from get_args import args
 from database import database
@@ -23,18 +25,19 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
 from queueconfig import notifications
 from embedded_subs_reader import embedded_subs_reader
+import six
 
 gc.enable()
 
 
-def store_subtitles(file):
-    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
+def store_subtitles(original_path, reversed_path):
+    logging.debug('BAZARR started subtitles indexing for this file: ' + reversed_path)
     actual_subtitles = []
-    if os.path.exists(file):
+    if os.path.exists(reversed_path):
         if settings.general.getboolean('use_embedded_subs'):
             logging.debug("BAZARR is trying to index embedded subtitles.")
             try:
-                subtitle_languages = embedded_subs_reader.list_languages(file)
+                subtitle_languages = embedded_subs_reader.list_languages(reversed_path)
                 for subtitle_language, subtitle_forced, subtitle_codec in subtitle_languages:
                     try:
                         if settings.general.getboolean("ignore_pgs_subs") and subtitle_codec == "hdmv_pgs_subtitle":
@@ -52,23 +55,23 @@ def store_subtitles(file):
                         pass
             except Exception as e:
                 logging.exception(
-                    "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(file)[1], file))
+                    "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(reversed_path)[1], reversed_path))
                 pass
-        
+
         brazilian_portuguese = [".pt-br", ".pob", "pb"]
         brazilian_portuguese_forced = [".pt-br.forced", ".pob.forced", "pb.forced"]
         try:
             dest_folder = get_subtitle_destination_folder()
             subliminal_patch.core.CUSTOM_PATHS = [dest_folder] if dest_folder else []
-            subtitles = search_external_subtitles(file, languages=get_language_set(),
+            subtitles = search_external_subtitles(reversed_path, languages=get_language_set(),
                                                   only_one=settings.general.getboolean('single_language'))
-            subtitles = guess_external_subtitles(get_subtitle_destination_folder() or os.path.dirname(file), subtitles)
+            subtitles = guess_external_subtitles(get_subtitle_destination_folder() or os.path.dirname(reversed_path), subtitles)
         except Exception as e:
             logging.exception("BAZARR unable to index external subtitles.")
             pass
         else:
-            for subtitle, language in subtitles.iteritems():
-                subtitle_path = get_external_subtitles_path(file, subtitle)
+            for subtitle, language in six.iteritems(subtitles):
+                subtitle_path = get_external_subtitles_path(reversed_path, subtitle)
                 if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
                     logging.debug("BAZARR external subtitles detected: " + "pb")
                     actual_subtitles.append(
@@ -86,29 +89,29 @@ def store_subtitles(file):
                 else:
                     if os.path.splitext(subtitle)[1] != ".sub":
                         logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(file), subtitle), 'r') as f:
+                        with open(os.path.join(os.path.dirname(reversed_path), subtitle), 'r') as f:
                             text = f.read()
                         try:
                             encoding = UnicodeDammit(text)
-                            text = text.decode(encoding.original_encoding)
+                            if six.PY2:
+                                text = text.decode(encoding.original_encoding)
                             detected_language = langdetect.detect(text)
                         except Exception as e:
                             logging.exception(
                                 'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(file), subtitle) +
+                                os.path.join(os.path.dirname(reversed_path), subtitle) +
                                 ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
                         else:
                             if len(detected_language) > 0:
                                 logging.debug(
                                     "BAZARR external subtitles detected and analysis guessed this language: " + str(
                                         detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse(
-                                    os.path.join(os.path.dirname(file), subtitle))])
+                                actual_subtitles.append([str(detected_language), path_replace_reverse(subtitle_path)])
 
         database.execute("UPDATE table_episodes SET subtitles=? WHERE path=?",
-                         (str(actual_subtitles), path_replace_reverse(file)))
+                         (str(actual_subtitles), original_path))
         episode = database.execute("SELECT sonarrEpisodeId FROM table_episodes WHERE path=?",
-                                   (path_replace_reverse(file),), only_one=True)
+                                   (original_path,), only_one=True)
 
         if episode:
             logging.debug("BAZARR storing those languages to DB: " + str(actual_subtitles))
@@ -118,19 +121,19 @@ def store_subtitles(file):
     else:
         logging.debug("BAZARR this file doesn't seems to exist or isn't accessible.")
 
-    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)
+    logging.debug('BAZARR ended subtitles indexing for this file: ' + reversed_path)
 
     return actual_subtitles
 
 
-def store_subtitles_movie(file):
-    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
+def store_subtitles_movie(original_path, reversed_path):
+    logging.debug('BAZARR started subtitles indexing for this file: ' + reversed_path)
     actual_subtitles = []
-    if os.path.exists(file):
+    if os.path.exists(reversed_path):
         if settings.general.getboolean('use_embedded_subs'):
             logging.debug("BAZARR is trying to index embedded subtitles.")
             try:
-                subtitle_languages = embedded_subs_reader.list_languages(file)
+                subtitle_languages = embedded_subs_reader.list_languages(reversed_path)
                 for subtitle_language, subtitle_forced, subtitle_codec in subtitle_languages:
                     try:
                         if settings.general.getboolean("ignore_pgs_subs") and subtitle_codec == "hdmv_pgs_subtitle":
@@ -148,64 +151,58 @@ def store_subtitles_movie(file):
                         pass
             except Exception as e:
                 logging.exception(
-                    "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(file)[1], file))
+                    "BAZARR error when trying to analyze this %s file: %s" % (os.path.splitext(reversed_path)[1], reversed_path))
                 pass
-        
+
         brazilian_portuguese = [".pt-br", ".pob", "pb"]
         brazilian_portuguese_forced = [".pt-br.forced", ".pob.forced", "pb.forced"]
         try:
             dest_folder = get_subtitle_destination_folder() or ''
             subliminal_patch.core.CUSTOM_PATHS = [dest_folder] if dest_folder else []
-            subtitles = search_external_subtitles(file, languages=get_language_set())
-            subtitles = guess_external_subtitles(get_subtitle_destination_folder() or os.path.dirname(file), subtitles)
+            subtitles = search_external_subtitles(reversed_path, languages=get_language_set())
+            subtitles = guess_external_subtitles(get_subtitle_destination_folder() or os.path.dirname(reversed_path), subtitles)
         except Exception as e:
             logging.exception("BAZARR unable to index external subtitles.")
             pass
         else:
-            for subtitle, language in subtitles.iteritems():
-                if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
+            for subtitle, language in six.iteritems(subtitles):
+                subtitle_path = get_external_subtitles_path(reversed_path, subtitle)
+                if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
                     logging.debug("BAZARR external subtitles detected: " + "pb")
-                    actual_subtitles.append(
-                        [str("pb"),
-                         path_replace_reverse_movie(os.path.join(os.path.dirname(file), dest_folder, subtitle))])
-                elif str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese_forced)) is True:
+                    actual_subtitles.append([str("pb"), path_replace_reverse_movie(subtitle_path)])
+                elif str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese_forced)):
                     logging.debug("BAZARR external subtitles detected: " + "pb:forced")
-                    actual_subtitles.append(
-                        [str("pb:forced"),
-                         path_replace_reverse_movie(os.path.join(os.path.dirname(file), dest_folder, subtitle))])
+                    actual_subtitles.append([str("pb:forced"), path_replace_reverse_movie(subtitle_path)])
                 elif not language:
                     continue
                 elif str(language) != 'und':
                     logging.debug("BAZARR external subtitles detected: " + str(language))
-                    actual_subtitles.append(
-                        [str(language),
-                         path_replace_reverse_movie(os.path.join(os.path.dirname(file), dest_folder, subtitle))])
+                    actual_subtitles.append([str(language), path_replace_reverse_movie(subtitle_path)])
                 else:
                     if os.path.splitext(subtitle)[1] != ".sub":
                         logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(file), dest_folder, subtitle), 'r') as f:
+                        with open(os.path.join(os.path.dirname(reversed_path), dest_folder, subtitle), 'r') as f:
                             text = f.read()
                         try:
                             encoding = UnicodeDammit(text)
-                            text = text.decode(encoding.original_encoding)
+                            if six.PY2:
+                                text = text.decode(encoding.original_encoding)
                             detected_language = langdetect.detect(text)
                         except Exception as e:
                             logging.exception(
                                 'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(file), subtitle) +
+                                os.path.join(os.path.dirname(reversed_path), subtitle) +
                                 ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
                         else:
                             if len(detected_language) > 0:
                                 logging.debug(
-                                    "BAZARR external subtitles detected and analysis guessed this language: " + str(
-                                        detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse_movie(
-                                    os.path.join(os.path.dirname(file), dest_folder, subtitle))])
+                                    "BAZARR external subtitles detected and analysis guessed this language: " +
+                                    str(detected_language))
+                                actual_subtitles.append([str(detected_language), path_replace_reverse_movie(subtitle_path)])
 
         database.execute("UPDATE table_movies SET subtitles=? WHERE path=?",
-                         (str(actual_subtitles), path_replace_reverse_movie(file)))
-        movie = database.execute("SELECT radarrId FROM table_movies WHERE path=?",
-                                 (path_replace_reverse_movie(file),), only_one=True)
+                         (str(actual_subtitles), original_path))
+        movie = database.execute("SELECT radarrId FROM table_movies WHERE path=?", (original_path,), only_one=True)
 
         if movie:
             logging.debug("BAZARR storing those languages to DB: " + str(actual_subtitles))
@@ -215,7 +212,7 @@ def store_subtitles_movie(file):
     else:
         logging.debug("BAZARR this file doesn't seems to exist or isn't accessible.")
 
-    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)
+    logging.debug('BAZARR ended subtitles indexing for this file: ' + reversed_path)
 
     return actual_subtitles
 
@@ -339,7 +336,7 @@ def series_full_scan_subtitles():
     for i, episode in enumerate(episodes, 1):
         notifications.write(msg='Updating all episodes subtitles from disk...', queue='list_subtitles_series', item=i,
                             length=count_episodes)
-        store_subtitles(path_replace(episode['path']))
+        store_subtitles(episode['path'], path_replace(episode['path']))
 
     gc.collect()
 
@@ -351,7 +348,7 @@ def movies_full_scan_subtitles():
     for i, movie in enumerate(movies, 1):
         notifications.write(msg='Updating all movies subtitles from disk...', queue='list_subtitles_movies', item=i,
                             length=count_movies)
-        store_subtitles_movie(path_replace_movie(movie['path']))
+        store_subtitles_movie(movie['path'], path_replace_movie(movie['path']))
 
     gc.collect()
 
@@ -360,14 +357,14 @@
     episodes = database.execute("SELECT path FROM table_episodes WHERE sonarrSeriesId=?", (no,))
 
     for episode in episodes:
-        store_subtitles(path_replace(episode['path']))
+        store_subtitles(episode['path'], path_replace(episode['path']))
 
 
 def movies_scan_subtitles(no):
     movies = database.execute("SELECT path FROM table_movies WHERE radarrId=?", (no,))
 
     for movie in movies:
-        store_subtitles_movie(path_replace_movie(movie['path']))
+        store_subtitles_movie(movie['path'], path_replace_movie(movie['path']))
 
 
 def get_external_subtitles_path(file, subtitle):
@@ -398,7 +395,7 @@ def get_external_subtitles_path(file, subtitle):
 
 
 def guess_external_subtitles(dest_folder, subtitles):
-    for subtitle, language in subtitles.iteritems():
+    for subtitle, language in six.iteritems(subtitles):
         if not language:
             subtitle_path = os.path.join(dest_folder, subtitle)
             if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
@@ -408,7 +405,8 @@ def guess_external_subtitles(dest_folder, subtitles):
                     text = f.read()
                 try:
                     encoding = UnicodeDammit(text)
-                    text = text.decode(encoding.original_encoding)
+                    if six.PY2:
+                        text = text.decode(encoding.original_encoding)
                     detected_language = langdetect.detect(text)
                 except Exception as e:
                     logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
diff --git a/bazarr/logger.py b/bazarr/logger.py
index c4529c895..7edcac8f4 100644
--- a/bazarr/logger.py
+++ b/bazarr/logger.py
@@ -1,10 +1,13 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import logging
 import re
 import types
 import platform
+import warnings
+import six
 
 from logging.handlers import TimedRotatingFileHandler
 from get_args import args
@@ -39,6 +42,9 @@ class NoExceptionFormatter(logging.Formatter):
 
 def configure_logging(debug=False):
+    if six.PY3:
+        warnings.simplefilter('ignore', category=ResourceWarning)
+
     if not debug:
         log_level = "INFO"
     else:
@@ -61,7 +67,7 @@ def configure_logging(debug=False):
     # File Logging
     global fh
     fh = TimedRotatingFileHandler(os.path.join(args.config_dir, 'log/bazarr.log'), when="midnight", interval=1,
-                                  backupCount=7)
+                                  backupCount=7, delay=True)
     f = OneLineExceptionFormatter('%(asctime)s|%(levelname)-8s|%(name)-32s|%(message)s|',
                                   '%d/%m/%Y %H:%M:%S')
     fh.setFormatter(f)
@@ -108,10 +114,10 @@ class MyFilter(logging.Filter):
 
 class ArgsFilteringFilter(logging.Filter):
     def filter_args(self, record, func):
-        if isinstance(record.args, (types.ListType, types.TupleType)):
+        if isinstance(record.args, (list, tuple)):
             final_args = []
             for arg in record.args:
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     final_args.append(arg)
                     continue
 
@@ -119,7 +125,7 @@ class ArgsFilteringFilter(logging.Filter):
             record.args = type(record.args)(final_args)
         elif isinstance(record.args, dict):
             for key, arg in record.args.items():
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     continue
                 record.args[key] = func(arg)
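
delay=True, added to the file handler above, defers opening bazarr.log until the first record is actually emitted, avoiding an open (and, on Windows, locked) file at import time; the stdlib supports this directly:

import logging
from logging.handlers import TimedRotatingFileHandler

fh = TimedRotatingFileHandler("bazarr.log", when="midnight", interval=1,
                              backupCount=7, delay=True)   # file opened lazily
fh.setFormatter(logging.Formatter(
    "%(asctime)s|%(levelname)-8s|%(name)-32s|%(message)s|", "%d/%m/%Y %H:%M:%S"))
logging.getLogger().addHandler(fh)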
+              six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
+    after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
+             six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
+             six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))
 
     settings.general.ip = text_type(settings_general_ip)
     settings.general.port = text_type(settings_general_port)
@@ -1359,7 +1366,7 @@ def save_settings():
     settings_proxy_password = request.forms.get('settings_proxy_password')
     settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
 
-    before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
+    before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
     if before_proxy_password[0] != settings_proxy_type:
         configured()
     if before_proxy_password[1] == settings_proxy_password:
@@ -1698,10 +1705,12 @@ def system():
 def get_logs():
     authorize()
     logs = []
-    for line in reversed(open(os.path.join(args.config_dir, 'log', 'bazarr.log')).readlines()):
-        lin = []
-        lin = line.split('|')
-        logs.append(lin)
+    with open(os.path.join(args.config_dir, 'log', 'bazarr.log')) as file:
+        for line in file.readlines():
+            lin = []
+            lin = line.split('|')
+            logs.append(lin)
+        logs.reverse()
 
     return dict(data=logs)
 
@@ -1733,7 +1742,7 @@ def remove_subtitles():
             history_log(0, sonarrSeriesId, sonarrEpisodeId, result, language=alpha2_from_alpha3(language))
         except OSError as e:
             logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-        store_subtitles(unicode(episodePath))
+        store_subtitles(path_replace_reverse(episodePath), episodePath)
 
 
 @route(base_url + 'remove_subtitles_movie', method='POST')
@@ -1751,7 +1760,7 @@ def remove_subtitles_movie():
             history_log_movie(0, radarrId, result, language=alpha2_from_alpha3(language))
         except OSError as e:
             logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-        store_subtitles_movie(unicode(moviePath))
+        store_subtitles_movie(path_replace_reverse_movie(moviePath), moviePath)
 
 
 @route(base_url + 'get_subtitle', method='POST')
@@ -1784,7 +1793,7 @@ def get_subtitle():
             score = result[4]
             history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(path, episodePath)
         redirect(ref)
     except OSError:
         pass
@@ -1841,7 +1850,7 @@ def manual_get_subtitle():
             score = result[4]
             history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(path, episodePath)
         redirect(ref)
     except OSError:
         pass
@@ -1884,7 +1893,7 @@ def perform_manual_upload_subtitle():
             score = 360
             history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(path, episodePath)
 
         redirect(ref)
     except OSError:
@@ -1920,7 +1929,7 @@ def get_subtitle_movie():
             score = result[4]
             history_log_movie(1, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(path, moviePath)
         redirect(ref)
     except OSError:
         pass
@@ -1975,7 +1984,7 @@ def manual_get_subtitle_movie():
             score = result[4]
             history_log_movie(2, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(path, moviePath)
         redirect(ref)
     except OSError:
         pass
@@ -2017,7 +2026,7 @@ def perform_manual_upload_subtitle_movie():
             score = 120
             history_log_movie(4, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(path, moviePath)
 
         redirect(ref)
     except OSError:
@@ -2091,7 +2100,7 @@ def api_history():
 @route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_url(protocol, url):
-    url = urllib.unquote(url)
+    url = six.moves.urllib.parse.unquote(url)
     try:
         result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
     except:
@@ -2103,7 +2112,7 @@
 @route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_notification(protocol, provider):
-    provider = urllib.unquote(provider)
+    provider = six.moves.urllib.parse.unquote(provider)
     apobj = apprise.Apprise()
 
     apobj.add(protocol + "://" + provider)
diff --git a/bazarr/notifier.py b/bazarr/notifier.py
index 4626fa78f..57c9e4960 100644
--- a/bazarr/notifier.py
+++ b/bazarr/notifier.py
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import apprise
 import os
 import logging
@@ -22,23 +23,20 @@ def update_notifier():
     notifiers_current = []
 
     for notifier in notifiers_current_db:
-        notifiers_current.append(notifier['name'])
+        notifiers_current.append([notifier['name']])
 
     for x in results['schemas']:
-        if x['service_name'] not in notifiers_current:
-            notifiers_new.append(x['service_name'])
+        if [x['service_name']] not in notifiers_current:
+            notifiers_new.append([x['service_name'], 0])
             logging.debug('Adding new notifier agent: ' + x['service_name'])
         else:
-            notifiers_old.append(x['service_name'])
-    notifier_current = [i for i in notifiers_current]
+            notifiers_old.append([x['service_name']])
 
-    notifiers_to_delete = list(set(notifier_current) - set(notifiers_old))
-
-    for notifier_new in notifiers_new:
-        database.execute("INSERT INTO table_settings_notifier (name, enabled) VALUES (?, ?)", (notifier_new, 0))
+    notifiers_to_delete = [item for item in notifiers_current if item not in notifiers_old]
+
+    database.execute("INSERT INTO table_settings_notifier (name, enabled) VALUES (?, ?)", notifiers_new, execute_many=True)
 
-    for notifier_to_delete in notifiers_to_delete:
-        database.execute("DELETE FROM table_settings_notifier WHERE name=?", (notifier_to_delete,))
+    database.execute("DELETE FROM table_settings_notifier WHERE name=?", notifiers_to_delete, execute_many=True)
 
 
 def get_notifier_providers():
diff --git a/bazarr/queueconfig.py b/bazarr/queueconfig.py
index fd33c1850..55ece9b9c 100644
--- a/bazarr/queueconfig.py
+++ b/bazarr/queueconfig.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from collections import deque
 import json
 
@@ -36,7 +37,7 @@ class Notify:
 
         :rtype: str
         """
-        if self.queue:
+        if self.queue and (len(self.queue) > 0):
             return self.queue.popleft()
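
The urllib changes in main.py above rely on six.moves, which maps the reorganized Python 3 urllib package back onto Python 2's urllib/urllib2/urlparse, so one import works everywhere:

from six.moves.urllib.parse import unquote

# Resolves to urllib.unquote on Python 2 and urllib.parse.unquote on Python 3.
print(unquote("localhost%3A8989/sonarr"))   # -> localhost:8989/sonarr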
get_episodes import sync_episodes, update_all_episodes
from get_movies import update_movies, update_all_movies
from get_series import update_series
diff --git a/bazarr/utils.py b/bazarr/utils.py
index 2fd70ae80..aee63a298 100644
--- a/bazarr/utils.py
+++ b/bazarr/utils.py
@@ -1,5 +1,6 @@
# coding=utf-8
+from __future__ import absolute_import
import os
import time
import platform
@@ -10,7 +11,6 @@ import requests
from whichcraft import which
from get_args import args
from config import settings, url_sonarr, url_radarr
-from database import database
from subliminal import region as subliminal_cache_region
import datetime
@@ -19,6 +19,7 @@ import glob
def history_log(action, sonarrSeriesId, sonarrEpisodeId, description, video_path=None, language=None, provider=None,
                score=None, forced=False):
+    from database import database
    database.execute("INSERT INTO table_history (action, sonarrSeriesId, sonarrEpisodeId, timestamp, description,"
                     "video_path, language, provider, score) VALUES (?,?,?,?,?,?,?,?,?)",
                     (action, sonarrSeriesId, sonarrEpisodeId, time.time(),
@@ -28,6 +29,7 @@ def history_log(action, sonarrSeriesId, sonarrEpisodeId, description, video_path
def history_log_movie(action, radarrId, description, video_path=None, language=None, provider=None, score=None,
                      forced=False):
+    from database import database
    database.execute("INSERT INTO table_history_movie (action, radarrId, timestamp, description, video_path, language, "
                     "provider, score) VALUES (?,?,?,?,?,?,?,?)",
                     (action, radarrId, time.time(), description, video_path, language, provider, score))
@@ -93,6 +95,23 @@ def get_sonarr_version():
    return sonarr_version
+def get_sonarr_platform():
+    use_sonarr = settings.general.getboolean('use_sonarr')
+    apikey_sonarr = settings.sonarr.apikey
+    sv = url_sonarr() + "/api/system/status?apikey=" + apikey_sonarr
+    sonarr_platform = ''
+    if use_sonarr:
+        try:
+            if requests.get(sv, timeout=60, verify=False).json()['isLinux'] or requests.get(sv, timeout=60, verify=False).json()['isOsx']:
+                sonarr_platform = 'posix'
+            elif requests.get(sv, timeout=60, verify=False).json()['isWindows']:
+                sonarr_platform = 'nt'
+        except Exception as e:
+            logging.debug('BAZARR cannot get Sonarr platform')
+
+    return sonarr_platform
+
+
def get_radarr_version():
    use_radarr = settings.general.getboolean('use_radarr')
    apikey_radarr = settings.radarr.apikey
@@ -105,3 +124,20 @@ def get_radarr_version():
    logging.debug('BAZARR cannot get Radarr version')
    return radarr_version
+
+
+def get_radarr_platform():
+    use_radarr = settings.general.getboolean('use_radarr')
+    apikey_radarr = settings.radarr.apikey
+    rv = url_radarr() + "/api/system/status?apikey=" + apikey_radarr
+    radarr_platform = ''
+    if use_radarr:
+        try:
+            if requests.get(rv, timeout=60, verify=False).json()['isLinux'] or requests.get(rv, timeout=60, verify=False).json()['isOsx']:
+                radarr_platform = 'posix'
+            elif requests.get(rv, timeout=60, verify=False).json()['isWindows']:
+                radarr_platform = 'nt'
+        except Exception as e:
+            logging.debug('BAZARR cannot get Radarr platform')
+
+    return radarr_platform
diff --git a/libs/ConfigParser2.py b/libs/ConfigParser2.py
deleted file mode 100644
index 4ec642a5d..000000000
--- a/libs/ConfigParser2.py
+++ /dev/null
@@ -1,797 +0,0 @@
-"""Configuration file parser.
-
-A setup file consists of sections, lead by a "[section]" header,
-and followed by "name: value" entries, with continuations and such in
-the style of RFC 822.
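
# A condensed, hedged sketch (not part of the commit) of the platform probes
# added to bazarr/utils.py above. Assumptions: the same /api/system/status
# payload with isLinux/isOsx/isWindows booleans; the status JSON is fetched
# once instead of up to three times.
import logging
import requests

def probe_platform(base_url, apikey):
    """Return 'posix', 'nt' or '' for a Sonarr/Radarr instance."""
    url = base_url + "/api/system/status?apikey=" + apikey
    try:
        status = requests.get(url, timeout=60, verify=False).json()
    except Exception:
        logging.debug('BAZARR cannot get platform from ' + base_url)
        return ''
    if status.get('isLinux') or status.get('isOsx'):
        return 'posix'
    if status.get('isWindows'):
        return 'nt'
    return ''
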
- -The option values can contain format strings which refer to other values in -the same section, or values in a special [DEFAULT] section. - -For example: - - something: %(dir)s/whatever - -would resolve the "%(dir)s" to the value of dir. All reference -expansions are done late, on demand. - -Intrinsic defaults can be specified by passing them into the -ConfigParser constructor as a dictionary. - -class: - -ConfigParser -- responsible for parsing a list of - configuration files, and managing the parsed database. - - methods: - - __init__(defaults=None) - create the parser and specify a dictionary of intrinsic defaults. The - keys must be strings, the values must be appropriate for %()s string - interpolation. Note that `__name__' is always an intrinsic default; - its value is the section's name. - - sections() - return all the configuration section names, sans DEFAULT - - has_section(section) - return whether the given section exists - - has_option(section, option) - return whether the given option exists in the given section - - options(section) - return list of configuration options for the named section - - read(filenames) - read and parse the list of named configuration files, given by - name. A single filename is also allowed. Non-existing files - are ignored. Return list of successfully read files. - - readfp(fp, filename=None) - read and parse one configuration file, given as a file object. - The filename defaults to fp.name; it is only used in error - messages (if fp has no `name' attribute, the string `<???>' is used). - - get(section, option, raw=False, vars=None) - return a string value for the named option. All % interpolations are - expanded in the return values, based on the defaults passed into the - constructor and the DEFAULT section. Additional substitutions may be - provided using the `vars' argument, which must be a dictionary whose - contents override any pre-existing defaults. - - getint(section, options) - like get(), but convert value to an integer - - getfloat(section, options) - like get(), but convert value to a float - - getboolean(section, options) - like get(), but convert value to a boolean (currently case - insensitively defined as 0, false, no, off for False, and 1, true, - yes, on for True). Returns False or True. - - items(section, raw=False, vars=None) - return a list of tuples with (name, value) for each option - in the section. 
- - remove_section(section) - remove the given file section and all its options - - remove_option(section, option) - remove the given option from the given section - - set(section, option, value) - set the given option - - write(fp) - write the configuration state in .ini format -""" - -try: - from collections import OrderedDict as _default_dict -except ImportError: - # fallback for setup.py which hasn't yet built _collections - _default_dict = dict - -import re - -__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError", - "InterpolationError", "InterpolationDepthError", - "InterpolationSyntaxError", "ParsingError", - "MissingSectionHeaderError", - "ConfigParser", "SafeConfigParser", "RawConfigParser", - "DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"] - -DEFAULTSECT = "DEFAULT" - -MAX_INTERPOLATION_DEPTH = 10 - - - -# exception classes -class Error(Exception): - """Base class for ConfigParser exceptions.""" - - def _get_message(self): - """Getter for 'message'; needed only to override deprecation in - BaseException.""" - return self.__message - - def _set_message(self, value): - """Setter for 'message'; needed only to override deprecation in - BaseException.""" - self.__message = value - - # BaseException.message has been deprecated since Python 2.6. To prevent - # DeprecationWarning from popping up over this pre-existing attribute, use - # a new property that takes lookup precedence. - message = property(_get_message, _set_message) - - def __init__(self, msg=''): - self.message = msg - Exception.__init__(self, msg) - - def __repr__(self): - return self.message - - __str__ = __repr__ - -class NoSectionError(Error): - """Raised when no section matches a requested option.""" - - def __init__(self, section): - Error.__init__(self, 'No section: %r' % (section,)) - self.section = section - self.args = (section, ) - -class DuplicateSectionError(Error): - """Raised when a section is multiply-created.""" - - def __init__(self, section): - Error.__init__(self, "Section %r already exists" % section) - self.section = section - self.args = (section, ) - -class NoOptionError(Error): - """A requested option was not found.""" - - def __init__(self, option, section): - Error.__init__(self, "No option %r in section: %r" % - (option, section)) - self.option = option - self.section = section - self.args = (option, section) - -class InterpolationError(Error): - """Base class for interpolation-related exceptions.""" - - def __init__(self, option, section, msg): - Error.__init__(self, msg) - self.option = option - self.section = section - self.args = (option, section, msg) - -class InterpolationMissingOptionError(InterpolationError): - """A string substitution required a setting which was not available.""" - - def __init__(self, option, section, rawval, reference): - msg = ("Bad value substitution:\n" - "\tsection: [%s]\n" - "\toption : %s\n" - "\tkey : %s\n" - "\trawval : %s\n" - % (section, option, reference, rawval)) - InterpolationError.__init__(self, option, section, msg) - self.reference = reference - self.args = (option, section, rawval, reference) - -class InterpolationSyntaxError(InterpolationError): - """Raised when the source text into which substitutions are made - does not conform to the required syntax.""" - -class InterpolationDepthError(InterpolationError): - """Raised when substitutions are nested too deeply.""" - - def __init__(self, option, section, rawval): - msg = ("Value interpolation too deeply recursive:\n" - "\tsection: [%s]\n" - "\toption : %s\n" - "\trawval : %s\n" - % 
(section, option, rawval)) - InterpolationError.__init__(self, option, section, msg) - self.args = (option, section, rawval) - -class ParsingError(Error): - """Raised when a configuration file does not follow legal syntax.""" - - def __init__(self, filename): - Error.__init__(self, 'File contains parsing errors: %s' % filename) - self.filename = filename - self.errors = [] - self.args = (filename, ) - - def append(self, lineno, line): - self.errors.append((lineno, line)) - self.message += '\n\t[line %2d]: %s' % (lineno, line) - -class MissingSectionHeaderError(ParsingError): - """Raised when a key-value pair is found before any section header.""" - - def __init__(self, filename, lineno, line): - Error.__init__( - self, - 'File contains no section headers.\nfile: %s, line: %d\n%r' % - (filename, lineno, line)) - self.filename = filename - self.lineno = lineno - self.line = line - self.args = (filename, lineno, line) - - -class RawConfigParser: - def __init__(self, defaults=None, dict_type=_default_dict, - allow_no_value=False): - self._dict = dict_type - self._sections = self._dict() - self._defaults = self._dict() - if allow_no_value: - self._optcre = self.OPTCRE_NV - else: - self._optcre = self.OPTCRE - if defaults: - for key, value in defaults.items(): - self._defaults[self.optionxform(key)] = value - self.comment_store = None ## used for storing comments in ini - - - def defaults(self): - return self._defaults - - def sections(self): - """Return a list of section names, excluding [DEFAULT]""" - # self._sections will never have [DEFAULT] in it - return self._sections.keys() - - def add_section(self, section): - """Create a new section in the configuration. - - Raise DuplicateSectionError if a section by the specified name - already exists. Raise ValueError if name is DEFAULT or any of it's - case-insensitive variants. - """ - if section.lower() == "default": - raise ValueError, 'Invalid section name: %s' % section - - if section in self._sections: - raise DuplicateSectionError(section) - self._sections[section] = self._dict() - - def has_section(self, section): - """Indicate whether the named section is present in the configuration. - - The DEFAULT section is not acknowledged. - """ - return section in self._sections - - def options(self, section): - """Return a list of option names for the given section name.""" - try: - opts = self._sections[section].copy() - except KeyError: - raise NoSectionError(section) - opts.update(self._defaults) - if '__name__' in opts: - del opts['__name__'] - return opts.keys() - - def read(self, filenames): - """Read and parse a filename or a list of filenames. - - Files that cannot be opened are silently ignored; this is - designed so that you can specify a list of potential - configuration file locations (e.g. current directory, user's - home directory, systemwide directory), and all existing - configuration files in the list will be read. A single - filename may also be given. - - Return list of successfully read files. - """ - if isinstance(filenames, basestring): - filenames = [filenames] - read_ok = [] - for filename in filenames: - try: - fp = open(filename) - except IOError: - continue - self._read(fp, filename) - fp.close() - read_ok.append(filename) - return read_ok - - def readfp(self, fp, filename=None): - """Like read() but the argument must be a file-like object. - - The `fp' argument must have a `readline' method. Optional - second argument is the `filename', which if not given, is - taken from fp.name. 
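
# A hedged sketch of the Python 2/3 compatibility idioms this commit applies
# throughout: six.string_types replaces the Python 2-only basestring used in
# checks like the one in read() above, six.text_type replaces unicode(), and
# six.moves.urllib papers over the urllib package split. Illustrative only.
from __future__ import print_function
import six
from six.moves.urllib.parse import unquote

def normalize_filenames(filenames):
    # isinstance(x, basestring) on Python 2 becomes:
    if isinstance(filenames, six.string_types):
        filenames = [filenames]
    return filenames

print(normalize_filenames("config.ini"))  # ['config.ini']
print(six.text_type(42))                  # text on both Python 2 and 3
print(unquote("a%20b"))                   # 'a b'
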
If fp has no `name' attribute, `<???>' is - used. - - """ - if filename is None: - try: - filename = fp.name - except AttributeError: - filename = '<???>' - self._read(fp, filename) - - def get(self, section, option): - opt = self.optionxform(option) - if section not in self._sections: - if section != DEFAULTSECT: - raise NoSectionError(section) - if opt in self._defaults: - return self._defaults[opt] - else: - raise NoOptionError(option, section) - elif opt in self._sections[section]: - return self._sections[section][opt] - elif opt in self._defaults: - return self._defaults[opt] - else: - raise NoOptionError(option, section) - - def items(self, section): - try: - d2 = self._sections[section] - except KeyError: - if section != DEFAULTSECT: - raise NoSectionError(section) - d2 = self._dict() - d = self._defaults.copy() - d.update(d2) - if "__name__" in d: - del d["__name__"] - return d.items() - - def _get(self, section, conv, option): - return conv(self.get(section, option)) - - def getint(self, section, option): - return self._get(section, int, option) - - def getfloat(self, section, option): - return self._get(section, float, option) - - _boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True, - '0': False, 'no': False, 'false': False, 'off': False} - - def getboolean(self, section, option): - v = self.get(section, option) - if v.lower() not in self._boolean_states: - raise ValueError, 'Not a boolean: %s' % v - return self._boolean_states[v.lower()] - - def optionxform(self, optionstr): - return optionstr.lower() - - def has_option(self, section, option): - """Check for the existence of a given option in a given section.""" - if not section or section == DEFAULTSECT: - option = self.optionxform(option) - return option in self._defaults - elif section not in self._sections: - return False - else: - option = self.optionxform(option) - return (option in self._sections[section] - or option in self._defaults) - - def set(self, section, option, value=None): - """Set an option.""" - if not section or section == DEFAULTSECT: - sectdict = self._defaults - else: - try: - sectdict = self._sections[section] - except KeyError: - raise NoSectionError(section) - sectdict[self.optionxform(option)] = value - - def write(self, fp): - """Write an .ini-format representation of the configuration state.""" - if self._defaults: - fp.write("[%s]\n" % DEFAULTSECT) - for (key, value) in self._defaults.items(): - fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t'))) - fp.write("\n") - for section in self._sections: - fp.write("[%s]\n" % section) - for (key, value) in self._sections[section].items(): - if key == "__name__": - continue - if (value is not None) or (self._optcre == self.OPTCRE): - key = " = ".join((key, str(value).replace('\n', '\n\t'))) - fp.write("%s\n" % (key)) - fp.write("\n") - - def remove_option(self, section, option): - """Remove an option.""" - if not section or section == DEFAULTSECT: - sectdict = self._defaults - else: - try: - sectdict = self._sections[section] - except KeyError: - raise NoSectionError(section) - option = self.optionxform(option) - existed = option in sectdict - if existed: - del sectdict[option] - return existed - - def remove_section(self, section): - """Remove a file section.""" - existed = section in self._sections - if existed: - del self._sections[section] - return existed - - # - # Regular expressions for parsing section headers and options. - # - SECTCRE = re.compile( - r'\[' # [ - r'(?P<header>[^]]+)' # very permissive! 
- r'\]' # ] - ) - OPTCRE = re.compile( - r'(?P<option>[^:=\s][^:=]*)' # very permissive! - r'\s*(?P<vi>[:=])\s*' # any number of space/tab, - # followed by separator - # (either : or =), followed - # by any # space/tab - r'(?P<value>.*)$' # everything up to eol - ) - OPTCRE_NV = re.compile( - r'(?P<option>[^:=\s][^:=]*)' # very permissive! - r'\s*(?:' # any number of space/tab, - r'(?P<vi>[:=])\s*' # optionally followed by - # separator (either : or - # =), followed by any # - # space/tab - r'(?P<value>.*))?$' # everything up to eol - ) - - def _read(self, fp, fpname): - """Parse a sectioned setup file. - - The sections in setup file contains a title line at the top, - indicated by a name in square brackets (`[]'), plus key/value - options lines, indicated by `name: value' format lines. - Continuations are represented by an embedded newline then - leading whitespace. Blank lines, lines beginning with a '#', - and just about everything else are ignored. - """ - - comment_store = {} - cursect = None # None, or a dictionary - optname = None - lineno = 0 - e = None # None, or an exception - while True: - line = fp.readline() - if not line: - break - lineno = lineno + 1 - # comment or blank line? - if line.strip() == '' : - continue - ### store comments for doc purposes - ### Deal with cases of sections and options being there or not - if line[0] in '#;' and cursect is not None: - if optname is None: - comment_store.setdefault(cursect['__name__'] + - "::" + "global",[]).append(line) - else: - comment_store.setdefault(cursect['__name__'] + - "::" + optname,[]).append(line) - continue - elif line[0] in '#;' and cursect is None: - comment_store.setdefault("global" + - "::" + optname,[]).append(line) - continue - - if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR": - # no leading whitespace - continue - # continuation line? - if line[0].isspace() and cursect is not None and optname: - value = line.strip() - if value: - cursect[optname].append(value) - # a section header or option header? - else: - # is it a section header? - mo = self.SECTCRE.match(line) - if mo: - sectname = mo.group('header') - if sectname in self._sections: - cursect = self._sections[sectname] - elif sectname == DEFAULTSECT: - cursect = self._defaults - else: - cursect = self._dict() - cursect['__name__'] = sectname - self._sections[sectname] = cursect - # So sections can't start with a continuation line - optname = None - # no section header in the file? - elif cursect is None: - raise MissingSectionHeaderError(fpname, lineno, line) - # an option line? - else: - mo = self._optcre.match(line) - if mo: - optname, vi, optval = mo.group('option', 'vi', 'value') - optname = self.optionxform(optname.rstrip()) - # This check is fine because the OPTCRE cannot - # match if it would set optval to None - if optval is not None: - if vi in ('=', ':') and ';' in optval: - # ';' is a comment delimiter only if it follows - # a spacing character - pos = optval.find(';') - if pos != -1 and optval[pos-1].isspace(): - optval = optval[:pos] - optval = optval.strip() - # allow empty values - if optval == '""': - optval = '' - cursect[optname] = [optval] - else: - # valueless option handling - cursect[optname] = optval - else: - # a non-fatal parsing error occurred. set up the - # exception but keep going. 
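
# A small illustration (an assumption, not from the commit) of how the OPTCRE
# pattern above splits an option line into option / delimiter / value groups.
import re

OPTCRE = re.compile(
    r'(?P<option>[^:=\s][^:=]*)'   # option name, no leading whitespace
    r'\s*(?P<vi>[:=])\s*'          # ':' or '=' delimiter
    r'(?P<value>.*)$'              # everything up to end of line
)

m = OPTCRE.match("timeout = 60  ; seconds")
print(m.group('option').rstrip())  # 'timeout' (parser rstrips the name)
print(m.group('vi'))               # '='
print(m.group('value'))            # '60  ; seconds' (inline comment handled later)
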
the exception will be - # raised at the end of the file and will contain a - # list of all bogus lines - if not e: - e = ParsingError(fpname) - e.append(lineno, repr(line)) - # if any parsing errors occurred, raise an exception - if e: - raise e - - # join the multi-line values collected while reading - all_sections = [self._defaults] - all_sections.extend(self._sections.values()) - for options in all_sections: - for name, val in options.items(): - if isinstance(val, list): - options[name] = '\n'.join(val) - self.comment_store = comment_store - - def ini_as_rst(self): - """trivial helper function to putput comment_stroe as rest - - .. todo:: write actual doctests with string input - >> p = ConfigParser2.SafeConfigParser() - >> p.read(f) - ['/usr/home/pbrian/src/public/configparser2/example.ini'] - >> open("/tmp/foo.rst", "w").write(p.ini_as_rst()) - - """ - outstr = ".. rst version of ini file\n\n" - _cursectname = None - for item in sorted(self.comment_store.keys()): - _sect, _opt = item.split("::") - if _sect != _cursectname: - outstr += "\n%s\n%s\n" % (_sect, "-"* len(_sect)) - _cursectname = _sect - txt = " ".join(self.comment_store[item]) - txt = txt.replace("#", "").replace(";","") - outstr += ":%s: %s" % (_opt, txt) - return outstr - - - -import UserDict as _UserDict - -class _Chainmap(_UserDict.DictMixin): - """Combine multiple mappings for successive lookups. - - For example, to emulate Python's normal lookup sequence: - - import __builtin__ - pylookup = _Chainmap(locals(), globals(), vars(__builtin__)) - """ - - def __init__(self, *maps): - self._maps = maps - - def __getitem__(self, key): - for mapping in self._maps: - try: - return mapping[key] - except KeyError: - pass - raise KeyError(key) - - def keys(self): - result = [] - seen = set() - for mapping in self._maps: - for key in mapping: - if key not in seen: - result.append(key) - seen.add(key) - return result - -class ConfigParser(RawConfigParser): - - def get(self, section, option, raw=False, vars=None): - """Get an option value for a given section. - - If `vars' is provided, it must be a dictionary. The option is looked up - in `vars' (if provided), `section', and in `defaults' in that order. - - All % interpolations are expanded in the return values, unless the - optional argument `raw' is true. Values for interpolation keys are - looked up in the same manner as the option. - - The section DEFAULT is special. - """ - sectiondict = {} - try: - sectiondict = self._sections[section] - except KeyError: - if section != DEFAULTSECT: - raise NoSectionError(section) - # Update with the entry specific variables - vardict = {} - if vars: - for key, value in vars.items(): - vardict[self.optionxform(key)] = value - d = _Chainmap(vardict, sectiondict, self._defaults) - option = self.optionxform(option) - try: - value = d[option] - except KeyError: - raise NoOptionError(option, section) - - if raw or value is None: - return value - else: - return self._interpolate(section, option, value, d) - - def items(self, section, raw=False, vars=None): - """Return a list of tuples with (name, value) for each option - in the section. - - All % interpolations are expanded in the return values, based on the - defaults passed into the constructor, unless the optional argument - `raw' is true. Additional substitutions may be provided using the - `vars' argument, which must be a dictionary whose contents overrides - any pre-existing defaults. - - The section DEFAULT is special. 
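
# What the %(...)s interpolation implemented above does in practice -- a
# hedged sketch using the Python 3 configparser whose behavior this code
# mirrors.
import configparser

cp = configparser.ConfigParser()
cp.read_string(u"[paths]\ndir = /opt/bazarr\nlog = %(dir)s/log\n")
print(cp.get('paths', 'log'))            # '/opt/bazarr/log'
print(cp.get('paths', 'log', raw=True))  # '%(dir)s/log'
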
- """ - d = self._defaults.copy() - try: - d.update(self._sections[section]) - except KeyError: - if section != DEFAULTSECT: - raise NoSectionError(section) - # Update with the entry specific variables - if vars: - for key, value in vars.items(): - d[self.optionxform(key)] = value - options = d.keys() - if "__name__" in options: - options.remove("__name__") - if raw: - return [(option, d[option]) - for option in options] - else: - return [(option, self._interpolate(section, option, d[option], d)) - for option in options] - - def _interpolate(self, section, option, rawval, vars): - # do the string interpolation - value = rawval - depth = MAX_INTERPOLATION_DEPTH - while depth: # Loop through this until it's done - depth -= 1 - if value and "%(" in value: - value = self._KEYCRE.sub(self._interpolation_replace, value) - try: - value = value % vars - except KeyError, e: - raise InterpolationMissingOptionError( - option, section, rawval, e.args[0]) - else: - break - if value and "%(" in value: - raise InterpolationDepthError(option, section, rawval) - return value - - _KEYCRE = re.compile(r"%\(([^)]*)\)s|.") - - def _interpolation_replace(self, match): - s = match.group(1) - if s is None: - return match.group() - else: - return "%%(%s)s" % self.optionxform(s) - - -class SafeConfigParser(ConfigParser): - - def _interpolate(self, section, option, rawval, vars): - # do the string interpolation - L = [] - self._interpolate_some(option, L, rawval, section, vars, 1) - return ''.join(L) - - _interpvar_re = re.compile(r"%\(([^)]+)\)s") - - def _interpolate_some(self, option, accum, rest, section, map, depth): - if depth > MAX_INTERPOLATION_DEPTH: - raise InterpolationDepthError(option, section, rest) - while rest: - p = rest.find("%") - if p < 0: - accum.append(rest) - return - if p > 0: - accum.append(rest[:p]) - rest = rest[p:] - # p is no longer used - c = rest[1:2] - if c == "%": - accum.append("%") - rest = rest[2:] - elif c == "(": - m = self._interpvar_re.match(rest) - if m is None: - raise InterpolationSyntaxError(option, section, - "bad interpolation variable reference %r" % rest) - var = self.optionxform(m.group(1)) - rest = rest[m.end():] - try: - v = map[var] - except KeyError: - raise InterpolationMissingOptionError( - option, section, rest, var) - if "%" in v: - self._interpolate_some(option, accum, v, - section, map, depth + 1) - else: - accum.append(v) - else: - raise InterpolationSyntaxError( - option, section, - "'%%' must be followed by '%%' or '(', found: %r" % (rest,)) - - def set(self, section, option, value=None): - """Set an option. 
Extend ConfigParser.set: check for string values.""" - # The only legal non-string value if we allow valueless - # options is None, so we need to check if the value is a - # string if: - # - we do not allow valueless options, or - # - we allow valueless options but the value is not None - if self._optcre is self.OPTCRE or value: - if not isinstance(value, basestring): - raise TypeError("option values must be strings") - if value is not None: - # check for bad percent signs: - # first, replace all "good" interpolations - tmp_value = value.replace('%%', '') - tmp_value = self._interpvar_re.sub('', tmp_value) - # then, check if there's a lone percent sign left - if '%' in tmp_value: - raise ValueError("invalid interpolation syntax in %r at " - "position %d" % (value, tmp_value.find('%'))) - ConfigParser.set(self, section, option, value) diff --git a/libs/appdirs.py b/libs/appdirs.py index f4dba0953..805fb2abd 100644 --- a/libs/appdirs.py +++ b/libs/appdirs.py @@ -13,7 +13,7 @@ See <http://github.com/ActiveState/appdirs> for details and usage. # - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html # - XDG spec for Un*x: http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html -__version_info__ = (1, 4, 0) +__version_info__ = (1, 4, 3) __version__ = '.'.join(map(str, __version_info__)) @@ -117,7 +117,7 @@ def site_data_dir(appname=None, appauthor=None, version=None, multipath=False): returned, or '/usr/local/share/<AppName>', if XDG_DATA_DIRS is not set - Typical user data directories are: + Typical site data directories are: Mac OS X: /Library/Application Support/<AppName> Unix: /usr/local/share/<AppName> or /usr/share/<AppName> Win XP: C:\Documents and Settings\All Users\Application Data\<AppAuthor>\<AppName> @@ -184,13 +184,13 @@ def user_config_dir(appname=None, appauthor=None, version=None, roaming=False): <http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx> for a discussion of issues. - Typical user data directories are: + Typical user config directories are: Mac OS X: same as user_data_dir Unix: ~/.config/<AppName> # or in $XDG_CONFIG_HOME, if defined Win *: same as user_data_dir For Unix, we follow the XDG spec and support $XDG_CONFIG_HOME. - That means, by deafult "~/.config/<AppName>". + That means, by default "~/.config/<AppName>". """ if system in ["win32", "darwin"]: path = user_data_dir(appname, appauthor, None, roaming) @@ -222,7 +222,7 @@ def site_config_dir(appname=None, appauthor=None, version=None, multipath=False) returned. By default, the first item from XDG_CONFIG_DIRS is returned, or '/etc/xdg/<AppName>', if XDG_CONFIG_DIRS is not set - Typical user data directories are: + Typical site config directories are: Mac OS X: same as site_data_dir Unix: /etc/xdg/<AppName> or $XDG_CONFIG_DIRS[i]/<AppName> for each value in $XDG_CONFIG_DIRS @@ -311,6 +311,48 @@ def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True): return path +def user_state_dir(appname=None, appauthor=None, version=None, roaming=False): + r"""Return full path to the user-specific state dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. 
You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be "<major>.<minor>". + Only applied when appname is present. + "roaming" (boolean, default False) can be set True to use the Windows + roaming appdata directory. That means that for users on a Windows + network setup for roaming profiles, this user data will be + sync'd on login. See + <http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx> + for a discussion of issues. + + Typical user state directories are: + Mac OS X: same as user_data_dir + Unix: ~/.local/state/<AppName> # or in $XDG_STATE_HOME, if defined + Win *: same as user_data_dir + + For Unix, we follow this Debian proposal <https://wiki.debian.org/XDGBaseDirectorySpecification#state> + to extend the XDG spec and support $XDG_STATE_HOME. + + That means, by default "~/.local/state/<AppName>". + """ + if system in ["win32", "darwin"]: + path = user_data_dir(appname, appauthor, None, roaming) + else: + path = os.getenv('XDG_STATE_HOME', os.path.expanduser("~/.local/state")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + def user_log_dir(appname=None, appauthor=None, version=None, opinion=True): r"""Return full path to the user-specific log dir for this application. @@ -329,7 +371,7 @@ def user_log_dir(appname=None, appauthor=None, version=None, opinion=True): "Logs" to the base app data dir for Windows, and "log" to the base cache dir for Unix. See discussion below. - Typical user cache directories are: + Typical user log directories are: Mac OS X: ~/Library/Logs/<AppName> Unix: ~/.cache/<AppName>/log # or under $XDG_CACHE_HOME if defined Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Logs @@ -364,8 +406,8 @@ def user_log_dir(appname=None, appauthor=None, version=None, opinion=True): class AppDirs(object): """Convenience wrapper for getting application dirs.""" - def __init__(self, appname, appauthor=None, version=None, roaming=False, - multipath=False): + def __init__(self, appname=None, appauthor=None, version=None, + roaming=False, multipath=False): self.appname = appname self.appauthor = appauthor self.version = version @@ -398,6 +440,11 @@ class AppDirs(object): version=self.version) @property + def user_state_dir(self): + return user_state_dir(self.appname, self.appauthor, + version=self.version) + + @property def user_log_dir(self): return user_log_dir(self.appname, self.appauthor, version=self.version) @@ -410,7 +457,10 @@ def _get_win_folder_from_registry(csidl_name): registry for this guarantees us the correct answer for all CSIDL_* names. 
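
# A hedged usage sketch for the appdirs 1.4.3 API updated above, including the
# newly added user_state_dir (paths shown are typical Linux results and will
# differ per platform).
from appdirs import AppDirs

dirs = AppDirs("MyApp", "MyCompany", version="1.0")
print(dirs.user_config_dir)  # e.g. ~/.config/MyApp/1.0
print(dirs.user_cache_dir)   # e.g. ~/.cache/MyApp/1.0
print(dirs.user_state_dir)   # e.g. ~/.local/state/MyApp/1.0
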
""" - import _winreg + if PY3: + import winreg as _winreg + else: + import _winreg shell_folder_name = { "CSIDL_APPDATA": "AppData", @@ -500,7 +550,7 @@ def _get_win_folder_with_jna(csidl_name): if has_high_char: buf = array.zeros('c', buf_size) kernel = win32.Kernel32.INSTANCE - if kernal.GetShortPathName(dir, buf, buf_size): + if kernel.GetShortPathName(dir, buf, buf_size): dir = jna.Native.toString(buf.tostring()).rstrip("\0") return dir @@ -527,9 +577,15 @@ if __name__ == "__main__": appname = "MyApp" appauthor = "MyCompany" - props = ("user_data_dir", "site_data_dir", - "user_config_dir", "site_config_dir", - "user_cache_dir", "user_log_dir") + props = ("user_data_dir", + "user_config_dir", + "user_cache_dir", + "user_state_dir", + "user_log_dir", + "site_data_dir", + "site_config_dir") + + print("-- app dirs %s --" % __version__) print("-- app dirs (with optional 'version')") dirs = AppDirs(appname, appauthor, version="1.0") diff --git a/libs/backports/configparser2/__init__.py b/libs/backports/configparser2/__init__.py new file mode 100644 index 000000000..972413562 --- /dev/null +++ b/libs/backports/configparser2/__init__.py @@ -0,0 +1,1340 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Configuration file parser. + +A configuration file consists of sections, lead by a "[section]" header, +and followed by "name: value" entries, with continuations and such in +the style of RFC 822. + +Intrinsic defaults can be specified by passing them into the +ConfigParser constructor as a dictionary. + +class: + +ConfigParser -- responsible for parsing a list of + configuration files, and managing the parsed database. + + methods: + + __init__(defaults=None, dict_type=_default_dict, allow_no_value=False, + delimiters=('=', ':'), comment_prefixes=('#', ';'), + inline_comment_prefixes=None, strict=True, + empty_lines_in_values=True): + Create the parser. When `defaults' is given, it is initialized into the + dictionary or intrinsic defaults. The keys must be strings, the values + must be appropriate for %()s string interpolation. + + When `dict_type' is given, it will be used to create the dictionary + objects for the list of sections, for the options within a section, and + for the default values. + + When `delimiters' is given, it will be used as the set of substrings + that divide keys from values. + + When `comment_prefixes' is given, it will be used as the set of + substrings that prefix comments in empty lines. Comments can be + indented. + + When `inline_comment_prefixes' is given, it will be used as the set of + substrings that prefix comments in non-empty lines. + + When `strict` is True, the parser won't allow for any section or option + duplicates while reading from a single source (file, string or + dictionary). Default is True. + + When `empty_lines_in_values' is False (default: True), each empty line + marks the end of an option. Otherwise, internal empty lines of + a multiline option are kept as part of the value. + + When `allow_no_value' is True (default: False), options without + values are accepted; the value presented for these is None. + + sections() + Return all the configuration section names, sans DEFAULT. + + has_section(section) + Return whether the given section exists. + + has_option(section, option) + Return whether the given option exists in the given section. + + options(section) + Return list of configuration options for the named section. + + read(filenames, encoding=None) + Read and parse the list of named configuration files, given by + name. 
A single filename is also allowed. Non-existing files + are ignored. Return list of successfully read files. + + read_file(f, filename=None) + Read and parse one configuration file, given as a file object. + The filename defaults to f.name; it is only used in error + messages (if f has no `name' attribute, the string `<???>' is used). + + read_string(string) + Read configuration from a given string. + + read_dict(dictionary) + Read configuration from a dictionary. Keys are section names, + values are dictionaries with keys and values that should be present + in the section. If the used dictionary type preserves order, sections + and their keys will be added in order. Values are automatically + converted to strings. + + get(section, option, raw=False, vars=None, fallback=_UNSET) + Return a string value for the named option. All % interpolations are + expanded in the return values, based on the defaults passed into the + constructor and the DEFAULT section. Additional substitutions may be + provided using the `vars' argument, which must be a dictionary whose + contents override any pre-existing defaults. If `option' is a key in + `vars', the value from `vars' is used. + + getint(section, options, raw=False, vars=None, fallback=_UNSET) + Like get(), but convert value to an integer. + + getfloat(section, options, raw=False, vars=None, fallback=_UNSET) + Like get(), but convert value to a float. + + getboolean(section, options, raw=False, vars=None, fallback=_UNSET) + Like get(), but convert value to a boolean (currently case + insensitively defined as 0, false, no, off for False, and 1, true, + yes, on for True). Returns False or True. + + items(section=_UNSET, raw=False, vars=None) + If section is given, return a list of tuples with (name, value) for + each option in the section. Otherwise, return a list of tuples with + (section_name, section_proxy) for each section, including DEFAULTSECT. + + remove_section(section) + Remove the given file section and all its options. + + remove_option(section, option) + Remove the given option from the given section. + + set(section, option, value) + Set the given option. + + write(fp, space_around_delimiters=True) + Write the configuration state in .ini format. If + `space_around_delimiters' is True (the default), delimiters + between keys and values are surrounded by spaces. 
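
# A hedged sketch of the fallback-aware getters described above, written
# against the backport being added here (assuming libs/ is on sys.path; the
# Python 3 stdlib configparser behaves the same way).
from backports.configparser2 import ConfigParser

cp = ConfigParser()
cp.read_dict({'general': {'port': '6767', 'debug': 'no'}})
print(cp.getint('general', 'port'))                    # 6767
print(cp.getboolean('general', 'debug'))               # False
print(cp.get('general', 'branch', fallback='master'))  # 'master'
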
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import MutableMapping +import functools +import io +import itertools +import re +import sys +import warnings + +from backports.configparser2.helpers import OrderedDict as _default_dict +from backports.configparser2.helpers import ChainMap as _ChainMap +from backports.configparser2.helpers import from_none, open, str, PY2 + +__all__ = ["NoSectionError", "DuplicateOptionError", "DuplicateSectionError", + "NoOptionError", "InterpolationError", "InterpolationDepthError", + "InterpolationSyntaxError", "ParsingError", + "MissingSectionHeaderError", + "ConfigParser", "SafeConfigParser", "RawConfigParser", + "DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"] + +DEFAULTSECT = "DEFAULT" + +MAX_INTERPOLATION_DEPTH = 10 + + +# exception classes +class Error(Exception): + """Base class for ConfigParser exceptions.""" + + def __init__(self, msg=''): + self.message = msg + Exception.__init__(self, msg) + + def __repr__(self): + return self.message + + __str__ = __repr__ + + +class NoSectionError(Error): + """Raised when no section matches a requested option.""" + + def __init__(self, section): + Error.__init__(self, 'No section: %r' % (section,)) + self.section = section + self.args = (section, ) + + +class DuplicateSectionError(Error): + """Raised when a section is repeated in an input source. + + Possible repetitions that raise this exception are: multiple creation + using the API or in strict parsers when a section is found more than once + in a single input file, string or dictionary. + """ + + def __init__(self, section, source=None, lineno=None): + msg = [repr(section), " already exists"] + if source is not None: + message = ["While reading from ", repr(source)] + if lineno is not None: + message.append(" [line {0:2d}]".format(lineno)) + message.append(": section ") + message.extend(msg) + msg = message + else: + msg.insert(0, "Section ") + Error.__init__(self, "".join(msg)) + self.section = section + self.source = source + self.lineno = lineno + self.args = (section, source, lineno) + + +class DuplicateOptionError(Error): + """Raised by strict parsers when an option is repeated in an input source. + + Current implementation raises this exception only when an option is found + more than once in a single file, string or dictionary. 
+ """ + + def __init__(self, section, option, source=None, lineno=None): + msg = [repr(option), " in section ", repr(section), + " already exists"] + if source is not None: + message = ["While reading from ", repr(source)] + if lineno is not None: + message.append(" [line {0:2d}]".format(lineno)) + message.append(": option ") + message.extend(msg) + msg = message + else: + msg.insert(0, "Option ") + Error.__init__(self, "".join(msg)) + self.section = section + self.option = option + self.source = source + self.lineno = lineno + self.args = (section, option, source, lineno) + + +class NoOptionError(Error): + """A requested option was not found.""" + + def __init__(self, option, section): + Error.__init__(self, "No option %r in section: %r" % + (option, section)) + self.option = option + self.section = section + self.args = (option, section) + + +class InterpolationError(Error): + """Base class for interpolation-related exceptions.""" + + def __init__(self, option, section, msg): + Error.__init__(self, msg) + self.option = option + self.section = section + self.args = (option, section, msg) + + +class InterpolationMissingOptionError(InterpolationError): + """A string substitution required a setting which was not available.""" + + def __init__(self, option, section, rawval, reference): + msg = ("Bad value substitution:\n" + "\tsection: [%s]\n" + "\toption : %s\n" + "\tkey : %s\n" + "\trawval : %s\n" + % (section, option, reference, rawval)) + InterpolationError.__init__(self, option, section, msg) + self.reference = reference + self.args = (option, section, rawval, reference) + + +class InterpolationSyntaxError(InterpolationError): + """Raised when the source text contains invalid syntax. + + Current implementation raises this exception when the source text into + which substitutions are made does not conform to the required syntax. + """ + + +class InterpolationDepthError(InterpolationError): + """Raised when substitutions are nested too deeply.""" + + def __init__(self, option, section, rawval): + msg = ("Value interpolation too deeply recursive:\n" + "\tsection: [%s]\n" + "\toption : %s\n" + "\trawval : %s\n" + % (section, option, rawval)) + InterpolationError.__init__(self, option, section, msg) + self.args = (option, section, rawval) + + +class ParsingError(Error): + """Raised when a configuration file does not follow legal syntax.""" + + def __init__(self, source=None, filename=None): + # Exactly one of `source'/`filename' arguments has to be given. + # `filename' kept for compatibility. + if filename and source: + raise ValueError("Cannot specify both `filename' and `source'. " + "Use `source'.") + elif not filename and not source: + raise ValueError("Required argument `source' not given.") + elif filename: + source = filename + Error.__init__(self, 'Source contains parsing errors: %r' % source) + self.source = source + self.errors = [] + self.args = (source, ) + + @property + def filename(self): + """Deprecated, use `source'.""" + warnings.warn( + "The 'filename' attribute will be removed in future versions. " + "Use 'source' instead.", + DeprecationWarning, stacklevel=2 + ) + return self.source + + @filename.setter + def filename(self, value): + """Deprecated, user `source'.""" + warnings.warn( + "The 'filename' attribute will be removed in future versions. 
" + "Use 'source' instead.", + DeprecationWarning, stacklevel=2 + ) + self.source = value + + def append(self, lineno, line): + self.errors.append((lineno, line)) + self.message += '\n\t[line %2d]: %s' % (lineno, line) + + +class MissingSectionHeaderError(ParsingError): + """Raised when a key-value pair is found before any section header.""" + + def __init__(self, filename, lineno, line): + Error.__init__( + self, + 'File contains no section headers.\nfile: %r, line: %d\n%r' % + (filename, lineno, line)) + self.source = filename + self.lineno = lineno + self.line = line + self.args = (filename, lineno, line) + + +# Used in parser getters to indicate the default behaviour when a specific +# option is not found it to raise an exception. Created to enable `None' as +# a valid fallback value. +_UNSET = object() + + +class Interpolation(object): + """Dummy interpolation that passes the value through with no changes.""" + + def before_get(self, parser, section, option, value, defaults): + return value + + def before_set(self, parser, section, option, value): + return value + + def before_read(self, parser, section, option, value): + return value + + def before_write(self, parser, section, option, value): + return value + + +class BasicInterpolation(Interpolation): + """Interpolation as implemented in the classic ConfigParser. + + The option values can contain format strings which refer to other values in + the same section, or values in the special default section. + + For example: + + something: %(dir)s/whatever + + would resolve the "%(dir)s" to the value of dir. All reference + expansions are done late, on demand. If a user needs to use a bare % in + a configuration file, she can escape it by writing %%. Other other % usage + is considered a user error and raises `InterpolationSyntaxError'.""" + + _KEYCRE = re.compile(r"%\(([^)]+)\)s") + + def before_get(self, parser, section, option, value, defaults): + L = [] + self._interpolate_some(parser, option, L, value, section, defaults, 1) + return ''.join(L) + + def before_set(self, parser, section, option, value): + tmp_value = value.replace('%%', '') # escaped percent signs + tmp_value = self._KEYCRE.sub('', tmp_value) # valid syntax + if '%' in tmp_value: + raise ValueError("invalid interpolation syntax in %r at " + "position %d" % (value, tmp_value.find('%'))) + return value + + def _interpolate_some(self, parser, option, accum, rest, section, map, + depth): + if depth > MAX_INTERPOLATION_DEPTH: + raise InterpolationDepthError(option, section, rest) + while rest: + p = rest.find("%") + if p < 0: + accum.append(rest) + return + if p > 0: + accum.append(rest[:p]) + rest = rest[p:] + # p is no longer used + c = rest[1:2] + if c == "%": + accum.append("%") + rest = rest[2:] + elif c == "(": + m = self._KEYCRE.match(rest) + if m is None: + raise InterpolationSyntaxError(option, section, + "bad interpolation variable reference %r" % rest) + var = parser.optionxform(m.group(1)) + rest = rest[m.end():] + try: + v = map[var] + except KeyError: + raise from_none(InterpolationMissingOptionError( + option, section, rest, var)) + if "%" in v: + self._interpolate_some(parser, option, accum, v, + section, map, depth + 1) + else: + accum.append(v) + else: + raise InterpolationSyntaxError( + option, section, + "'%%' must be followed by '%%' or '(', " + "found: %r" % (rest,)) + + +class ExtendedInterpolation(Interpolation): + """Advanced variant of interpolation, supports the syntax used by + `zc.buildout'. 
Enables interpolation between sections.""" + + _KEYCRE = re.compile(r"\$\{([^}]+)\}") + + def before_get(self, parser, section, option, value, defaults): + L = [] + self._interpolate_some(parser, option, L, value, section, defaults, 1) + return ''.join(L) + + def before_set(self, parser, section, option, value): + tmp_value = value.replace('$$', '') # escaped dollar signs + tmp_value = self._KEYCRE.sub('', tmp_value) # valid syntax + if '$' in tmp_value: + raise ValueError("invalid interpolation syntax in %r at " + "position %d" % (value, tmp_value.find('$'))) + return value + + def _interpolate_some(self, parser, option, accum, rest, section, map, + depth): + if depth > MAX_INTERPOLATION_DEPTH: + raise InterpolationDepthError(option, section, rest) + while rest: + p = rest.find("$") + if p < 0: + accum.append(rest) + return + if p > 0: + accum.append(rest[:p]) + rest = rest[p:] + # p is no longer used + c = rest[1:2] + if c == "$": + accum.append("$") + rest = rest[2:] + elif c == "{": + m = self._KEYCRE.match(rest) + if m is None: + raise InterpolationSyntaxError(option, section, + "bad interpolation variable reference %r" % rest) + path = m.group(1).split(':') + rest = rest[m.end():] + sect = section + opt = option + try: + if len(path) == 1: + opt = parser.optionxform(path[0]) + v = map[opt] + elif len(path) == 2: + sect = path[0] + opt = parser.optionxform(path[1]) + v = parser.get(sect, opt, raw=True) + else: + raise InterpolationSyntaxError( + option, section, + "More than one ':' found: %r" % (rest,)) + except (KeyError, NoSectionError, NoOptionError): + raise from_none(InterpolationMissingOptionError( + option, section, rest, ":".join(path))) + if "$" in v: + self._interpolate_some(parser, opt, accum, v, sect, + dict(parser.items(sect, raw=True)), + depth + 1) + else: + accum.append(v) + else: + raise InterpolationSyntaxError( + option, section, + "'$' must be followed by '$' or '{', " + "found: %r" % (rest,)) + + +class LegacyInterpolation(Interpolation): + """Deprecated interpolation used in old versions of ConfigParser. + Use BasicInterpolation or ExtendedInterpolation instead.""" + + _KEYCRE = re.compile(r"%\(([^)]*)\)s|.") + + def before_get(self, parser, section, option, value, vars): + rawval = value + depth = MAX_INTERPOLATION_DEPTH + while depth: # Loop through this until it's done + depth -= 1 + if value and "%(" in value: + replace = functools.partial(self._interpolation_replace, + parser=parser) + value = self._KEYCRE.sub(replace, value) + try: + value = value % vars + except KeyError as e: + raise from_none(InterpolationMissingOptionError( + option, section, rawval, e.args[0])) + else: + break + if value and "%(" in value: + raise InterpolationDepthError(option, section, rawval) + return value + + def before_set(self, parser, section, option, value): + return value + + @staticmethod + def _interpolation_replace(match, parser): + s = match.group(1) + if s is None: + return match.group() + else: + return "%%(%s)s" % parser.optionxform(s) + + +class RawConfigParser(MutableMapping): + """ConfigParser that does not do interpolation.""" + + # Regular expressions for parsing section headers and options + _SECT_TMPL = r""" + \[ # [ + (?P<header>[^]]+) # very permissive! + \] # ] + """ + _OPT_TMPL = r""" + (?P<option>.*?) # very permissive! + \s*(?P<vi>{delim})\s* # any number of space/tab, + # followed by any of the + # allowed delimiters, + # followed by any space/tab + (?P<value>.*)$ # everything up to eol + """ + _OPT_NV_TMPL = r""" + (?P<option>.*?) # very permissive! 
+ \s*(?: # any number of space/tab, + (?P<vi>{delim})\s* # optionally followed by + # any of the allowed + # delimiters, followed by any + # space/tab + (?P<value>.*))?$ # everything up to eol + """ + # Interpolation algorithm to be used if the user does not specify another + _DEFAULT_INTERPOLATION = Interpolation() + # Compiled regular expression for matching sections + SECTCRE = re.compile(_SECT_TMPL, re.VERBOSE) + # Compiled regular expression for matching options with typical separators + OPTCRE = re.compile(_OPT_TMPL.format(delim="=|:"), re.VERBOSE) + # Compiled regular expression for matching options with optional values + # delimited using typical separators + OPTCRE_NV = re.compile(_OPT_NV_TMPL.format(delim="=|:"), re.VERBOSE) + # Compiled regular expression for matching leading whitespace in a line + NONSPACECRE = re.compile(r"\S") + # Possible boolean values in the configuration. + BOOLEAN_STATES = {'1': True, 'yes': True, 'true': True, 'on': True, + '0': False, 'no': False, 'false': False, 'off': False} + + def __init__(self, defaults=None, dict_type=_default_dict, + allow_no_value=False, **kwargs): + + # keyword-only arguments + delimiters = kwargs.get('delimiters', ('=', ':')) + comment_prefixes = kwargs.get('comment_prefixes', ('#', ';')) + inline_comment_prefixes = kwargs.get('inline_comment_prefixes', None) + strict = kwargs.get('strict', True) + empty_lines_in_values = kwargs.get('empty_lines_in_values', True) + default_section = kwargs.get('default_section', DEFAULTSECT) + interpolation = kwargs.get('interpolation', _UNSET) + + self._dict = dict_type + self._sections = self._dict() + self._defaults = self._dict() + self._proxies = self._dict() + self._proxies[default_section] = SectionProxy(self, default_section) + if defaults: + for key, value in defaults.items(): + self._defaults[self.optionxform(key)] = value + self._delimiters = tuple(delimiters) + if delimiters == ('=', ':'): + self._optcre = self.OPTCRE_NV if allow_no_value else self.OPTCRE + else: + d = "|".join(re.escape(d) for d in delimiters) + if allow_no_value: + self._optcre = re.compile(self._OPT_NV_TMPL.format(delim=d), + re.VERBOSE) + else: + self._optcre = re.compile(self._OPT_TMPL.format(delim=d), + re.VERBOSE) + self._comment_prefixes = tuple(comment_prefixes or ()) + self._inline_comment_prefixes = tuple(inline_comment_prefixes or ()) + self._strict = strict + self._allow_no_value = allow_no_value + self._empty_lines_in_values = empty_lines_in_values + self.default_section=default_section + self._interpolation = interpolation + if self._interpolation is _UNSET: + self._interpolation = self._DEFAULT_INTERPOLATION + if self._interpolation is None: + self._interpolation = Interpolation() + + def defaults(self): + return self._defaults + + def sections(self): + """Return a list of section names, excluding [DEFAULT]""" + # self._sections will never have [DEFAULT] in it + return list(self._sections.keys()) + + def add_section(self, section): + """Create a new section in the configuration. + + Raise DuplicateSectionError if a section by the specified name + already exists. Raise ValueError if name is DEFAULT. + """ + if section == self.default_section: + raise ValueError('Invalid section name: %r' % section) + + if section in self._sections: + raise DuplicateSectionError(section) + self._sections[section] = self._dict() + self._proxies[section] = SectionProxy(self, section) + + def has_section(self, section): + """Indicate whether the named section is present in the configuration. 
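
# The **kwargs handling in __init__ above emulates Python 3 keyword-only
# arguments on Python 2. A hedged sketch of the options it unlocks:
from backports.configparser2 import ConfigParser

cp = ConfigParser(allow_no_value=True, delimiters=('=',),
                  inline_comment_prefixes=(';',))
cp.read_string(u"[flags]\nverbose  ; no value needed\nlevel = 2\n")
print(cp.get('flags', 'verbose'))   # None (valueless option)
print(cp.getint('flags', 'level'))  # 2
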
+ + The DEFAULT section is not acknowledged. + """ + return section in self._sections + + def options(self, section): + """Return a list of option names for the given section name.""" + try: + opts = self._sections[section].copy() + except KeyError: + raise from_none(NoSectionError(section)) + opts.update(self._defaults) + return list(opts.keys()) + + def read(self, filenames, encoding=None): + """Read and parse a filename or a list of filenames. + + Files that cannot be opened are silently ignored; this is + designed so that you can specify a list of potential + configuration file locations (e.g. current directory, user's + home directory, systemwide directory), and all existing + configuration files in the list will be read. A single + filename may also be given. + + Return list of successfully read files. + """ + if PY2 and isinstance(filenames, bytes): + # we allow for a little unholy magic for Python 2 so that + # people not using unicode_literals can still use the library + # conveniently + warnings.warn( + "You passed a bytestring as `filenames`. This will not work" + " on Python 3. Use `cp.read_file()` or switch to using Unicode" + " strings across the board.", + DeprecationWarning, + stacklevel=2, + ) + filenames = [filenames] + elif isinstance(filenames, str): + filenames = [filenames] + read_ok = [] + for filename in filenames: + try: + with open(filename, encoding=encoding) as fp: + self._read(fp, filename) + except IOError: + continue + read_ok.append(filename) + return read_ok + + def read_file(self, f, source=None): + """Like read() but the argument must be a file-like object. + + The `f' argument must be iterable, returning one line at a time. + Optional second argument is the `source' specifying the name of the + file being read. If not given, it is taken from f.name. If `f' has no + `name' attribute, `<???>' is used. + """ + if source is None: + try: + source = f.name + except AttributeError: + source = '<???>' + self._read(f, source) + + def read_string(self, string, source='<string>'): + """Read configuration from a given string.""" + sfile = io.StringIO(string) + self.read_file(sfile, source) + + def read_dict(self, dictionary, source='<dict>'): + """Read configuration from a dictionary. + + Keys are section names, values are dictionaries with keys and values + that should be present in the section. If the used dictionary type + preserves order, sections and their keys will be added in order. + + All types held in the dictionary are converted to strings during + reading, including section names, option names and keys. + + Optional second argument is the `source' specifying the name of the + dictionary being read. + """ + elements_added = set() + for section, keys in dictionary.items(): + section = str(section) + try: + self.add_section(section) + except (DuplicateSectionError, ValueError): + if self._strict and section in elements_added: + raise + elements_added.add(section) + for key, value in keys.items(): + key = self.optionxform(str(key)) + if value is not None: + value = str(value) + if self._strict and (section, key) in elements_added: + raise DuplicateOptionError(section, key, source) + elements_added.add((section, key)) + self.set(section, key, value) + + def readfp(self, fp, filename=None): + """Deprecated, use read_file instead.""" + warnings.warn( + "This method will be removed in future versions. 
" + "Use 'parser.read_file()' instead.", + DeprecationWarning, stacklevel=2 + ) + self.read_file(fp, source=filename) + + def get(self, section, option, **kwargs): + """Get an option value for a given section. + + If `vars' is provided, it must be a dictionary. The option is looked up + in `vars' (if provided), `section', and in `DEFAULTSECT' in that order. + If the key is not found and `fallback' is provided, it is used as + a fallback value. `None' can be provided as a `fallback' value. + + If interpolation is enabled and the optional argument `raw' is False, + all interpolations are expanded in the return values. + + Arguments `raw', `vars', and `fallback' are keyword only. + + The section DEFAULT is special. + """ + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + fallback = kwargs.get('fallback', _UNSET) + + try: + d = self._unify_values(section, vars) + except NoSectionError: + if fallback is _UNSET: + raise + else: + return fallback + option = self.optionxform(option) + try: + value = d[option] + except KeyError: + if fallback is _UNSET: + raise NoOptionError(option, section) + else: + return fallback + + if raw or value is None: + return value + else: + return self._interpolation.before_get(self, section, option, value, + d) + + def _get(self, section, conv, option, **kwargs): + return conv(self.get(section, option, **kwargs)) + + def getint(self, section, option, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + fallback = kwargs.get('fallback', _UNSET) + + try: + return self._get(section, int, option, raw=raw, vars=vars) + except (NoSectionError, NoOptionError): + if fallback is _UNSET: + raise + else: + return fallback + + def getfloat(self, section, option, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + fallback = kwargs.get('fallback', _UNSET) + + try: + return self._get(section, float, option, raw=raw, vars=vars) + except (NoSectionError, NoOptionError): + if fallback is _UNSET: + raise + else: + return fallback + + def getboolean(self, section, option, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + fallback = kwargs.get('fallback', _UNSET) + + try: + return self._get(section, self._convert_to_boolean, option, + raw=raw, vars=vars) + except (NoSectionError, NoOptionError): + if fallback is _UNSET: + raise + else: + return fallback + + def items(self, section=_UNSET, raw=False, vars=None): + """Return a list of (name, value) tuples for each option in a section. + + All % interpolations are expanded in the return values, based on the + defaults passed into the constructor, unless the optional argument + `raw' is true. Additional substitutions may be provided using the + `vars' argument, which must be a dictionary whose contents overrides + any pre-existing defaults. + + The section DEFAULT is special. 
+ """ + if section is _UNSET: + return super(RawConfigParser, self).items() + d = self._defaults.copy() + try: + d.update(self._sections[section]) + except KeyError: + if section != self.default_section: + raise NoSectionError(section) + # Update with the entry specific variables + if vars: + for key, value in vars.items(): + d[self.optionxform(key)] = value + value_getter = lambda option: self._interpolation.before_get(self, + section, option, d[option], d) + if raw: + value_getter = lambda option: d[option] + return [(option, value_getter(option)) for option in d.keys()] + + def popitem(self): + """Remove a section from the parser and return it as + a (section_name, section_proxy) tuple. If no section is present, raise + KeyError. + + The section DEFAULT is never returned because it cannot be removed. + """ + for key in self.sections(): + value = self[key] + del self[key] + return key, value + raise KeyError + + def optionxform(self, optionstr): + return optionstr.lower() + + def has_option(self, section, option): + """Check for the existence of a given option in a given section. + If the specified `section' is None or an empty string, DEFAULT is + assumed. If the specified `section' does not exist, returns False.""" + if not section or section == self.default_section: + option = self.optionxform(option) + return option in self._defaults + elif section not in self._sections: + return False + else: + option = self.optionxform(option) + return (option in self._sections[section] + or option in self._defaults) + + def set(self, section, option, value=None): + """Set an option.""" + if value: + value = self._interpolation.before_set(self, section, option, + value) + if not section or section == self.default_section: + sectdict = self._defaults + else: + try: + sectdict = self._sections[section] + except KeyError: + raise from_none(NoSectionError(section)) + sectdict[self.optionxform(option)] = value + + def write(self, fp, space_around_delimiters=True): + """Write an .ini-format representation of the configuration state. + + If `space_around_delimiters' is True (the default), delimiters + between keys and values are surrounded by spaces. 
+ """ + if space_around_delimiters: + d = " {0} ".format(self._delimiters[0]) + else: + d = self._delimiters[0] + if self._defaults: + self._write_section(fp, self.default_section, + self._defaults.items(), d) + for section in self._sections: + self._write_section(fp, section, + self._sections[section].items(), d) + + def _write_section(self, fp, section_name, section_items, delimiter): + """Write a single section to the specified `fp'.""" + fp.write("[{0}]\n".format(section_name)) + for key, value in section_items: + value = self._interpolation.before_write(self, section_name, key, + value) + if value is not None or not self._allow_no_value: + value = delimiter + str(value).replace('\n', '\n\t') + else: + value = "" + fp.write("{0}{1}\n".format(key, value)) + fp.write("\n") + + def remove_option(self, section, option): + """Remove an option.""" + if not section or section == self.default_section: + sectdict = self._defaults + else: + try: + sectdict = self._sections[section] + except KeyError: + raise from_none(NoSectionError(section)) + option = self.optionxform(option) + existed = option in sectdict + if existed: + del sectdict[option] + return existed + + def remove_section(self, section): + """Remove a file section.""" + existed = section in self._sections + if existed: + del self._sections[section] + del self._proxies[section] + return existed + + def __getitem__(self, key): + if key != self.default_section and not self.has_section(key): + raise KeyError(key) + return self._proxies[key] + + def __setitem__(self, key, value): + # To conform with the mapping protocol, overwrites existing values in + # the section. + + # XXX this is not atomic if read_dict fails at any point. Then again, + # no update method in configparser is atomic in this implementation. + if key == self.default_section: + self._defaults.clear() + elif key in self._sections: + self._sections[key].clear() + self.read_dict({key: value}) + + def __delitem__(self, key): + if key == self.default_section: + raise ValueError("Cannot remove the default section.") + if not self.has_section(key): + raise KeyError(key) + self.remove_section(key) + + def __contains__(self, key): + return key == self.default_section or self.has_section(key) + + def __len__(self): + return len(self._sections) + 1 # the default section + + def __iter__(self): + # XXX does it break when underlying container state changed? + return itertools.chain((self.default_section,), self._sections.keys()) + + def _read(self, fp, fpname): + """Parse a sectioned configuration file. + + Each section in a configuration file contains a header, indicated by + a name in square brackets (`[]'), plus key/value options, indicated by + `name' and `value' delimited with a specific substring (`=' or `:' by + default). + + Values can span multiple lines, as long as they are indented deeper + than the first line of the value. Depending on the parser's mode, blank + lines may be treated as parts of multiline values or ignored. + + Configuration files may include comments, prefixed by specific + characters (`#' and `;' by default). Comments may appear on their own + in an otherwise empty line or may be entered in lines holding values or + section names. 
+ """ + elements_added = set() + cursect = None # None, or a dictionary + sectname = None + optname = None + lineno = 0 + indent_level = 0 + e = None # None, or an exception + for lineno, line in enumerate(fp, start=1): + comment_start = sys.maxsize + # strip inline comments + inline_prefixes = dict( + (p, -1) for p in self._inline_comment_prefixes) + while comment_start == sys.maxsize and inline_prefixes: + next_prefixes = {} + for prefix, index in inline_prefixes.items(): + index = line.find(prefix, index+1) + if index == -1: + continue + next_prefixes[prefix] = index + if index == 0 or (index > 0 and line[index-1].isspace()): + comment_start = min(comment_start, index) + inline_prefixes = next_prefixes + # strip full line comments + for prefix in self._comment_prefixes: + if line.strip().startswith(prefix): + comment_start = 0 + break + if comment_start == sys.maxsize: + comment_start = None + value = line[:comment_start].strip() + if not value: + if self._empty_lines_in_values: + # add empty line to the value, but only if there was no + # comment on the line + if (comment_start is None and + cursect is not None and + optname and + cursect[optname] is not None): + cursect[optname].append('') # newlines added at join + else: + # empty line marks end of value + indent_level = sys.maxsize + continue + # continuation line? + first_nonspace = self.NONSPACECRE.search(line) + cur_indent_level = first_nonspace.start() if first_nonspace else 0 + if (cursect is not None and optname and + cur_indent_level > indent_level): + cursect[optname].append(value) + # a section header or option header? + else: + indent_level = cur_indent_level + # is it a section header? + mo = self.SECTCRE.match(value) + if mo: + sectname = mo.group('header') + if sectname in self._sections: + if self._strict and sectname in elements_added: + raise DuplicateSectionError(sectname, fpname, + lineno) + cursect = self._sections[sectname] + elements_added.add(sectname) + elif sectname == self.default_section: + cursect = self._defaults + else: + cursect = self._dict() + self._sections[sectname] = cursect + self._proxies[sectname] = SectionProxy(self, sectname) + elements_added.add(sectname) + # So sections can't start with a continuation line + optname = None + # no section header in the file? + elif cursect is None: + raise MissingSectionHeaderError(fpname, lineno, line) + # an option line? + else: + mo = self._optcre.match(value) + if mo: + optname, vi, optval = mo.group('option', 'vi', 'value') + if not optname: + e = self._handle_error(e, fpname, lineno, line) + optname = self.optionxform(optname.rstrip()) + if (self._strict and + (sectname, optname) in elements_added): + raise DuplicateOptionError(sectname, optname, + fpname, lineno) + elements_added.add((sectname, optname)) + # This check is fine because the OPTCRE cannot + # match if it would set optval to None + if optval is not None: + optval = optval.strip() + cursect[optname] = [optval] + else: + # valueless option handling + cursect[optname] = None + else: + # a non-fatal parsing error occurred. set up the + # exception but keep going. 
the exception will be + # raised at the end of the file and will contain a + # list of all bogus lines + e = self._handle_error(e, fpname, lineno, line) + # if any parsing errors occurred, raise an exception + if e: + raise e + self._join_multiline_values() + + def _join_multiline_values(self): + defaults = self.default_section, self._defaults + all_sections = itertools.chain((defaults,), + self._sections.items()) + for section, options in all_sections: + for name, val in options.items(): + if isinstance(val, list): + val = '\n'.join(val).rstrip() + options[name] = self._interpolation.before_read(self, + section, + name, val) + + def _handle_error(self, exc, fpname, lineno, line): + if not exc: + exc = ParsingError(fpname) + exc.append(lineno, repr(line)) + return exc + + def _unify_values(self, section, vars): + """Create a sequence of lookups with 'vars' taking priority over + the 'section' which takes priority over the DEFAULTSECT. + + """ + sectiondict = {} + try: + sectiondict = self._sections[section] + except KeyError: + if section != self.default_section: + raise NoSectionError(section) + # Update with the entry specific variables + vardict = {} + if vars: + for key, value in vars.items(): + if value is not None: + value = str(value) + vardict[self.optionxform(key)] = value + return _ChainMap(vardict, sectiondict, self._defaults) + + def _convert_to_boolean(self, value): + """Return a boolean value translating from other types if necessary. + """ + if value.lower() not in self.BOOLEAN_STATES: + raise ValueError('Not a boolean: %s' % value) + return self.BOOLEAN_STATES[value.lower()] + + def _validate_value_types(self, **kwargs): + """Raises a TypeError for non-string values. + + The only legal non-string value if we allow valueless + options is None, so we need to check if the value is a + string if: + - we do not allow valueless options, or + - we allow valueless options but the value is not None + + For compatibility reasons this method is not used in classic set() + for RawConfigParsers. It is invoked in every case for mapping protocol + access and in ConfigParser.set(). + """ + # keyword-only arguments + section = kwargs.get('section', "") + option = kwargs.get('option', "") + value = kwargs.get('value', "") + + if PY2 and bytes in (type(section), type(option), type(value)): + # we allow for a little unholy magic for Python 2 so that + # people not using unicode_literals can still use the library + # conveniently + warnings.warn( + "You passed a bytestring. Implicitly decoding as UTF-8 string." + " This will not work on Python 3. Please switch to using" + " Unicode strings across the board.", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(section, bytes): + section = section.decode('utf8') + if isinstance(option, bytes): + option = option.decode('utf8') + if isinstance(value, bytes): + value = value.decode('utf8') + + if not isinstance(section, str): + raise TypeError("section names must be strings") + if not isinstance(option, str): + raise TypeError("option keys must be strings") + if not self._allow_no_value or value: + if not isinstance(value, str): + raise TypeError("option values must be strings") + + return section, option, value + + +class ConfigParser(RawConfigParser): + """ConfigParser implementing interpolation.""" + + _DEFAULT_INTERPOLATION = BasicInterpolation() + + def set(self, section, option, value=None): + """Set an option. 
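Taken together, the machinery above (SECTCRE/_optcre matching in _read(), _unify_values() layering a _ChainMap of vars over the section over DEFAULT, and _convert_to_boolean() restricted to the BOOLEAN_STATES keys) behaves as in this minimal sketch, import path as before:

from backports import configparser2
import io

text = u"""\
[DEFAULT]
debug = no

[paths]
# a full-line comment
movies = /data/movies
    /data/more-movies   ; inline comments need leading whitespace
"""
parser = configparser2.ConfigParser(inline_comment_prefixes=(';',))
parser.read_string(text)

parser.get('paths', 'movies')       # u'/data/movies\n/data/more-movies'
parser.getboolean('paths', 'debug')                        # False, from DEFAULT
parser.getboolean('paths', 'debug', vars={'debug': 'on'})  # True, vars win
parser.get('paths', 'host', fallback='0.0.0.0')            # missing -> fallback

out = io.StringIO()
parser.write(out)   # round-trips; delimiters padded unless
                    # space_around_delimiters=False

In strict mode (the default) duplicate sections or options raise immediately during _read(), while lines that match neither SECTCRE nor the option regex are accumulated into a single ParsingError raised at end of file.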
Extends RawConfigParser.set by validating type and + interpolation syntax on the value.""" + _, option, value = self._validate_value_types(option=option, value=value) + super(ConfigParser, self).set(section, option, value) + + def add_section(self, section): + """Create a new section in the configuration. Extends + RawConfigParser.add_section by validating if the section name is + a string.""" + section, _, _ = self._validate_value_types(section=section) + super(ConfigParser, self).add_section(section) + + +class SafeConfigParser(ConfigParser): + """ConfigParser alias for backwards compatibility purposes.""" + + def __init__(self, *args, **kwargs): + super(SafeConfigParser, self).__init__(*args, **kwargs) + warnings.warn( + "The SafeConfigParser class has been renamed to ConfigParser " + "in Python 3.2. This alias will be removed in future versions." + " Use ConfigParser directly instead.", + DeprecationWarning, stacklevel=2 + ) + + +class SectionProxy(MutableMapping): + """A proxy for a single section from a parser.""" + + def __init__(self, parser, name): + """Creates a view on a section of the specified `name` in `parser`.""" + self._parser = parser + self._name = name + + def __repr__(self): + return '<Section: {0}>'.format(self._name) + + def __getitem__(self, key): + if not self._parser.has_option(self._name, key): + raise KeyError(key) + return self._parser.get(self._name, key) + + def __setitem__(self, key, value): + _, key, value = self._parser._validate_value_types(option=key, value=value) + return self._parser.set(self._name, key, value) + + def __delitem__(self, key): + if not (self._parser.has_option(self._name, key) and + self._parser.remove_option(self._name, key)): + raise KeyError(key) + + def __contains__(self, key): + return self._parser.has_option(self._name, key) + + def __len__(self): + return len(self._options()) + + def __iter__(self): + return self._options().__iter__() + + def _options(self): + if self._name != self._parser.default_section: + return self._parser.options(self._name) + else: + return self._parser.defaults() + + def get(self, option, fallback=None, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + + return self._parser.get(self._name, option, raw=raw, vars=vars, + fallback=fallback) + + def getint(self, option, fallback=None, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + + return self._parser.getint(self._name, option, raw=raw, vars=vars, + fallback=fallback) + + def getfloat(self, option, fallback=None, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + + return self._parser.getfloat(self._name, option, raw=raw, vars=vars, + fallback=fallback) + + def getboolean(self, option, fallback=None, **kwargs): + # keyword-only arguments + raw = kwargs.get('raw', False) + vars = kwargs.get('vars', None) + + return self._parser.getboolean(self._name, option, raw=raw, vars=vars, + fallback=fallback) + + @property + def parser(self): + # The parser object of the proxy is read-only. + return self._parser + + @property + def name(self): + # The name of the section on a proxy is read-only. 
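SectionProxy, whose final name property the return just below completes, is what makes parser['section'] behave like a mutable mapping with typed getters that delegate back to the parser. For instance:

from backports import configparser2

parser = configparser2.ConfigParser()
parser.read_string(u"[server]\nport = 8080\n")

proxy = parser['server']       # a live SectionProxy view, not a copy
proxy['port']                  # u'8080'
proxy.getint('port')           # 8080
'host' in proxy                # False
proxy['host'] = '127.0.0.1'    # validated, then written through to the parser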
+ return self._name diff --git a/libs/backports/configparser2/helpers.py b/libs/backports/configparser2/helpers.py new file mode 100644 index 000000000..c47662f3c --- /dev/null +++ b/libs/backports/configparser2/helpers.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import MutableMapping +try: + from collections import UserDict +except ImportError: + from UserDict import UserDict + +try: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict + +from io import open +import sys +try: + from thread import get_ident +except ImportError: + try: + from _thread import get_ident + except ImportError: + from _dummy_thread import get_ident + + +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 + +str = type('str') + + +def from_none(exc): + """raise from_none(ValueError('a')) == raise ValueError('a') from None""" + exc.__cause__ = None + exc.__suppress_context__ = True + return exc + + +# from reprlib 3.2.1 +def recursive_repr(fillvalue='...'): + 'Decorator to make a repr function return fillvalue for a recursive call' + + def decorating_function(user_function): + repr_running = set() + + def wrapper(self): + key = id(self), get_ident() + if key in repr_running: + return fillvalue + repr_running.add(key) + try: + result = user_function(self) + finally: + repr_running.discard(key) + return result + + # Can't use functools.wraps() here because of bootstrap issues + wrapper.__module__ = getattr(user_function, '__module__') + wrapper.__doc__ = getattr(user_function, '__doc__') + wrapper.__name__ = getattr(user_function, '__name__') + wrapper.__annotations__ = getattr(user_function, '__annotations__', {}) + return wrapper + + return decorating_function + +# from collections 3.2.1 +class _ChainMap(MutableMapping): + ''' A ChainMap groups multiple dicts (or other mappings) together + to create a single, updateable view. + + The underlying mappings are stored in a list. That list is public and can + accessed or updated using the *maps* attribute. There is no other state. + + Lookups search the underlying mappings successively until a key is found. + In contrast, writes, updates, and deletions only operate on the first + mapping. + + ''' + + def __init__(self, *maps): + '''Initialize a ChainMap by setting *maps* to the given mappings. + If no mappings are provided, a single empty dictionary is used. + + ''' + self.maps = list(maps) or [{}] # always at least one map + + def __missing__(self, key): + raise KeyError(key) + + def __getitem__(self, key): + for mapping in self.maps: + try: + return mapping[key] # can't use 'key in mapping' with defaultdict + except KeyError: + pass + return self.__missing__(key) # support subclasses that define __missing__ + + def get(self, key, default=None): + return self[key] if key in self else default + + def __len__(self): + return len(set().union(*self.maps)) # reuses stored hash values if possible + + def __iter__(self): + return iter(set().union(*self.maps)) + + def __contains__(self, key): + return any(key in m for m in self.maps) + + @recursive_repr() + def __repr__(self): + return '{0.__class__.__name__}({1})'.format( + self, ', '.join(map(repr, self.maps))) + + @classmethod + def fromkeys(cls, iterable, *args): + 'Create a ChainMap with a single dict created from the iterable.' 
+ return cls(dict.fromkeys(iterable, *args)) + + def copy(self): + 'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]' + return self.__class__(self.maps[0].copy(), *self.maps[1:]) + + __copy__ = copy + + def new_child(self): # like Django's Context.push() + 'New ChainMap with a new dict followed by all previous maps.' + return self.__class__({}, *self.maps) + + @property + def parents(self): # like Django's Context.pop() + 'New ChainMap from maps[1:].' + return self.__class__(*self.maps[1:]) + + def __setitem__(self, key, value): + self.maps[0][key] = value + + def __delitem__(self, key): + try: + del self.maps[0][key] + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def popitem(self): + 'Remove and return an item pair from maps[0]. Raise KeyError is maps[0] is empty.' + try: + return self.maps[0].popitem() + except KeyError: + raise KeyError('No keys found in the first mapping.') + + def pop(self, key, *args): + 'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].' + try: + return self.maps[0].pop(key, *args) + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def clear(self): + 'Clear maps[0], leaving maps[1:] intact.' + self.maps[0].clear() + + +try: + from collections import ChainMap +except ImportError: + ChainMap = _ChainMap diff --git a/libs/certifi/__init__.py b/libs/certifi/__init__.py index 632db8e13..8e358e4c8 100644 --- a/libs/certifi/__init__.py +++ b/libs/certifi/__init__.py @@ -1,3 +1,3 @@ from .core import where -__version__ = "2019.03.09" +__version__ = "2019.09.11" diff --git a/libs/certifi/cacert.pem b/libs/certifi/cacert.pem index 84636dde7..70fa91f61 100644 --- a/libs/certifi/cacert.pem +++ b/libs/certifi/cacert.pem @@ -771,36 +771,6 @@ vEsXCS+0yx5DaMkHJ8HSXPfqIbloEpw8nL+e/IBcm2PN7EeqJSdnoDfzAIJ9VNep +OkuE6N36B9K -----END CERTIFICATE----- -# Issuer: CN=Class 2 Primary CA O=Certplus -# Subject: CN=Class 2 Primary CA O=Certplus -# Label: "Certplus Class 2 Primary CA" -# Serial: 177770208045934040241468760488327595043 -# MD5 Fingerprint: 88:2c:8c:52:b8:a2:3c:f3:f7:bb:03:ea:ae:ac:42:0b -# SHA1 Fingerprint: 74:20:74:41:72:9c:dd:92:ec:79:31:d8:23:10:8d:c2:81:92:e2:bb -# SHA256 Fingerprint: 0f:99:3c:8a:ef:97:ba:af:56:87:14:0e:d5:9a:d1:82:1b:b4:af:ac:f0:aa:9a:58:b5:d5:7a:33:8a:3a:fb:cb ------BEGIN CERTIFICATE----- -MIIDkjCCAnqgAwIBAgIRAIW9S/PY2uNp9pTXX8OlRCMwDQYJKoZIhvcNAQEFBQAw -PTELMAkGA1UEBhMCRlIxETAPBgNVBAoTCENlcnRwbHVzMRswGQYDVQQDExJDbGFz -cyAyIFByaW1hcnkgQ0EwHhcNOTkwNzA3MTcwNTAwWhcNMTkwNzA2MjM1OTU5WjA9 -MQswCQYDVQQGEwJGUjERMA8GA1UEChMIQ2VydHBsdXMxGzAZBgNVBAMTEkNsYXNz -IDIgUHJpbWFyeSBDQTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBANxQ -ltAS+DXSCHh6tlJw/W/uz7kRy1134ezpfgSN1sxvc0NXYKwzCkTsA18cgCSR5aiR -VhKC9+Ar9NuuYS6JEI1rbLqzAr3VNsVINyPi8Fo3UjMXEuLRYE2+L0ER4/YXJQyL -kcAbmXuZVg2v7tK8R1fjeUl7NIknJITesezpWE7+Tt9avkGtrAjFGA7v0lPubNCd -EgETjdyAYveVqUSISnFOYFWe2yMZeVYHDD9jC1yw4r5+FfyUM1hBOHTE4Y+L3yas -H7WLO7dDWWuwJKZtkIvEcupdM5i3y95ee++U8Rs+yskhwcWYAqqi9lt3m/V+llU0 -HGdpwPFC40es/CgcZlUCAwEAAaOBjDCBiTAPBgNVHRMECDAGAQH/AgEKMAsGA1Ud -DwQEAwIBBjAdBgNVHQ4EFgQU43Mt38sOKAze3bOkynm4jrvoMIkwEQYJYIZIAYb4 -QgEBBAQDAgEGMDcGA1UdHwQwMC4wLKAqoCiGJmh0dHA6Ly93d3cuY2VydHBsdXMu -Y29tL0NSTC9jbGFzczIuY3JsMA0GCSqGSIb3DQEBBQUAA4IBAQCnVM+IRBnL39R/ -AN9WM2K191EBkOvDP9GIROkkXe/nFL0gt5o8AP5tn9uQ3Nf0YtaLcF3n5QRIqWh8 -yfFC82x/xXp8HVGIutIKPidd3i1RTtMTZGnkLuPT55sJmabglZvOGtd/vjzOUrMR -FcEPF80Du5wlFbqidon8BvEY0JNLDnyCt6X09l/+7UCmnYR0ObncHoUW2ikbhiMA 
-ybuJfm6AiB4vFLQDJKgybwOaRywwvlbGp0ICcBvqQNi6BQNwB6SW//1IMwrh3KWB -kJtN3X3n57LNXMhqlfil9o3EXXgIvnsG1knPGTZQIy4I5p4FTUcY1Rbpsda2ENW7 -l7+ijrRU ------END CERTIFICATE----- - # Issuer: CN=DST Root CA X3 O=Digital Signature Trust Co. # Subject: CN=DST Root CA X3 O=Digital Signature Trust Co. # Label: "DST Root CA X3" @@ -1219,36 +1189,6 @@ t0QmwCbAr1UwnjvVNioZBPRcHv/PLLf/0P2HQBHVESO7SMAhqaQoLf0V+LBOK/Qw WyH8EZE0vkHve52Xdf+XlcCWWC/qu0bXu+TZLg== -----END CERTIFICATE----- -# Issuer: CN=Deutsche Telekom Root CA 2 O=Deutsche Telekom AG OU=T-TeleSec Trust Center -# Subject: CN=Deutsche Telekom Root CA 2 O=Deutsche Telekom AG OU=T-TeleSec Trust Center -# Label: "Deutsche Telekom Root CA 2" -# Serial: 38 -# MD5 Fingerprint: 74:01:4a:91:b1:08:c4:58:ce:47:cd:f0:dd:11:53:08 -# SHA1 Fingerprint: 85:a4:08:c0:9c:19:3e:5d:51:58:7d:cd:d6:13:30:fd:8c:de:37:bf -# SHA256 Fingerprint: b6:19:1a:50:d0:c3:97:7f:7d:a9:9b:cd:aa:c8:6a:22:7d:ae:b9:67:9e:c7:0b:a3:b0:c9:d9:22:71:c1:70:d3 ------BEGIN CERTIFICATE----- -MIIDnzCCAoegAwIBAgIBJjANBgkqhkiG9w0BAQUFADBxMQswCQYDVQQGEwJERTEc -MBoGA1UEChMTRGV1dHNjaGUgVGVsZWtvbSBBRzEfMB0GA1UECxMWVC1UZWxlU2Vj -IFRydXN0IENlbnRlcjEjMCEGA1UEAxMaRGV1dHNjaGUgVGVsZWtvbSBSb290IENB -IDIwHhcNOTkwNzA5MTIxMTAwWhcNMTkwNzA5MjM1OTAwWjBxMQswCQYDVQQGEwJE -RTEcMBoGA1UEChMTRGV1dHNjaGUgVGVsZWtvbSBBRzEfMB0GA1UECxMWVC1UZWxl -U2VjIFRydXN0IENlbnRlcjEjMCEGA1UEAxMaRGV1dHNjaGUgVGVsZWtvbSBSb290 -IENBIDIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCrC6M14IspFLEU -ha88EOQ5bzVdSq7d6mGNlUn0b2SjGmBmpKlAIoTZ1KXleJMOaAGtuU1cOs7TuKhC -QN/Po7qCWWqSG6wcmtoIKyUn+WkjR/Hg6yx6m/UTAtB+NHzCnjwAWav12gz1Mjwr -rFDa1sPeg5TKqAyZMg4ISFZbavva4VhYAUlfckE8FQYBjl2tqriTtM2e66foai1S -NNs671x1Udrb8zH57nGYMsRUFUQM+ZtV7a3fGAigo4aKSe5TBY8ZTNXeWHmb0moc -QqvF1afPaA+W5OFhmHZhyJF81j4A4pFQh+GdCuatl9Idxjp9y7zaAzTVjlsB9WoH -txa2bkp/AgMBAAGjQjBAMB0GA1UdDgQWBBQxw3kbuvVT1xfgiXotF2wKsyudMzAP -BgNVHRMECDAGAQH/AgEFMA4GA1UdDwEB/wQEAwIBBjANBgkqhkiG9w0BAQUFAAOC -AQEAlGRZrTlk5ynrE/5aw4sTV8gEJPB0d8Bg42f76Ymmg7+Wgnxu1MM9756Abrsp -tJh6sTtU6zkXR34ajgv8HzFZMQSyzhfzLMdiNlXiItiJVbSYSKpk+tYcNthEeFpa -IzpXl/V6ME+un2pMSyuOoAPjPuCp1NJ70rOo4nI8rZ7/gFnkm0W09juwzTkZmDLl -6iFhkOQxIY40sfcvNUqFENrnijchvllj4PKFiDFT1FQUhXB59C4Gdyd1Lx+4ivn+ -xbrYNuSD7Odlt79jWvNGr4GUN9RBjNYj1h7P9WgbRGOiWrqnNVmh5XAFmw4jV5mU -Cm26OWMohpLzGITY+9HPBVZkVw== ------END CERTIFICATE----- - # Issuer: CN=Cybertrust Global Root O=Cybertrust, Inc # Subject: CN=Cybertrust Global Root O=Cybertrust, Inc # Label: "Cybertrust Global Root" @@ -3453,46 +3393,6 @@ AAoACxGV2lZFA4gKn2fQ1XmxqI1AbQ3CekD6819kR5LLU7m7Wc5P/dAVUwHY3+vZ 5nbv0CO7O6l5s9UCKc2Jo5YPSjXnTkLAdc0Hz+Ys63su -----END CERTIFICATE----- -# Issuer: CN=Certinomis - Root CA O=Certinomis OU=0002 433998903 -# Subject: CN=Certinomis - Root CA O=Certinomis OU=0002 433998903 -# Label: "Certinomis - Root CA" -# Serial: 1 -# MD5 Fingerprint: 14:0a:fd:8d:a8:28:b5:38:69:db:56:7e:61:22:03:3f -# SHA1 Fingerprint: 9d:70:bb:01:a5:a4:a0:18:11:2e:f7:1c:01:b9:32:c5:34:e7:88:a8 -# SHA256 Fingerprint: 2a:99:f5:bc:11:74:b7:3c:bb:1d:62:08:84:e0:1c:34:e5:1c:cb:39:78:da:12:5f:0e:33:26:88:83:bf:41:58 ------BEGIN CERTIFICATE----- -MIIFkjCCA3qgAwIBAgIBATANBgkqhkiG9w0BAQsFADBaMQswCQYDVQQGEwJGUjET -MBEGA1UEChMKQ2VydGlub21pczEXMBUGA1UECxMOMDAwMiA0MzM5OTg5MDMxHTAb -BgNVBAMTFENlcnRpbm9taXMgLSBSb290IENBMB4XDTEzMTAyMTA5MTcxOFoXDTMz -MTAyMTA5MTcxOFowWjELMAkGA1UEBhMCRlIxEzARBgNVBAoTCkNlcnRpbm9taXMx -FzAVBgNVBAsTDjAwMDIgNDMzOTk4OTAzMR0wGwYDVQQDExRDZXJ0aW5vbWlzIC0g -Um9vdCBDQTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBANTMCQosP5L2 -fxSeC5yaah1AMGT9qt8OHgZbn1CF6s2Nq0Nn3rD6foCWnoR4kkjW4znuzuRZWJfl 
-LieY6pOod5tK8O90gC3rMB+12ceAnGInkYjwSond3IjmFPnVAy//ldu9n+ws+hQV -WZUKxkd8aRi5pwP5ynapz8dvtF4F/u7BUrJ1Mofs7SlmO/NKFoL21prbcpjp3vDF -TKWrteoB4owuZH9kb/2jJZOLyKIOSY008B/sWEUuNKqEUL3nskoTuLAPrjhdsKkb -5nPJWqHZZkCqqU2mNAKthH6yI8H7KsZn9DS2sJVqM09xRLWtwHkziOC/7aOgFLSc -CbAK42C++PhmiM1b8XcF4LVzbsF9Ri6OSyemzTUK/eVNfaoqoynHWmgE6OXWk6Ri -wsXm9E/G+Z8ajYJJGYrKWUM66A0ywfRMEwNvbqY/kXPLynNvEiCL7sCCeN5LLsJJ -wx3tFvYk9CcbXFcx3FXuqB5vbKziRcxXV4p1VxngtViZSTYxPDMBbRZKzbgqg4SG -m/lg0h9tkQPTYKbVPZrdd5A9NaSfD171UkRpucC63M9933zZxKyGIjK8e2uR73r4 -F2iw4lNVYC2vPsKD2NkJK/DAZNuHi5HMkesE/Xa0lZrmFAYb1TQdvtj/dBxThZng -WVJKYe2InmtJiUZ+IFrZ50rlau7SZRFDAgMBAAGjYzBhMA4GA1UdDwEB/wQEAwIB -BjAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBTvkUz1pcMw6C8I6tNxIqSSaHh0 -2TAfBgNVHSMEGDAWgBTvkUz1pcMw6C8I6tNxIqSSaHh02TANBgkqhkiG9w0BAQsF -AAOCAgEAfj1U2iJdGlg+O1QnurrMyOMaauo++RLrVl89UM7g6kgmJs95Vn6RHJk/ -0KGRHCwPT5iVWVO90CLYiF2cN/z7ZMF4jIuaYAnq1fohX9B0ZedQxb8uuQsLrbWw -F6YSjNRieOpWauwK0kDDPAUwPk2Ut59KA9N9J0u2/kTO+hkzGm2kQtHdzMjI1xZS -g081lLMSVX3l4kLr5JyTCcBMWwerx20RoFAXlCOotQqSD7J6wWAsOMwaplv/8gzj -qh8c3LigkyfeY+N/IZ865Z764BNqdeuWXGKRlI5nU7aJ+BIJy29SWwNyhlCVCNSN -h4YVH5Uk2KRvms6knZtt0rJ2BobGVgjF6wnaNsIbW0G+YSrjcOa4pvi2WsS9Iff/ -ql+hbHY5ZtbqTFXhADObE5hjyW/QASAJN1LnDE8+zbz1X5YnpyACleAu6AdBBR8V -btaw5BngDwKTACdyxYvRVB9dSsNAl35VpnzBMwQUAR1JIGkLGZOdblgi90AMRgwj -Y/M50n92Uaf0yKHxDHYiI0ZSKS3io0EHVmmY0gUJvGnHWmHNj4FgFU2A3ZDifcRQ -8ow7bkrHxuaAKzyBvBGAFhAn1/DNP3nMcyrDflOR1m749fPH0FFNjkulW+YZFzvW -gQncItzujrnEj1PhZ7szuIgVRs/taTX/dQ1G885x4cVrhkIGuUE= ------END CERTIFICATE----- - # Issuer: CN=OISTE WISeKey Global Root GB CA O=WISeKey OU=OISTE Foundation Endorsed # Subject: CN=OISTE WISeKey Global Root GB CA O=WISeKey OU=OISTE Foundation Endorsed # Label: "OISTE WISeKey Global Root GB CA" diff --git a/libs/cloudscraper/user_agent/__init__.py b/libs/cloudscraper/user_agent/__init__.py index 45190fbe4..480c55e5f 100644 --- a/libs/cloudscraper/user_agent/__init__.py +++ b/libs/cloudscraper/user_agent/__init__.py @@ -21,10 +21,8 @@ class User_Agent(): def loadUserAgent(self, *args, **kwargs): browser = kwargs.pop('browser', 'chrome') - user_agents = json.load( - open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r'), - object_pairs_hook=OrderedDict - ) + with open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r') as file: + user_agents = json.load(file, object_pairs_hook=OrderedDict) if not user_agents.get(browser): logging.error('Sorry "{}" browser User-Agent was not found.'.format(browser)) diff --git a/libs/deathbycaptcha.py b/libs/deathbycaptcha.py index 3c2fafb77..05aa8f68d 100644 --- a/libs/deathbycaptcha.py +++ b/libs/deathbycaptcha.py @@ -80,7 +80,6 @@ import sys import threading import time import urllib -import urllib2 try: from json import read as json_decode, write as json_encode except ImportError: @@ -146,7 +145,7 @@ class Client(object): def _log(self, cmd, msg=''): if self.is_verbose: - print '%d %s %s' % (time.time(), cmd, msg.rstrip()) + print('%d %s %s' % (time.time(), cmd, msg.rstrip())) return self def close(self): @@ -251,7 +250,7 @@ class HttpClient(Client): data=payload, headers=headers )).read() - except urllib2.HTTPError, err: + except urllib2.HTTPError as err: if 403 == err.code: raise AccessDeniedException('Access denied, please check' ' your credentials and/or balance') @@ -357,7 +356,7 @@ class SocketClient(Client): self.socket.settimeout(0) try: self.socket.connect(host) - except socket.error, err: + except socket.error as err: if (err.args[0] not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): 
self.close() @@ -392,7 +391,7 @@ class SocketClient(Client): raise IOError('recv(): connection lost') else: response += s - except socket.error, err: + except socket.error as err: if (err.args[0] not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)): raise err @@ -416,10 +415,10 @@ class SocketClient(Client): try: sock = self.connect() response = self._sendrecv(sock, request) - except IOError, err: + except IOError as err: sys.stderr.write(str(err) + "\n") self.close() - except socket.error, err: + except socket.error as err: sys.stderr.write(str(err) + "\n") self.close() raise IOError('Connection refused') @@ -493,20 +492,20 @@ if '__main__' == __name__: client = SocketClient(sys.argv[1], sys.argv[2]) client.is_verbose = True - print 'Your balance is %s US cents' % client.get_balance() + print('Your balance is %s US cents' % client.get_balance()) for fn in sys.argv[3:]: try: # Put your CAPTCHA image file name or file-like object, and optional # solving timeout (in seconds) here: captcha = client.decode(fn, DEFAULT_TIMEOUT) - except Exception, e: + except Exception as e: sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, )) captcha = None if captcha: - print 'CAPTCHA %d solved: %s' % \ - (captcha['captcha'], captcha['text']) + print('CAPTCHA %d solved: %s' % \ + (captcha['captcha'], captcha['text'])) # Report as incorrectly solved if needed. Make sure the CAPTCHA was # in fact incorrectly solved! diff --git a/libs/decorator.py b/libs/decorator.py index 44303eedc..34fd527c9 100644 --- a/libs/decorator.py +++ b/libs/decorator.py @@ -40,7 +40,7 @@ import operator import itertools import collections -__version__ = '4.3.0' +__version__ = '4.4.0' if sys.version >= '3': from inspect import getfullargspec @@ -65,6 +65,12 @@ except AttributeError: # let's assume there are no coroutine functions in old Python def iscoroutinefunction(f): return False +try: + from inspect import isgeneratorfunction +except ImportError: + # assume no generator function in old Python versions + def isgeneratorfunction(caller): + return False DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(') @@ -173,7 +179,8 @@ class FunctionMaker(object): # Ensure each generated function has a unique filename for profilers # (such as cProfile) that depend on the tuple of (<filename>, # <definition line>, <function name>) being unique. - filename = '<decorator-gen-%d>' % (next(self._compile_count),) + filename = '<%s:decorator-gen-%d>' % ( + __file__, next(self._compile_count)) try: code = compile(src, filename, 'single') exec(code, evaldict) @@ -218,6 +225,8 @@ class FunctionMaker(object): def decorate(func, caller, extras=()): """ decorate(func, caller) decorates a function using a caller. + If the caller is a generator function, the resulting function + will be a generator function. 
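The isgeneratorfunction branch being added here is the decorator 4.4 feature that docstring describes: when the caller is itself a generator function, the generated wrapper yields through it, so decorated generator functions stay generator functions. A minimal sketch:

from decorator import decorator

@decorator
def traced(func, *args, **kw):
    # the caller is a generator function, so anything it decorates
    # remains a generator function
    for item in func(*args, **kw):
        yield item

@traced
def countdown(n):
    while n:
        yield n
        n -= 1

list(countdown(3))   # [3, 2, 1]

On Python 3.5 specifically, the code special-cases iscoroutinefunction() because isgeneratorfunction() also returns True for coroutines on that version.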
""" evaldict = dict(_call_=caller, _func_=func) es = '' @@ -225,9 +234,23 @@ def decorate(func, caller, extras=()): ex = '_e%d_' % i evaldict[ex] = extra es += ex + ', ' - fun = FunctionMaker.create( - func, "return _call_(_func_, %s%%(shortsignature)s)" % es, - evaldict, __wrapped__=func) + + if '3.5' <= sys.version < '3.6': + # with Python 3.5 isgeneratorfunction returns True for all coroutines + # however we know that it is NOT possible to have a generator + # coroutine in python 3.5: PEP525 was not there yet + generatorcaller = isgeneratorfunction( + caller) and not iscoroutinefunction(caller) + else: + generatorcaller = isgeneratorfunction(caller) + if generatorcaller: + fun = FunctionMaker.create( + func, "for res in _call_(_func_, %s%%(shortsignature)s):\n" + " yield res" % es, evaldict, __wrapped__=func) + else: + fun = FunctionMaker.create( + func, "return _call_(_func_, %s%%(shortsignature)s)" % es, + evaldict, __wrapped__=func) if hasattr(func, '__qualname__'): fun.__qualname__ = func.__qualname__ return fun @@ -261,12 +284,12 @@ def decorator(caller, _func=None): doc = caller.__call__.__doc__ evaldict = dict(_call=caller, _decorate_=decorate) dec = FunctionMaker.create( - '%s(%s func)' % (name, defaultargs), + '%s(func, %s)' % (name, defaultargs), 'if func is None: return lambda func: _decorate_(func, _call, (%s))\n' 'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs), evaldict, doc=doc, module=caller.__module__, __wrapped__=caller) if defaults: - dec.__defaults__ = defaults + (None,) + dec.__defaults__ = (None,) + defaults return dec diff --git a/libs/dogpile/__init__.py b/libs/dogpile/__init__.py index 99c384364..fc8fd4524 100644 --- a/libs/dogpile/__init__.py +++ b/libs/dogpile/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.6.5' +__version__ = '0.7.1' from .lock import Lock # noqa from .lock import NeedRegenerationException # noqa diff --git a/libs/dogpile/cache/region.py b/libs/dogpile/cache/region.py index 1896cbd81..261a8db48 100644 --- a/libs/dogpile/cache/region.py +++ b/libs/dogpile/cache/region.py @@ -10,8 +10,9 @@ from ..util import compat import time import datetime from numbers import Number -from functools import wraps +from functools import wraps, partial import threading +from decorator import decorate _backend_loader = PluginLoader("dogpile.cache") register_backend = _backend_loader.register @@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy): class CacheRegion(object): - """A front end to a particular cache backend. + r"""A front end to a particular cache backend. :param name: Optional, a string name for the region. This isn't used internally @@ -484,6 +485,26 @@ class CacheRegion(object): else: return self._LockWrapper() + # cached value + _actual_backend = None + + @property + def actual_backend(self): + """Return the ultimate backend underneath any proxies. + + The backend might be the result of one or more ``proxy.wrap`` + applications. If so, derive the actual underlying backend. + + .. versionadded:: 0.6.6 + + """ + if self._actual_backend is None: + _backend = self.backend + while hasattr(_backend, 'proxied'): + _backend = _backend.proxied + self._actual_backend = _backend + return self._actual_backend + def invalidate(self, hard=True): """Invalidate this :class:`.CacheRegion`. 
@@ -723,7 +744,8 @@ class CacheRegion(object): ] def get_or_create( - self, key, creator, expiration_time=None, should_cache_fn=None): + self, key, creator, expiration_time=None, should_cache_fn=None, + creator_args=None): """Return a cached value based on the given key. If the value does not exist or is considered to be expired @@ -759,6 +781,11 @@ class CacheRegion(object): :param creator: function which creates a new value. + :param creator_args: optional tuple of (args, kwargs) that will be + passed to the creator function if present. + + .. versionadded:: 0.7.0 + :param expiration_time: optional expiration time which will overide the expiration time already configured on this :class:`.CacheRegion` if not None. To set no expiration, use the value -1. @@ -799,7 +826,7 @@ class CacheRegion(object): value = self.backend.get(key) if (value is NO_VALUE or value.metadata['v'] != value_version or self.region_invalidator.is_hard_invalidated( - value.metadata["ct"])): + value.metadata["ct"])): raise NeedRegenerationException() ct = value.metadata["ct"] if self.region_invalidator.is_soft_invalidated(ct): @@ -808,7 +835,10 @@ class CacheRegion(object): return value.payload, ct def gen_value(): - created_value = creator() + if creator_args: + created_value = creator(*creator_args[0], **creator_args[1]) + else: + created_value = creator() value = self._value(created_value) if not should_cache_fn or \ @@ -831,8 +861,13 @@ class CacheRegion(object): if self.async_creation_runner: def async_creator(mutex): - return self.async_creation_runner( - self, orig_key, creator, mutex) + if creator_args: + @wraps(creator) + def go(): + return creator(*creator_args[0], **creator_args[1]) + else: + go = creator + return self.async_creation_runner(self, orig_key, go, mutex) else: async_creator = None @@ -896,7 +931,7 @@ class CacheRegion(object): if (value is NO_VALUE or value.metadata['v'] != value_version or self.region_invalidator.is_hard_invalidated( - value.metadata['v'])): + value.metadata['ct'])): # dogpile.core understands a 0 here as # "the value is not available", e.g. # _has_value() will return False. 
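Two of the CacheRegion additions shown above are easiest to see side by side: actual_backend (0.6.6) walks the .proxied chain created by wrap=, and creator_args (0.7.0) forwards an (args, kwargs) tuple to the creator, including through the async_creation_runner path via the wrapped go(). A sketch using the stock memory backend, with ProxyBackend as a bare identity wrapper:

from dogpile.cache import make_region
from dogpile.cache.proxy import ProxyBackend

region = make_region().configure('dogpile.cache.memory', wrap=[ProxyBackend])

region.backend          # the ProxyBackend wrapper
region.actual_backend   # the MemoryBackend underneath it

def load_user(user_id, include_stats=False):
    # stand-in for an expensive lookup; the name and shape are hypothetical
    return {'id': user_id, 'stats': include_stats}

value = region.get_or_create(
    'user:42', load_user,
    expiration_time=3600,
    creator_args=((42,), {'include_stats': True}),
)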
@@ -1228,26 +1263,31 @@ class CacheRegion(object): if function_key_generator is None: function_key_generator = self.function_key_generator - def decorator(fn): + def get_or_create_for_user_func(key_generator, user_func, *arg, **kw): + key = key_generator(*arg, **kw) + + timeout = expiration_time() if expiration_time_is_callable \ + else expiration_time + return self.get_or_create(key, user_func, timeout, + should_cache_fn, (arg, kw)) + + def cache_decorator(user_func): if to_str is compat.string_type: # backwards compatible - key_generator = function_key_generator(namespace, fn) + key_generator = function_key_generator(namespace, user_func) else: key_generator = function_key_generator( - namespace, fn, + namespace, user_func, to_str=to_str) - @wraps(fn) - def decorate(*arg, **kw): + def refresh(*arg, **kw): + """ + Like invalidate, but regenerates the value instead + """ key = key_generator(*arg, **kw) - - @wraps(fn) - def creator(): - return fn(*arg, **kw) - timeout = expiration_time() if expiration_time_is_callable \ - else expiration_time - return self.get_or_create(key, creator, timeout, - should_cache_fn) + value = user_func(*arg, **kw) + self.set(key, value) + return value def invalidate(*arg, **kw): key = key_generator(*arg, **kw) @@ -1261,20 +1301,18 @@ class CacheRegion(object): key = key_generator(*arg, **kw) return self.get(key) - def refresh(*arg, **kw): - key = key_generator(*arg, **kw) - value = fn(*arg, **kw) - self.set(key, value) - return value + user_func.set = set_ + user_func.invalidate = invalidate + user_func.get = get + user_func.refresh = refresh + user_func.original = user_func - decorate.set = set_ - decorate.invalidate = invalidate - decorate.refresh = refresh - decorate.get = get - decorate.original = fn + # Use `decorate` to preserve the signature of :param:`user_func`. 
- return decorate - return decorator + return decorate(user_func, partial( + get_or_create_for_user_func, key_generator)) + + return cache_decorator def cache_multi_on_arguments( self, namespace=None, expiration_time=None, @@ -1402,50 +1440,49 @@ class CacheRegion(object): if function_multi_key_generator is None: function_multi_key_generator = self.function_multi_key_generator - def decorator(fn): - key_generator = function_multi_key_generator( - namespace, fn, - to_str=to_str) - - @wraps(fn) - def decorate(*arg, **kw): - cache_keys = arg - keys = key_generator(*arg, **kw) - key_lookup = dict(zip(keys, cache_keys)) - - @wraps(fn) - def creator(*keys_to_create): - return fn(*[key_lookup[k] for k in keys_to_create]) - - timeout = expiration_time() if expiration_time_is_callable \ - else expiration_time + def get_or_create_for_user_func(key_generator, user_func, *arg, **kw): + cache_keys = arg + keys = key_generator(*arg, **kw) + key_lookup = dict(zip(keys, cache_keys)) + + @wraps(user_func) + def creator(*keys_to_create): + return user_func(*[key_lookup[k] for k in keys_to_create]) + + timeout = expiration_time() if expiration_time_is_callable \ + else expiration_time + + if asdict: + def dict_create(*keys): + d_values = creator(*keys) + return [ + d_values.get(key_lookup[k], NO_VALUE) + for k in keys] + + def wrap_cache_fn(value): + if value is NO_VALUE: + return False + elif not should_cache_fn: + return True + else: + return should_cache_fn(value) + + result = self.get_or_create_multi( + keys, dict_create, timeout, wrap_cache_fn) + result = dict( + (k, v) for k, v in zip(cache_keys, result) + if v is not NO_VALUE) + else: + result = self.get_or_create_multi( + keys, creator, timeout, + should_cache_fn) - if asdict: - def dict_create(*keys): - d_values = creator(*keys) - return [ - d_values.get(key_lookup[k], NO_VALUE) - for k in keys] - - def wrap_cache_fn(value): - if value is NO_VALUE: - return False - elif not should_cache_fn: - return True - else: - return should_cache_fn(value) - - result = self.get_or_create_multi( - keys, dict_create, timeout, wrap_cache_fn) - result = dict( - (k, v) for k, v in zip(cache_keys, result) - if v is not NO_VALUE) - else: - result = self.get_or_create_multi( - keys, creator, timeout, - should_cache_fn) + return result - return result + def cache_decorator(user_func): + key_generator = function_multi_key_generator( + namespace, user_func, + to_str=to_str) def invalidate(*arg): keys = key_generator(*arg) @@ -1466,7 +1503,7 @@ class CacheRegion(object): def refresh(*arg): keys = key_generator(*arg) - values = fn(*arg) + values = user_func(*arg) if asdict: self.set_multi( dict(zip(keys, [values[a] for a in arg])) @@ -1478,13 +1515,18 @@ class CacheRegion(object): ) return values - decorate.set = set_ - decorate.invalidate = invalidate - decorate.refresh = refresh - decorate.get = get + user_func.set = set_ + user_func.invalidate = invalidate + user_func.refresh = refresh + user_func.get = get + + # Use `decorate` to preserve the signature of :param:`user_func`. 
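Routing both decorators through decorate(user_func, partial(get_or_create_for_user_func, key_generator)) in the return that follows is what preserves the wrapped function's signature, while the set/invalidate/get/refresh helpers attached just above stay reachable on it:

import inspect
from dogpile.cache import make_region

region = make_region().configure('dogpile.cache.memory')

@region.cache_on_arguments()
def add(x, y):
    return x + y

add(2, 3)                    # computed, then cached
add(2, 3)                    # served from the cache
add.invalidate(2, 3)         # drop the cached entry
add.refresh(2, 3)            # recompute and re-store
str(inspect.signature(add))  # '(x, y)', kept intact by decorate()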
+ + return decorate(user_func, partial(get_or_create_for_user_func, key_generator)) + + return cache_decorator + - return decorate - return decorator def make_region(*arg, **kw): diff --git a/libs/dogpile/cache/util.py b/libs/dogpile/cache/util.py index 4b56f6429..16bcd1c97 100644 --- a/libs/dogpile/cache/util.py +++ b/libs/dogpile/cache/util.py @@ -1,5 +1,4 @@ from hashlib import sha1 -import inspect from ..util import compat from ..util import langhelpers @@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - args = inspect.getargspec(fn) + args = compat.inspect_getargspec(fn) has_self = args[0] and args[0][0] in ('self', 'cls') def generate_key(*args, **kw): @@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - args = inspect.getargspec(fn) + args = compat.inspect_getargspec(fn) has_self = args[0] and args[0][0] in ('self', 'cls') def generate_keys(*args, **kw): @@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type): else: namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace) - argspec = inspect.getargspec(fn) + argspec = compat.inspect_getargspec(fn) default_list = list(argspec.defaults or []) # Reverse the list, as we want to compare the argspec by negative index, # meaning default_list[0] should be args[-1], which works well with diff --git a/libs/dogpile/lock.py b/libs/dogpile/lock.py index 29f342d84..2ac22dcfe 100644 --- a/libs/dogpile/lock.py +++ b/libs/dogpile/lock.py @@ -69,11 +69,10 @@ class Lock(object): """Return true if the expiration time is reached, or no value is available.""" - return not self._has_value(createdtime) or \ - ( - self.expiretime is not None and - time.time() - createdtime > self.expiretime - ) + return not self._has_value(createdtime) or ( + self.expiretime is not None and + time.time() - createdtime > self.expiretime + ) def _has_value(self, createdtime): """Return true if the creation function has proceeded @@ -91,68 +90,100 @@ class Lock(object): value = NOT_REGENERATED createdtime = -1 - generated = self._enter_create(createdtime) + generated = self._enter_create(value, createdtime) if generated is not NOT_REGENERATED: generated, createdtime = generated return generated elif value is NOT_REGENERATED: + # we called upon the creator, and it said that it + # didn't regenerate. this typically means another + # thread is running the creation function, and that the + # cache should still have a value. However, + # we don't have a value at all, which is unusual since we just + # checked for it, so check again (TODO: is this a real codepath?) 
try: value, createdtime = value_fn() return value except NeedRegenerationException: - raise Exception("Generation function should " - "have just been called by a concurrent " - "thread.") + raise Exception( + "Generation function should " + "have just been called by a concurrent " + "thread.") else: return value - def _enter_create(self, createdtime): - + def _enter_create(self, value, createdtime): if not self._is_expired(createdtime): return NOT_REGENERATED - async = False + _async = False if self._has_value(createdtime): + has_value = True if not self.mutex.acquire(False): - log.debug("creation function in progress " - "elsewhere, returning") + log.debug( + "creation function in progress " + "elsewhere, returning") return NOT_REGENERATED else: + has_value = False log.debug("no value, waiting for create lock") self.mutex.acquire() try: log.debug("value creation lock %r acquired" % self.mutex) - # see if someone created the value already - try: - value, createdtime = self.value_and_created_fn() - except NeedRegenerationException: - pass - else: - if not self._is_expired(createdtime): - log.debug("value already present") - return value, createdtime - elif self.async_creator: - log.debug("Passing creation lock to async runner") - self.async_creator(self.mutex) - async = True - return value, createdtime - - log.debug("Calling creation function") - created = self.creator() - return created + if not has_value: + # we entered without a value, or at least with "creationtime == + # 0". Run the "getter" function again, to see if another + # thread has already generated the value while we waited on the + # mutex, or if the caller is otherwise telling us there is a + # value already which allows us to use async regeneration. (the + # latter is used by the multi-key routine). + try: + value, createdtime = self.value_and_created_fn() + except NeedRegenerationException: + # nope, nobody created the value, we're it. + # we must create it right now + pass + else: + has_value = True + # caller is telling us there is a value and that we can + # use async creation if it is expired. + if not self._is_expired(createdtime): + # it's not expired, return it + log.debug("Concurrent thread created the value") + return value, createdtime + + # otherwise it's expired, call creator again + + if has_value and self.async_creator: + # we have a value we can return, safe to use async_creator + log.debug("Passing creation lock to async runner") + + # so...run it! 
+ self.async_creator(self.mutex) + _async = True + + # and return the expired value for now + return value, createdtime + + # it's expired, and it's our turn to create it synchronously, *or*, + # there's no value at all, and we have to create it synchronously + log.debug( + "Calling creation function for %s value", + "not-yet-present" if not has_value else + "previously expired" + ) + return self.creator() finally: - if not async: + if not _async: self.mutex.release() log.debug("Released creation lock") - def __enter__(self): return self._enter() def __exit__(self, type, value, traceback): pass - diff --git a/libs/dogpile/util/compat.py b/libs/dogpile/util/compat.py index d29bb1dac..198c76276 100644 --- a/libs/dogpile/util/compat.py +++ b/libs/dogpile/util/compat.py @@ -51,11 +51,33 @@ else: import thread # noqa +if py3k: + import collections + ArgSpec = collections.namedtuple( + "ArgSpec", + ["args", "varargs", "keywords", "defaults"]) + + from inspect import getfullargspec as inspect_getfullargspec + + def inspect_getargspec(func): + return ArgSpec( + *inspect_getfullargspec(func)[0:4] + ) +else: + from inspect import getargspec as inspect_getargspec # noqa + if py3k or jython: import pickle else: import cPickle as pickle # noqa +if py3k: + def read_config_file(config, fileobj): + return config.read_file(fileobj) +else: + def read_config_file(config, fileobj): + return config.readfp(fileobj) + def timedelta_total_seconds(td): if py27: diff --git a/libs/dogpile/util/nameregistry.py b/libs/dogpile/util/nameregistry.py index a5102b238..7087f7cd6 100644 --- a/libs/dogpile/util/nameregistry.py +++ b/libs/dogpile/util/nameregistry.py @@ -50,7 +50,7 @@ class NameRegistry(object): self.creator = creator def get(self, identifier, *args, **kw): - """Get and possibly create the value. + r"""Get and possibly create the value. :param identifier: Hash key for the value. 
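The compat shims in the hunk above give dogpile one spelling for APIs that moved in Python 3: inspect_getargspec() rebuilds the old four-field ArgSpec on top of getfullargspec(), and read_config_file() papers over the configparser readfp()/read_file() rename. A quick sketch of the argspec shim:

from dogpile.util import compat

def creator(key, expiration_time=300, *args, **kw):
    return key

spec = compat.inspect_getargspec(creator)
spec.args       # ['key', 'expiration_time']
spec.varargs    # 'args'
spec.keywords   # 'kw'
spec.defaults   # (300,)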
If the creation function is called, this identifier @@ -75,10 +75,12 @@ class NameRegistry(object): if identifier in self._values: return self._values[identifier] else: - self._values[identifier] = value = self.creator(identifier, *args, **kw) + self._values[identifier] = value = self.creator( + identifier, *args, **kw) return value except KeyError: - self._values[identifier] = value = self.creator(identifier, *args, **kw) + self._values[identifier] = value = self.creator( + identifier, *args, **kw) return value finally: self._mutex.release() diff --git a/libs/dogpile/util/readwrite_lock.py b/libs/dogpile/util/readwrite_lock.py index 2196ed7d7..9b953edb8 100644 --- a/libs/dogpile/util/readwrite_lock.py +++ b/libs/dogpile/util/readwrite_lock.py @@ -23,7 +23,7 @@ class ReadWriteMutex(object): def __init__(self): # counts how many asynchronous methods are executing - self.async = 0 + self.async_ = 0 # pointer to thread that is the current sync operation self.current_sync_operation = None @@ -31,7 +31,7 @@ class ReadWriteMutex(object): # condition object to lock on self.condition = threading.Condition(threading.Lock()) - def acquire_read_lock(self, wait = True): + def acquire_read_lock(self, wait=True): """Acquire the 'read' lock.""" self.condition.acquire() try: @@ -45,7 +45,7 @@ class ReadWriteMutex(object): if self.current_sync_operation is not None: return False - self.async += 1 + self.async_ += 1 log.debug("%s acquired read lock", self) finally: self.condition.release() @@ -57,23 +57,23 @@ class ReadWriteMutex(object): """Release the 'read' lock.""" self.condition.acquire() try: - self.async -= 1 + self.async_ -= 1 # check if we are the last asynchronous reader thread # out the door. - if self.async == 0: + if self.async_ == 0: # yes. so if a sync operation is waiting, notifyAll to wake # it up if self.current_sync_operation is not None: self.condition.notifyAll() - elif self.async < 0: + elif self.async_ < 0: raise LockError("Synchronizer error - too many " "release_read_locks called") log.debug("%s released read lock", self) finally: self.condition.release() - def acquire_write_lock(self, wait = True): + def acquire_write_lock(self, wait=True): """Acquire the 'write' lock.""" self.condition.acquire() try: @@ -96,7 +96,7 @@ class ReadWriteMutex(object): self.current_sync_operation = threading.currentThread() # now wait again for asyncs to finish - if self.async > 0: + if self.async_ > 0: if wait: # wait self.condition.wait() diff --git a/libs/fcache/cache.py b/libs/fcache/cache.py index 695f916c3..e1510c233 100644 --- a/libs/fcache/cache.py +++ b/libs/fcache/cache.py @@ -4,13 +4,9 @@ import os import pickle import shutil import tempfile -import traceback -import hashlib import appdirs -from scandir import scandir, scandir_generic as _scandir_generic - try: from collections.abc import MutableMapping unicode = str @@ -90,7 +86,7 @@ class FileCache(MutableMapping): """ def __init__(self, appname, flag='c', mode=0o666, keyencoding='utf-8', - serialize=True, app_cache_dir=None, key_file_ext=".txt"): + serialize=True, app_cache_dir=None): """Initialize a :class:`FileCache` object.""" if not isinstance(flag, str): raise TypeError("flag must be str not '{}'".format(type(flag))) @@ -131,7 +127,6 @@ class FileCache(MutableMapping): self._mode = mode self._keyencoding = keyencoding self._serialize = serialize - self.key_file_ext = key_file_ext def _parse_appname(self, appname): """Splits an appname into the appname and subcache components.""" @@ -185,16 +180,7 @@ class FileCache(MutableMapping): 
self._sync = True for ekey in self._buffer: filename = self._key_to_filename(ekey) - try: - self._write_to_file(filename, self._buffer[ekey]) - except: - logger.error("Couldn't write content from %r to cache file: %r: %s", ekey, filename, - traceback.format_exc()) - try: - self.__write_to_file(filename + self.key_file_ext, ekey) - except: - logger.error("Couldn't write content from %r to cache file: %r: %s", ekey, filename, - traceback.format_exc()) + self._write_to_file(filename, self._buffer[ekey]) self._buffer.clear() self._sync = False @@ -203,7 +189,8 @@ class FileCache(MutableMapping): raise ValueError("invalid operation on closed cache") def _encode_key(self, key): - """ + """Encode key using *hex_codec* for constructing a cache filename. + Keys are implicitly converted to :class:`bytes` if passed as :class:`str`. @@ -212,15 +199,16 @@ class FileCache(MutableMapping): key = key.encode(self._keyencoding) elif not isinstance(key, bytes): raise TypeError("key must be bytes or str") - return key.decode(self._keyencoding) + return codecs.encode(key, 'hex_codec').decode(self._keyencoding) def _decode_key(self, key): - """ + """Decode key using hex_codec to retrieve the original key. + Keys are returned as :class:`str` if serialization is enabled. Keys are returned as :class:`bytes` if serialization is disabled. """ - bkey = key.encode(self._keyencoding) + bkey = codecs.decode(key.encode(self._keyencoding), 'hex_codec') return bkey.decode(self._keyencoding) if self._serialize else bkey def _dumps(self, value): @@ -231,27 +219,19 @@ class FileCache(MutableMapping): def _key_to_filename(self, key): """Convert an encoded key to an absolute cache filename.""" - if isinstance(key, unicode): - key = key.encode(self._keyencoding) - return os.path.join(self.cache_dir, hashlib.md5(key).hexdigest()) + return os.path.join(self.cache_dir, key) def _filename_to_key(self, absfilename): """Convert an absolute cache filename to a key name.""" - hkey_hdr_fn = absfilename + self.key_file_ext - if os.path.isfile(hkey_hdr_fn): - with open(hkey_hdr_fn, 'rb') as f: - key = f.read() - return key.decode(self._keyencoding) if self._serialize else key + return os.path.split(absfilename)[1] - def _all_filenames(self, scandir_generic=True): + def _all_filenames(self): """Return a list of absolute cache filenames""" - _scandir = _scandir_generic if scandir_generic else scandir try: - for entry in _scandir(self.cache_dir): - if entry.is_file(follow_symlinks=False) and not entry.name.endswith(self.key_file_ext): - yield os.path.join(self.cache_dir, entry.name) + return [os.path.join(self.cache_dir, filename) for filename in + os.listdir(self.cache_dir)] except (FileNotFoundError, OSError): - raise StopIteration + return [] def _all_keys(self): """Return a list of all encoded key names.""" @@ -261,17 +241,14 @@ class FileCache(MutableMapping): else: return set(file_keys + list(self._buffer)) - def __write_to_file(self, filename, value): + def _write_to_file(self, filename, bytesvalue): """Write bytesvalue to filename.""" fh, tmp = tempfile.mkstemp() with os.fdopen(fh, self._flag) as f: - f.write(value) + f.write(self._dumps(bytesvalue)) rename(tmp, filename) os.chmod(filename, self._mode) - def _write_to_file(self, filename, bytesvalue): - self.__write_to_file(filename, self._dumps(bytesvalue)) - def _read_from_file(self, filename): """Read data from filename.""" try: @@ -288,7 +265,6 @@ class FileCache(MutableMapping): else: filename = self._key_to_filename(ekey) self._write_to_file(filename, value) - 
self.__write_to_file(filename + self.key_file_ext, ekey) def __getitem__(self, key): ekey = self._encode_key(key) @@ -298,9 +274,8 @@ class FileCache(MutableMapping): except KeyError: pass filename = self._key_to_filename(ekey) - if not os.path.isfile(filename): + if filename not in self._all_filenames(): raise KeyError(key) - return self._read_from_file(filename) def __delitem__(self, key): @@ -317,11 +292,6 @@ class FileCache(MutableMapping): except (IOError, OSError): pass - try: - os.remove(filename + self.key_file_ext) - except (IOError, OSError): - pass - def __iter__(self): for key in self._all_keys(): yield self._decode_key(key) @@ -331,10 +301,4 @@ class FileCache(MutableMapping): def __contains__(self, key): ekey = self._encode_key(key) - if not self._sync: - try: - return ekey in self._buffer - except KeyError: - pass - filename = self._key_to_filename(ekey) - return os.path.isfile(filename) + return ekey in self._all_keys() diff --git a/libs/idna/core.py b/libs/idna/core.py index 090c2c18d..104624ad2 100644 --- a/libs/idna/core.py +++ b/libs/idna/core.py @@ -267,10 +267,7 @@ def alabel(label): try: label = label.encode('ascii') - try: - ulabel(label) - except IDNAError: - raise IDNAError('The label {0} is not a valid A-label'.format(label)) + ulabel(label) if not valid_label_length(label): raise IDNAError('Label too long') return label diff --git a/libs/idna/idnadata.py b/libs/idna/idnadata.py index 17974e233..a80c959d2 100644 --- a/libs/idna/idnadata.py +++ b/libs/idna/idnadata.py @@ -1,6 +1,6 @@ # This file is automatically generated by tools/idna-data -__version__ = "10.0.0" +__version__ = "11.0.0" scripts = { 'Greek': ( 0x37000000374, @@ -49,7 +49,7 @@ scripts = { 0x30210000302a, 0x30380000303c, 0x340000004db6, - 0x4e0000009feb, + 0x4e0000009ff0, 0xf9000000fa6e, 0xfa700000fada, 0x200000002a6d7, @@ -62,7 +62,7 @@ scripts = { 'Hebrew': ( 0x591000005c8, 0x5d0000005eb, - 0x5f0000005f5, + 0x5ef000005f5, 0xfb1d0000fb37, 0xfb380000fb3d, 0xfb3e0000fb3f, @@ -248,6 +248,7 @@ joining_types = { 0x6fb: 68, 0x6fc: 68, 0x6ff: 68, + 0x70f: 84, 0x710: 82, 0x712: 68, 0x713: 68, @@ -522,6 +523,7 @@ joining_types = { 0x1875: 68, 0x1876: 68, 0x1877: 68, + 0x1878: 68, 0x1880: 85, 0x1881: 85, 0x1882: 85, @@ -690,6 +692,70 @@ joining_types = { 0x10bad: 68, 0x10bae: 68, 0x10baf: 85, + 0x10d00: 76, + 0x10d01: 68, + 0x10d02: 68, + 0x10d03: 68, + 0x10d04: 68, + 0x10d05: 68, + 0x10d06: 68, + 0x10d07: 68, + 0x10d08: 68, + 0x10d09: 68, + 0x10d0a: 68, + 0x10d0b: 68, + 0x10d0c: 68, + 0x10d0d: 68, + 0x10d0e: 68, + 0x10d0f: 68, + 0x10d10: 68, + 0x10d11: 68, + 0x10d12: 68, + 0x10d13: 68, + 0x10d14: 68, + 0x10d15: 68, + 0x10d16: 68, + 0x10d17: 68, + 0x10d18: 68, + 0x10d19: 68, + 0x10d1a: 68, + 0x10d1b: 68, + 0x10d1c: 68, + 0x10d1d: 68, + 0x10d1e: 68, + 0x10d1f: 68, + 0x10d20: 68, + 0x10d21: 68, + 0x10d22: 82, + 0x10d23: 68, + 0x10f30: 68, + 0x10f31: 68, + 0x10f32: 68, + 0x10f33: 82, + 0x10f34: 68, + 0x10f35: 68, + 0x10f36: 68, + 0x10f37: 68, + 0x10f38: 68, + 0x10f39: 68, + 0x10f3a: 68, + 0x10f3b: 68, + 0x10f3c: 68, + 0x10f3d: 68, + 0x10f3e: 68, + 0x10f3f: 68, + 0x10f40: 68, + 0x10f41: 68, + 0x10f42: 68, + 0x10f43: 68, + 0x10f44: 68, + 0x10f45: 85, + 0x10f51: 68, + 0x10f52: 68, + 0x10f53: 68, + 0x10f54: 82, + 0x110bd: 85, + 0x110cd: 85, 0x1e900: 68, 0x1e901: 68, 0x1e902: 68, @@ -1034,14 +1100,15 @@ codepoint_classes = { 0x52d0000052e, 0x52f00000530, 0x5590000055a, - 0x56100000587, + 0x56000000587, + 0x58800000589, 0x591000005be, 0x5bf000005c0, 0x5c1000005c3, 0x5c4000005c6, 0x5c7000005c8, 0x5d0000005eb, - 
0x5f0000005f3, + 0x5ef000005f3, 0x6100000061b, 0x62000000640, 0x64100000660, @@ -1054,12 +1121,13 @@ codepoint_classes = { 0x7100000074b, 0x74d000007b2, 0x7c0000007f6, + 0x7fd000007fe, 0x8000000082e, 0x8400000085c, 0x8600000086b, 0x8a0000008b5, 0x8b6000008be, - 0x8d4000008e2, + 0x8d3000008e2, 0x8e300000958, 0x96000000964, 0x96600000970, @@ -1077,6 +1145,7 @@ codepoint_classes = { 0x9e0000009e4, 0x9e6000009f2, 0x9fc000009fd, + 0x9fe000009ff, 0xa0100000a04, 0xa0500000a0b, 0xa0f00000a11, @@ -1136,8 +1205,7 @@ codepoint_classes = { 0xbd000000bd1, 0xbd700000bd8, 0xbe600000bf0, - 0xc0000000c04, - 0xc0500000c0d, + 0xc0000000c0d, 0xc0e00000c11, 0xc1200000c29, 0xc2a00000c3a, @@ -1276,7 +1344,7 @@ codepoint_classes = { 0x17dc000017de, 0x17e0000017ea, 0x18100000181a, - 0x182000001878, + 0x182000001879, 0x1880000018ab, 0x18b0000018f6, 0x19000000191f, @@ -1544,11 +1612,11 @@ codepoint_classes = { 0x309d0000309f, 0x30a1000030fb, 0x30fc000030ff, - 0x31050000312f, + 0x310500003130, 0x31a0000031bb, 0x31f000003200, 0x340000004db6, - 0x4e0000009feb, + 0x4e0000009ff0, 0xa0000000a48d, 0xa4d00000a4fe, 0xa5000000a60d, @@ -1655,8 +1723,10 @@ codepoint_classes = { 0xa7a50000a7a6, 0xa7a70000a7a8, 0xa7a90000a7aa, + 0xa7af0000a7b0, 0xa7b50000a7b6, 0xa7b70000a7b8, + 0xa7b90000a7ba, 0xa7f70000a7f8, 0xa7fa0000a828, 0xa8400000a874, @@ -1664,8 +1734,7 @@ codepoint_classes = { 0xa8d00000a8da, 0xa8e00000a8f8, 0xa8fb0000a8fc, - 0xa8fd0000a8fe, - 0xa9000000a92e, + 0xa8fd0000a92e, 0xa9300000a954, 0xa9800000a9c1, 0xa9cf0000a9da, @@ -1743,7 +1812,7 @@ codepoint_classes = { 0x10a0500010a07, 0x10a0c00010a14, 0x10a1500010a18, - 0x10a1900010a34, + 0x10a1900010a36, 0x10a3800010a3b, 0x10a3f00010a40, 0x10a6000010a7d, @@ -1756,6 +1825,11 @@ codepoint_classes = { 0x10b8000010b92, 0x10c0000010c49, 0x10cc000010cf3, + 0x10d0000010d28, + 0x10d3000010d3a, + 0x10f0000010f1d, + 0x10f2700010f28, + 0x10f3000010f51, 0x1100000011047, 0x1106600011070, 0x1107f000110bb, @@ -1763,10 +1837,11 @@ codepoint_classes = { 0x110f0000110fa, 0x1110000011135, 0x1113600011140, + 0x1114400011147, 0x1115000011174, 0x1117600011177, 0x11180000111c5, - 0x111ca000111cd, + 0x111c9000111cd, 0x111d0000111db, 0x111dc000111dd, 0x1120000011212, @@ -1786,7 +1861,7 @@ codepoint_classes = { 0x1132a00011331, 0x1133200011334, 0x113350001133a, - 0x1133c00011345, + 0x1133b00011345, 0x1134700011349, 0x1134b0001134e, 0x1135000011351, @@ -1796,6 +1871,7 @@ codepoint_classes = { 0x1137000011375, 0x114000001144b, 0x114500001145a, + 0x1145e0001145f, 0x11480000114c6, 0x114c7000114c8, 0x114d0000114da, @@ -1807,15 +1883,17 @@ codepoint_classes = { 0x116500001165a, 0x11680000116b8, 0x116c0000116ca, - 0x117000001171a, + 0x117000001171b, 0x1171d0001172c, 0x117300001173a, + 0x118000001183b, 0x118c0000118ea, 0x118ff00011900, 0x11a0000011a3f, 0x11a4700011a48, 0x11a5000011a84, 0x11a8600011a9a, + 0x11a9d00011a9e, 0x11ac000011af9, 0x11c0000011c09, 0x11c0a00011c37, @@ -1831,6 +1909,13 @@ codepoint_classes = { 0x11d3c00011d3e, 0x11d3f00011d48, 0x11d5000011d5a, + 0x11d6000011d66, + 0x11d6700011d69, + 0x11d6a00011d8f, + 0x11d9000011d92, + 0x11d9300011d99, + 0x11da000011daa, + 0x11ee000011ef7, 0x120000001239a, 0x1248000012544, 0x130000001342f, @@ -1845,11 +1930,12 @@ codepoint_classes = { 0x16b5000016b5a, 0x16b6300016b78, 0x16b7d00016b90, + 0x16e6000016e80, 0x16f0000016f45, 0x16f5000016f7f, 0x16f8f00016fa0, 0x16fe000016fe2, - 0x17000000187ed, + 0x17000000187f2, 0x1880000018af3, 0x1b0000001b11f, 0x1b1700001b2fc, diff --git a/libs/idna/package_data.py b/libs/idna/package_data.py index 39c192bae..257e89893 
100644 --- a/libs/idna/package_data.py +++ b/libs/idna/package_data.py @@ -1,2 +1,2 @@ -__version__ = '2.7' +__version__ = '2.8' diff --git a/libs/idna/uts46data.py b/libs/idna/uts46data.py index 79731cb9e..a68ed4c0e 100644 --- a/libs/idna/uts46data.py +++ b/libs/idna/uts46data.py @@ -4,7 +4,7 @@ """IDNA Mapping Table from UTS46.""" -__version__ = "10.0.0" +__version__ = "11.0.0" def _seg_0(): return [ (0x0, '3'), @@ -1029,11 +1029,8 @@ def _seg_9(): (0x556, 'M', u'ֆ'), (0x557, 'X'), (0x559, 'V'), - (0x560, 'X'), - (0x561, 'V'), (0x587, 'M', u'եւ'), - (0x588, 'X'), - (0x589, 'V'), + (0x588, 'V'), (0x58B, 'X'), (0x58D, 'V'), (0x590, 'X'), @@ -1041,15 +1038,15 @@ def _seg_9(): (0x5C8, 'X'), (0x5D0, 'V'), (0x5EB, 'X'), - (0x5F0, 'V'), + (0x5EF, 'V'), (0x5F5, 'X'), + (0x606, 'V'), + (0x61C, 'X'), + (0x61E, 'V'), ] def _seg_10(): return [ - (0x606, 'V'), - (0x61C, 'X'), - (0x61E, 'V'), (0x675, 'M', u'اٴ'), (0x676, 'M', u'وٴ'), (0x677, 'M', u'ۇٴ'), @@ -1064,7 +1061,7 @@ def _seg_10(): (0x7B2, 'X'), (0x7C0, 'V'), (0x7FB, 'X'), - (0x800, 'V'), + (0x7FD, 'V'), (0x82E, 'X'), (0x830, 'V'), (0x83F, 'X'), @@ -1078,7 +1075,7 @@ def _seg_10(): (0x8B5, 'X'), (0x8B6, 'V'), (0x8BE, 'X'), - (0x8D4, 'V'), + (0x8D3, 'V'), (0x8E2, 'X'), (0x8E3, 'V'), (0x958, 'M', u'क़'), @@ -1118,7 +1115,7 @@ def _seg_10(): (0x9E0, 'V'), (0x9E4, 'X'), (0x9E6, 'V'), - (0x9FE, 'X'), + (0x9FF, 'X'), (0xA01, 'V'), (0xA04, 'X'), (0xA05, 'V'), @@ -1147,19 +1144,19 @@ def _seg_10(): (0xA4E, 'X'), (0xA51, 'V'), (0xA52, 'X'), + (0xA59, 'M', u'ਖ਼'), + (0xA5A, 'M', u'ਗ਼'), + (0xA5B, 'M', u'ਜ਼'), ] def _seg_11(): return [ - (0xA59, 'M', u'ਖ਼'), - (0xA5A, 'M', u'ਗ਼'), - (0xA5B, 'M', u'ਜ਼'), (0xA5C, 'V'), (0xA5D, 'X'), (0xA5E, 'M', u'ਫ਼'), (0xA5F, 'X'), (0xA66, 'V'), - (0xA76, 'X'), + (0xA77, 'X'), (0xA81, 'V'), (0xA84, 'X'), (0xA85, 'V'), @@ -1250,16 +1247,14 @@ def _seg_11(): (0xBE6, 'V'), (0xBFB, 'X'), (0xC00, 'V'), - (0xC04, 'X'), - ] - -def _seg_12(): - return [ - (0xC05, 'V'), (0xC0D, 'X'), (0xC0E, 'V'), (0xC11, 'X'), (0xC12, 'V'), + ] + +def _seg_12(): + return [ (0xC29, 'X'), (0xC2A, 'V'), (0xC3A, 'X'), @@ -1278,8 +1273,6 @@ def _seg_12(): (0xC66, 'V'), (0xC70, 'X'), (0xC78, 'V'), - (0xC84, 'X'), - (0xC85, 'V'), (0xC8D, 'X'), (0xC8E, 'V'), (0xC91, 'X'), @@ -1355,10 +1348,6 @@ def _seg_12(): (0xE83, 'X'), (0xE84, 'V'), (0xE85, 'X'), - ] - -def _seg_13(): - return [ (0xE87, 'V'), (0xE89, 'X'), (0xE8A, 'V'), @@ -1366,6 +1355,10 @@ def _seg_13(): (0xE8D, 'V'), (0xE8E, 'X'), (0xE94, 'V'), + ] + +def _seg_13(): + return [ (0xE98, 'X'), (0xE99, 'V'), (0xEA0, 'X'), @@ -1459,10 +1452,6 @@ def _seg_13(): (0x124E, 'X'), (0x1250, 'V'), (0x1257, 'X'), - ] - -def _seg_14(): - return [ (0x1258, 'V'), (0x1259, 'X'), (0x125A, 'V'), @@ -1470,6 +1459,10 @@ def _seg_14(): (0x1260, 'V'), (0x1289, 'X'), (0x128A, 'V'), + ] + +def _seg_14(): + return [ (0x128E, 'X'), (0x1290, 'V'), (0x12B1, 'X'), @@ -1538,7 +1531,7 @@ def _seg_14(): (0x1810, 'V'), (0x181A, 'X'), (0x1820, 'V'), - (0x1878, 'X'), + (0x1879, 'X'), (0x1880, 'V'), (0x18AB, 'X'), (0x18B0, 'V'), @@ -1563,10 +1556,6 @@ def _seg_14(): (0x19DB, 'X'), (0x19DE, 'V'), (0x1A1C, 'X'), - ] - -def _seg_15(): - return [ (0x1A1E, 'V'), (0x1A5F, 'X'), (0x1A60, 'V'), @@ -1574,6 +1563,10 @@ def _seg_15(): (0x1A7F, 'V'), (0x1A8A, 'X'), (0x1A90, 'V'), + ] + +def _seg_15(): + return [ (0x1A9A, 'X'), (0x1AA0, 'V'), (0x1AAE, 'X'), @@ -1667,10 +1660,6 @@ def _seg_15(): (0x1D68, 'M', u'ρ'), (0x1D69, 'M', u'φ'), (0x1D6A, 'M', u'χ'), - ] - -def _seg_16(): - return [ (0x1D6B, 'V'), (0x1D78, 'M', u'н'), (0x1D79, 'V'), @@ 
-1678,6 +1667,10 @@ def _seg_16(): (0x1D9C, 'M', u'c'), (0x1D9D, 'M', u'ɕ'), (0x1D9E, 'M', u'ð'), + ] + +def _seg_16(): + return [ (0x1D9F, 'M', u'ɜ'), (0x1DA0, 'M', u'f'), (0x1DA1, 'M', u'ɟ'), @@ -1771,10 +1764,6 @@ def _seg_16(): (0x1E36, 'M', u'ḷ'), (0x1E37, 'V'), (0x1E38, 'M', u'ḹ'), - ] - -def _seg_17(): - return [ (0x1E39, 'V'), (0x1E3A, 'M', u'ḻ'), (0x1E3B, 'V'), @@ -1782,6 +1771,10 @@ def _seg_17(): (0x1E3D, 'V'), (0x1E3E, 'M', u'ḿ'), (0x1E3F, 'V'), + ] + +def _seg_17(): + return [ (0x1E40, 'M', u'ṁ'), (0x1E41, 'V'), (0x1E42, 'M', u'ṃ'), @@ -1875,10 +1868,6 @@ def _seg_17(): (0x1E9F, 'V'), (0x1EA0, 'M', u'ạ'), (0x1EA1, 'V'), - ] - -def _seg_18(): - return [ (0x1EA2, 'M', u'ả'), (0x1EA3, 'V'), (0x1EA4, 'M', u'ấ'), @@ -1886,6 +1875,10 @@ def _seg_18(): (0x1EA6, 'M', u'ầ'), (0x1EA7, 'V'), (0x1EA8, 'M', u'ẩ'), + ] + +def _seg_18(): + return [ (0x1EA9, 'V'), (0x1EAA, 'M', u'ẫ'), (0x1EAB, 'V'), @@ -1979,10 +1972,6 @@ def _seg_18(): (0x1F0B, 'M', u'ἃ'), (0x1F0C, 'M', u'ἄ'), (0x1F0D, 'M', u'ἅ'), - ] - -def _seg_19(): - return [ (0x1F0E, 'M', u'ἆ'), (0x1F0F, 'M', u'ἇ'), (0x1F10, 'V'), @@ -1990,6 +1979,10 @@ def _seg_19(): (0x1F18, 'M', u'ἐ'), (0x1F19, 'M', u'ἑ'), (0x1F1A, 'M', u'ἒ'), + ] + +def _seg_19(): + return [ (0x1F1B, 'M', u'ἓ'), (0x1F1C, 'M', u'ἔ'), (0x1F1D, 'M', u'ἕ'), @@ -2083,10 +2076,6 @@ def _seg_19(): (0x1F9A, 'M', u'ἢι'), (0x1F9B, 'M', u'ἣι'), (0x1F9C, 'M', u'ἤι'), - ] - -def _seg_20(): - return [ (0x1F9D, 'M', u'ἥι'), (0x1F9E, 'M', u'ἦι'), (0x1F9F, 'M', u'ἧι'), @@ -2094,6 +2083,10 @@ def _seg_20(): (0x1FA1, 'M', u'ὡι'), (0x1FA2, 'M', u'ὢι'), (0x1FA3, 'M', u'ὣι'), + ] + +def _seg_20(): + return [ (0x1FA4, 'M', u'ὤι'), (0x1FA5, 'M', u'ὥι'), (0x1FA6, 'M', u'ὦι'), @@ -2187,10 +2180,6 @@ def _seg_20(): (0x2024, 'X'), (0x2027, 'V'), (0x2028, 'X'), - ] - -def _seg_21(): - return [ (0x202F, '3', u' '), (0x2030, 'V'), (0x2033, 'M', u'′′'), @@ -2198,6 +2187,10 @@ def _seg_21(): (0x2035, 'V'), (0x2036, 'M', u'‵‵'), (0x2037, 'M', u'‵‵‵'), + ] + +def _seg_21(): + return [ (0x2038, 'V'), (0x203C, '3', u'!!'), (0x203D, 'V'), @@ -2291,10 +2284,6 @@ def _seg_21(): (0x2120, 'M', u'sm'), (0x2121, 'M', u'tel'), (0x2122, 'M', u'tm'), - ] - -def _seg_22(): - return [ (0x2123, 'V'), (0x2124, 'M', u'z'), (0x2125, 'V'), @@ -2302,6 +2291,10 @@ def _seg_22(): (0x2127, 'V'), (0x2128, 'M', u'z'), (0x2129, 'V'), + ] + +def _seg_22(): + return [ (0x212A, 'M', u'k'), (0x212B, 'M', u'å'), (0x212C, 'M', u'b'), @@ -2395,10 +2388,6 @@ def _seg_22(): (0x226E, '3'), (0x2270, 'V'), (0x2329, 'M', u'〈'), - ] - -def _seg_23(): - return [ (0x232A, 'M', u'〉'), (0x232B, 'V'), (0x2427, 'X'), @@ -2406,6 +2395,10 @@ def _seg_23(): (0x244B, 'X'), (0x2460, 'M', u'1'), (0x2461, 'M', u'2'), + ] + +def _seg_23(): + return [ (0x2462, 'M', u'3'), (0x2463, 'M', u'4'), (0x2464, 'M', u'5'), @@ -2499,10 +2492,6 @@ def _seg_23(): (0x24CF, 'M', u'z'), (0x24D0, 'M', u'a'), (0x24D1, 'M', u'b'), - ] - -def _seg_24(): - return [ (0x24D2, 'M', u'c'), (0x24D3, 'M', u'd'), (0x24D4, 'M', u'e'), @@ -2510,6 +2499,10 @@ def _seg_24(): (0x24D6, 'M', u'g'), (0x24D7, 'M', u'h'), (0x24D8, 'M', u'i'), + ] + +def _seg_24(): + return [ (0x24D9, 'M', u'j'), (0x24DA, 'M', u'k'), (0x24DB, 'M', u'l'), @@ -2541,13 +2534,9 @@ def _seg_24(): (0x2B76, 'V'), (0x2B96, 'X'), (0x2B98, 'V'), - (0x2BBA, 'X'), - (0x2BBD, 'V'), (0x2BC9, 'X'), (0x2BCA, 'V'), - (0x2BD3, 'X'), - (0x2BEC, 'V'), - (0x2BF0, 'X'), + (0x2BFF, 'X'), (0x2C00, 'M', u'ⰰ'), (0x2C01, 'M', u'ⰱ'), (0x2C02, 'M', u'ⰲ'), @@ -2603,10 +2592,6 @@ def _seg_24(): (0x2C62, 'M', u'ɫ'), (0x2C63, 'M', u'ᵽ'), 
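
Most of the uts46data.py churn from here on is mechanical. The IDNA mapping table is emitted by a generator as a series of _seg_N() functions, each returning a fixed-size slice of entries, presumably to keep any single literal small; inserting or deleting a handful of Unicode 11.0 entries therefore shifts every later slice boundary, which is why hunk after hunk merely relocates a "]" / "def _seg_N():" pair. Each entry maps the start of a codepoint range to a UTS #46 status ('V' valid, 'M' mapped to the given replacement, 'X' disallowed, and so on), so a lookup needs only the last entry whose start is at or below the codepoint. A sketch of that kind of search (illustrative only, not idna's exact accessor):

    from bisect import bisect_right

    # Toy rows in the same shape as uts46data entries: (start, status[, mapping])
    table = [
        (0x0041, 'M', u'a'),   # 'A' maps to 'a' (the real table has a row per letter)
        (0x005B, 'X'),         # '['.. disallowed
        (0x0061, 'V'),         # 'a'.. valid
        (0x007B, 'X'),
    ]

    def lookup(cp):
        # The last entry whose range start is <= cp governs the codepoint.
        idx = bisect_right([row[0] for row in table], cp) - 1
        return table[idx]

    assert lookup(ord('A'))[1:] == ('M', u'a')
    assert lookup(ord('q'))[1] == 'V'
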
(0x2C64, 'M', u'ɽ'), - ] - -def _seg_25(): - return [ (0x2C65, 'V'), (0x2C67, 'M', u'ⱨ'), (0x2C68, 'V'), @@ -2618,6 +2603,10 @@ def _seg_25(): (0x2C6E, 'M', u'ɱ'), (0x2C6F, 'M', u'ɐ'), (0x2C70, 'M', u'ɒ'), + ] + +def _seg_25(): + return [ (0x2C71, 'V'), (0x2C72, 'M', u'ⱳ'), (0x2C73, 'V'), @@ -2707,10 +2696,6 @@ def _seg_25(): (0x2CCD, 'V'), (0x2CCE, 'M', u'ⳏ'), (0x2CCF, 'V'), - ] - -def _seg_26(): - return [ (0x2CD0, 'M', u'ⳑ'), (0x2CD1, 'V'), (0x2CD2, 'M', u'ⳓ'), @@ -2722,6 +2707,10 @@ def _seg_26(): (0x2CD8, 'M', u'ⳙ'), (0x2CD9, 'V'), (0x2CDA, 'M', u'ⳛ'), + ] + +def _seg_26(): + return [ (0x2CDB, 'V'), (0x2CDC, 'M', u'ⳝ'), (0x2CDD, 'V'), @@ -2768,7 +2757,7 @@ def _seg_26(): (0x2DD8, 'V'), (0x2DDF, 'X'), (0x2DE0, 'V'), - (0x2E4A, 'X'), + (0x2E4F, 'X'), (0x2E80, 'V'), (0x2E9A, 'X'), (0x2E9B, 'V'), @@ -2811,10 +2800,6 @@ def _seg_26(): (0x2F20, 'M', u'士'), (0x2F21, 'M', u'夂'), (0x2F22, 'M', u'夊'), - ] - -def _seg_27(): - return [ (0x2F23, 'M', u'夕'), (0x2F24, 'M', u'大'), (0x2F25, 'M', u'女'), @@ -2826,6 +2811,10 @@ def _seg_27(): (0x2F2B, 'M', u'尸'), (0x2F2C, 'M', u'屮'), (0x2F2D, 'M', u'山'), + ] + +def _seg_27(): + return [ (0x2F2E, 'M', u'巛'), (0x2F2F, 'M', u'工'), (0x2F30, 'M', u'己'), @@ -2915,10 +2904,6 @@ def _seg_27(): (0x2F84, 'M', u'至'), (0x2F85, 'M', u'臼'), (0x2F86, 'M', u'舌'), - ] - -def _seg_28(): - return [ (0x2F87, 'M', u'舛'), (0x2F88, 'M', u'舟'), (0x2F89, 'M', u'艮'), @@ -2930,6 +2915,10 @@ def _seg_28(): (0x2F8F, 'M', u'行'), (0x2F90, 'M', u'衣'), (0x2F91, 'M', u'襾'), + ] + +def _seg_28(): + return [ (0x2F92, 'M', u'見'), (0x2F93, 'M', u'角'), (0x2F94, 'M', u'言'), @@ -3019,13 +3008,9 @@ def _seg_28(): (0x309F, 'M', u'より'), (0x30A0, 'V'), (0x30FF, 'M', u'コト'), - ] - -def _seg_29(): - return [ (0x3100, 'X'), (0x3105, 'V'), - (0x312F, 'X'), + (0x3130, 'X'), (0x3131, 'M', u'ᄀ'), (0x3132, 'M', u'ᄁ'), (0x3133, 'M', u'ᆪ'), @@ -3034,6 +3019,10 @@ def _seg_29(): (0x3136, 'M', u'ᆭ'), (0x3137, 'M', u'ᄃ'), (0x3138, 'M', u'ᄄ'), + ] + +def _seg_29(): + return [ (0x3139, 'M', u'ᄅ'), (0x313A, 'M', u'ᆰ'), (0x313B, 'M', u'ᆱ'), @@ -3123,10 +3112,6 @@ def _seg_29(): (0x318F, 'X'), (0x3190, 'V'), (0x3192, 'M', u'一'), - ] - -def _seg_30(): - return [ (0x3193, 'M', u'二'), (0x3194, 'M', u'三'), (0x3195, 'M', u'四'), @@ -3138,6 +3123,10 @@ def _seg_30(): (0x319B, 'M', u'丙'), (0x319C, 'M', u'丁'), (0x319D, 'M', u'天'), + ] + +def _seg_30(): + return [ (0x319E, 'M', u'地'), (0x319F, 'M', u'人'), (0x31A0, 'V'), @@ -3227,10 +3216,6 @@ def _seg_30(): (0x3256, 'M', u'26'), (0x3257, 'M', u'27'), (0x3258, 'M', u'28'), - ] - -def _seg_31(): - return [ (0x3259, 'M', u'29'), (0x325A, 'M', u'30'), (0x325B, 'M', u'31'), @@ -3242,6 +3227,10 @@ def _seg_31(): (0x3261, 'M', u'ᄂ'), (0x3262, 'M', u'ᄃ'), (0x3263, 'M', u'ᄅ'), + ] + +def _seg_31(): + return [ (0x3264, 'M', u'ᄆ'), (0x3265, 'M', u'ᄇ'), (0x3266, 'M', u'ᄉ'), @@ -3331,10 +3320,6 @@ def _seg_31(): (0x32BA, 'M', u'45'), (0x32BB, 'M', u'46'), (0x32BC, 'M', u'47'), - ] - -def _seg_32(): - return [ (0x32BD, 'M', u'48'), (0x32BE, 'M', u'49'), (0x32BF, 'M', u'50'), @@ -3346,6 +3331,10 @@ def _seg_32(): (0x32C5, 'M', u'6月'), (0x32C6, 'M', u'7月'), (0x32C7, 'M', u'8月'), + ] + +def _seg_32(): + return [ (0x32C8, 'M', u'9月'), (0x32C9, 'M', u'10月'), (0x32CA, 'M', u'11月'), @@ -3435,10 +3424,6 @@ def _seg_32(): (0x331E, 'M', u'コーポ'), (0x331F, 'M', u'サイクル'), (0x3320, 'M', u'サンチーム'), - ] - -def _seg_33(): - return [ (0x3321, 'M', u'シリング'), (0x3322, 'M', u'センチ'), (0x3323, 'M', u'セント'), @@ -3450,6 +3435,10 @@ def _seg_33(): (0x3329, 'M', u'ノット'), (0x332A, 'M', u'ハイツ'), (0x332B, 'M', 
u'パーセント'), + ] + +def _seg_33(): + return [ (0x332C, 'M', u'パーツ'), (0x332D, 'M', u'バーレル'), (0x332E, 'M', u'ピアストル'), @@ -3539,10 +3528,6 @@ def _seg_33(): (0x3382, 'M', u'μa'), (0x3383, 'M', u'ma'), (0x3384, 'M', u'ka'), - ] - -def _seg_34(): - return [ (0x3385, 'M', u'kb'), (0x3386, 'M', u'mb'), (0x3387, 'M', u'gb'), @@ -3554,6 +3539,10 @@ def _seg_34(): (0x338D, 'M', u'μg'), (0x338E, 'M', u'mg'), (0x338F, 'M', u'kg'), + ] + +def _seg_34(): + return [ (0x3390, 'M', u'hz'), (0x3391, 'M', u'khz'), (0x3392, 'M', u'mhz'), @@ -3643,10 +3632,6 @@ def _seg_34(): (0x33E6, 'M', u'7日'), (0x33E7, 'M', u'8日'), (0x33E8, 'M', u'9日'), - ] - -def _seg_35(): - return [ (0x33E9, 'M', u'10日'), (0x33EA, 'M', u'11日'), (0x33EB, 'M', u'12日'), @@ -3658,6 +3643,10 @@ def _seg_35(): (0x33F1, 'M', u'18日'), (0x33F2, 'M', u'19日'), (0x33F3, 'M', u'20日'), + ] + +def _seg_35(): + return [ (0x33F4, 'M', u'21日'), (0x33F5, 'M', u'22日'), (0x33F6, 'M', u'23日'), @@ -3673,7 +3662,7 @@ def _seg_35(): (0x3400, 'V'), (0x4DB6, 'X'), (0x4DC0, 'V'), - (0x9FEB, 'X'), + (0x9FF0, 'X'), (0xA000, 'V'), (0xA48D, 'X'), (0xA490, 'V'), @@ -3747,10 +3736,6 @@ def _seg_35(): (0xA692, 'M', u'ꚓ'), (0xA693, 'V'), (0xA694, 'M', u'ꚕ'), - ] - -def _seg_36(): - return [ (0xA695, 'V'), (0xA696, 'M', u'ꚗ'), (0xA697, 'V'), @@ -3762,6 +3747,10 @@ def _seg_36(): (0xA69D, 'M', u'ь'), (0xA69E, 'V'), (0xA6F8, 'X'), + ] + +def _seg_36(): + return [ (0xA700, 'V'), (0xA722, 'M', u'ꜣ'), (0xA723, 'V'), @@ -3851,10 +3840,6 @@ def _seg_36(): (0xA780, 'M', u'ꞁ'), (0xA781, 'V'), (0xA782, 'M', u'ꞃ'), - ] - -def _seg_37(): - return [ (0xA783, 'V'), (0xA784, 'M', u'ꞅ'), (0xA785, 'V'), @@ -3866,6 +3851,10 @@ def _seg_37(): (0xA78E, 'V'), (0xA790, 'M', u'ꞑ'), (0xA791, 'V'), + ] + +def _seg_37(): + return [ (0xA792, 'M', u'ꞓ'), (0xA793, 'V'), (0xA796, 'M', u'ꞗ'), @@ -3893,7 +3882,7 @@ def _seg_37(): (0xA7AC, 'M', u'ɡ'), (0xA7AD, 'M', u'ɬ'), (0xA7AE, 'M', u'ɪ'), - (0xA7AF, 'X'), + (0xA7AF, 'V'), (0xA7B0, 'M', u'ʞ'), (0xA7B1, 'M', u'ʇ'), (0xA7B2, 'M', u'ʝ'), @@ -3903,6 +3892,8 @@ def _seg_37(): (0xA7B6, 'M', u'ꞷ'), (0xA7B7, 'V'), (0xA7B8, 'X'), + (0xA7B9, 'V'), + (0xA7BA, 'X'), (0xA7F7, 'V'), (0xA7F8, 'M', u'ħ'), (0xA7F9, 'M', u'œ'), @@ -3917,8 +3908,6 @@ def _seg_37(): (0xA8CE, 'V'), (0xA8DA, 'X'), (0xA8E0, 'V'), - (0xA8FE, 'X'), - (0xA900, 'V'), (0xA954, 'X'), (0xA95F, 'V'), (0xA97D, 'X'), @@ -3955,10 +3944,6 @@ def _seg_37(): (0xAB5F, 'M', u'ꭒ'), (0xAB60, 'V'), (0xAB66, 'X'), - ] - -def _seg_38(): - return [ (0xAB70, 'M', u'Ꭰ'), (0xAB71, 'M', u'Ꭱ'), (0xAB72, 'M', u'Ꭲ'), @@ -3970,6 +3955,10 @@ def _seg_38(): (0xAB78, 'M', u'Ꭸ'), (0xAB79, 'M', u'Ꭹ'), (0xAB7A, 'M', u'Ꭺ'), + ] + +def _seg_38(): + return [ (0xAB7B, 'M', u'Ꭻ'), (0xAB7C, 'M', u'Ꭼ'), (0xAB7D, 'M', u'Ꭽ'), @@ -4059,10 +4048,6 @@ def _seg_38(): (0xF907, 'M', u'龜'), (0xF909, 'M', u'契'), (0xF90A, 'M', u'金'), - ] - -def _seg_39(): - return [ (0xF90B, 'M', u'喇'), (0xF90C, 'M', u'奈'), (0xF90D, 'M', u'懶'), @@ -4074,6 +4059,10 @@ def _seg_39(): (0xF913, 'M', u'邏'), (0xF914, 'M', u'樂'), (0xF915, 'M', u'洛'), + ] + +def _seg_39(): + return [ (0xF916, 'M', u'烙'), (0xF917, 'M', u'珞'), (0xF918, 'M', u'落'), @@ -4163,10 +4152,6 @@ def _seg_39(): (0xF96C, 'M', u'塞'), (0xF96D, 'M', u'省'), (0xF96E, 'M', u'葉'), - ] - -def _seg_40(): - return [ (0xF96F, 'M', u'說'), (0xF970, 'M', u'殺'), (0xF971, 'M', u'辰'), @@ -4178,6 +4163,10 @@ def _seg_40(): (0xF977, 'M', u'亮'), (0xF978, 'M', u'兩'), (0xF979, 'M', u'凉'), + ] + +def _seg_40(): + return [ (0xF97A, 'M', u'梁'), (0xF97B, 'M', u'糧'), (0xF97C, 'M', u'良'), @@ -4267,10 +4256,6 @@ def _seg_40(): 
(0xF9D0, 'M', u'類'), (0xF9D1, 'M', u'六'), (0xF9D2, 'M', u'戮'), - ] - -def _seg_41(): - return [ (0xF9D3, 'M', u'陸'), (0xF9D4, 'M', u'倫'), (0xF9D5, 'M', u'崙'), @@ -4282,6 +4267,10 @@ def _seg_41(): (0xF9DB, 'M', u'率'), (0xF9DC, 'M', u'隆'), (0xF9DD, 'M', u'利'), + ] + +def _seg_41(): + return [ (0xF9DE, 'M', u'吏'), (0xF9DF, 'M', u'履'), (0xF9E0, 'M', u'易'), @@ -4371,10 +4360,6 @@ def _seg_41(): (0xFA39, 'M', u'塀'), (0xFA3A, 'M', u'墨'), (0xFA3B, 'M', u'層'), - ] - -def _seg_42(): - return [ (0xFA3C, 'M', u'屮'), (0xFA3D, 'M', u'悔'), (0xFA3E, 'M', u'慨'), @@ -4386,6 +4371,10 @@ def _seg_42(): (0xFA44, 'M', u'梅'), (0xFA45, 'M', u'海'), (0xFA46, 'M', u'渚'), + ] + +def _seg_42(): + return [ (0xFA47, 'M', u'漢'), (0xFA48, 'M', u'煮'), (0xFA49, 'M', u'爫'), @@ -4475,10 +4464,6 @@ def _seg_42(): (0xFA9F, 'M', u'犯'), (0xFAA0, 'M', u'猪'), (0xFAA1, 'M', u'瑱'), - ] - -def _seg_43(): - return [ (0xFAA2, 'M', u'甆'), (0xFAA3, 'M', u'画'), (0xFAA4, 'M', u'瘝'), @@ -4490,6 +4475,10 @@ def _seg_43(): (0xFAAA, 'M', u'着'), (0xFAAB, 'M', u'磌'), (0xFAAC, 'M', u'窱'), + ] + +def _seg_43(): + return [ (0xFAAD, 'M', u'節'), (0xFAAE, 'M', u'类'), (0xFAAF, 'M', u'絛'), @@ -4579,10 +4568,6 @@ def _seg_43(): (0xFB38, 'M', u'טּ'), (0xFB39, 'M', u'יּ'), (0xFB3A, 'M', u'ךּ'), - ] - -def _seg_44(): - return [ (0xFB3B, 'M', u'כּ'), (0xFB3C, 'M', u'לּ'), (0xFB3D, 'X'), @@ -4594,6 +4579,10 @@ def _seg_44(): (0xFB43, 'M', u'ףּ'), (0xFB44, 'M', u'פּ'), (0xFB45, 'X'), + ] + +def _seg_44(): + return [ (0xFB46, 'M', u'צּ'), (0xFB47, 'M', u'קּ'), (0xFB48, 'M', u'רּ'), @@ -4683,10 +4672,6 @@ def _seg_44(): (0xFC19, 'M', u'خج'), (0xFC1A, 'M', u'خح'), (0xFC1B, 'M', u'خم'), - ] - -def _seg_45(): - return [ (0xFC1C, 'M', u'سج'), (0xFC1D, 'M', u'سح'), (0xFC1E, 'M', u'سخ'), @@ -4698,6 +4683,10 @@ def _seg_45(): (0xFC24, 'M', u'ضخ'), (0xFC25, 'M', u'ضم'), (0xFC26, 'M', u'طح'), + ] + +def _seg_45(): + return [ (0xFC27, 'M', u'طم'), (0xFC28, 'M', u'ظم'), (0xFC29, 'M', u'عج'), @@ -4787,10 +4776,6 @@ def _seg_45(): (0xFC7D, 'M', u'في'), (0xFC7E, 'M', u'قى'), (0xFC7F, 'M', u'قي'), - ] - -def _seg_46(): - return [ (0xFC80, 'M', u'كا'), (0xFC81, 'M', u'كل'), (0xFC82, 'M', u'كم'), @@ -4802,6 +4787,10 @@ def _seg_46(): (0xFC88, 'M', u'ما'), (0xFC89, 'M', u'مم'), (0xFC8A, 'M', u'نر'), + ] + +def _seg_46(): + return [ (0xFC8B, 'M', u'نز'), (0xFC8C, 'M', u'نم'), (0xFC8D, 'M', u'نن'), @@ -4891,10 +4880,6 @@ def _seg_46(): (0xFCE1, 'M', u'بم'), (0xFCE2, 'M', u'به'), (0xFCE3, 'M', u'تم'), - ] - -def _seg_47(): - return [ (0xFCE4, 'M', u'ته'), (0xFCE5, 'M', u'ثم'), (0xFCE6, 'M', u'ثه'), @@ -4906,6 +4891,10 @@ def _seg_47(): (0xFCEC, 'M', u'كم'), (0xFCED, 'M', u'لم'), (0xFCEE, 'M', u'نم'), + ] + +def _seg_47(): + return [ (0xFCEF, 'M', u'نه'), (0xFCF0, 'M', u'يم'), (0xFCF1, 'M', u'يه'), @@ -4995,10 +4984,6 @@ def _seg_47(): (0xFD57, 'M', u'تمخ'), (0xFD58, 'M', u'جمح'), (0xFD5A, 'M', u'حمي'), - ] - -def _seg_48(): - return [ (0xFD5B, 'M', u'حمى'), (0xFD5C, 'M', u'سحج'), (0xFD5D, 'M', u'سجح'), @@ -5010,6 +4995,10 @@ def _seg_48(): (0xFD66, 'M', u'صمم'), (0xFD67, 'M', u'شحم'), (0xFD69, 'M', u'شجي'), + ] + +def _seg_48(): + return [ (0xFD6A, 'M', u'شمخ'), (0xFD6C, 'M', u'شمم'), (0xFD6E, 'M', u'ضحى'), @@ -5099,10 +5088,6 @@ def _seg_48(): (0xFDF3, 'M', u'اكبر'), (0xFDF4, 'M', u'محمد'), (0xFDF5, 'M', u'صلعم'), - ] - -def _seg_49(): - return [ (0xFDF6, 'M', u'رسول'), (0xFDF7, 'M', u'عليه'), (0xFDF8, 'M', u'وسلم'), @@ -5114,6 +5099,10 @@ def _seg_49(): (0xFDFE, 'X'), (0xFE00, 'I'), (0xFE10, '3', u','), + ] + +def _seg_49(): + return [ (0xFE11, 'M', u'、'), (0xFE12, 'X'), (0xFE13, 
'3', u':'), @@ -5203,10 +5192,6 @@ def _seg_49(): (0xFE8F, 'M', u'ب'), (0xFE93, 'M', u'ة'), (0xFE95, 'M', u'ت'), - ] - -def _seg_50(): - return [ (0xFE99, 'M', u'ث'), (0xFE9D, 'M', u'ج'), (0xFEA1, 'M', u'ح'), @@ -5218,6 +5203,10 @@ def _seg_50(): (0xFEB1, 'M', u'س'), (0xFEB5, 'M', u'ش'), (0xFEB9, 'M', u'ص'), + ] + +def _seg_50(): + return [ (0xFEBD, 'M', u'ض'), (0xFEC1, 'M', u'ط'), (0xFEC5, 'M', u'ظ'), @@ -5307,10 +5296,6 @@ def _seg_50(): (0xFF41, 'M', u'a'), (0xFF42, 'M', u'b'), (0xFF43, 'M', u'c'), - ] - -def _seg_51(): - return [ (0xFF44, 'M', u'd'), (0xFF45, 'M', u'e'), (0xFF46, 'M', u'f'), @@ -5322,6 +5307,10 @@ def _seg_51(): (0xFF4C, 'M', u'l'), (0xFF4D, 'M', u'm'), (0xFF4E, 'M', u'n'), + ] + +def _seg_51(): + return [ (0xFF4F, 'M', u'o'), (0xFF50, 'M', u'p'), (0xFF51, 'M', u'q'), @@ -5411,10 +5400,6 @@ def _seg_51(): (0xFFA5, 'M', u'ᆬ'), (0xFFA6, 'M', u'ᆭ'), (0xFFA7, 'M', u'ᄃ'), - ] - -def _seg_52(): - return [ (0xFFA8, 'M', u'ᄄ'), (0xFFA9, 'M', u'ᄅ'), (0xFFAA, 'M', u'ᆰ'), @@ -5426,6 +5411,10 @@ def _seg_52(): (0xFFB0, 'M', u'ᄚ'), (0xFFB1, 'M', u'ᄆ'), (0xFFB2, 'M', u'ᄇ'), + ] + +def _seg_52(): + return [ (0xFFB3, 'M', u'ᄈ'), (0xFFB4, 'M', u'ᄡ'), (0xFFB5, 'M', u'ᄉ'), @@ -5515,10 +5504,6 @@ def _seg_52(): (0x10300, 'V'), (0x10324, 'X'), (0x1032D, 'V'), - ] - -def _seg_53(): - return [ (0x1034B, 'X'), (0x10350, 'V'), (0x1037B, 'X'), @@ -5530,6 +5515,10 @@ def _seg_53(): (0x103D6, 'X'), (0x10400, 'M', u'𐐨'), (0x10401, 'M', u'𐐩'), + ] + +def _seg_53(): + return [ (0x10402, 'M', u'𐐪'), (0x10403, 'M', u'𐐫'), (0x10404, 'M', u'𐐬'), @@ -5619,10 +5608,6 @@ def _seg_53(): (0x10570, 'X'), (0x10600, 'V'), (0x10737, 'X'), - ] - -def _seg_54(): - return [ (0x10740, 'V'), (0x10756, 'X'), (0x10760, 'V'), @@ -5634,6 +5619,10 @@ def _seg_54(): (0x1080A, 'V'), (0x10836, 'X'), (0x10837, 'V'), + ] + +def _seg_54(): + return [ (0x10839, 'X'), (0x1083C, 'V'), (0x1083D, 'X'), @@ -5666,11 +5655,11 @@ def _seg_54(): (0x10A15, 'V'), (0x10A18, 'X'), (0x10A19, 'V'), - (0x10A34, 'X'), + (0x10A36, 'X'), (0x10A38, 'V'), (0x10A3B, 'X'), (0x10A3F, 'V'), - (0x10A48, 'X'), + (0x10A49, 'X'), (0x10A50, 'V'), (0x10A59, 'X'), (0x10A60, 'V'), @@ -5723,10 +5712,6 @@ def _seg_54(): (0x10C9B, 'M', u'𐳛'), (0x10C9C, 'M', u'𐳜'), (0x10C9D, 'M', u'𐳝'), - ] - -def _seg_55(): - return [ (0x10C9E, 'M', u'𐳞'), (0x10C9F, 'M', u'𐳟'), (0x10CA0, 'M', u'𐳠'), @@ -5738,6 +5723,10 @@ def _seg_55(): (0x10CA6, 'M', u'𐳦'), (0x10CA7, 'M', u'𐳧'), (0x10CA8, 'M', u'𐳨'), + ] + +def _seg_55(): + return [ (0x10CA9, 'M', u'𐳩'), (0x10CAA, 'M', u'𐳪'), (0x10CAB, 'M', u'𐳫'), @@ -5752,9 +5741,15 @@ def _seg_55(): (0x10CC0, 'V'), (0x10CF3, 'X'), (0x10CFA, 'V'), - (0x10D00, 'X'), + (0x10D28, 'X'), + (0x10D30, 'V'), + (0x10D3A, 'X'), (0x10E60, 'V'), (0x10E7F, 'X'), + (0x10F00, 'V'), + (0x10F28, 'X'), + (0x10F30, 'V'), + (0x10F5A, 'X'), (0x11000, 'V'), (0x1104E, 'X'), (0x11052, 'V'), @@ -5770,7 +5765,7 @@ def _seg_55(): (0x11100, 'V'), (0x11135, 'X'), (0x11136, 'V'), - (0x11144, 'X'), + (0x11147, 'X'), (0x11150, 'V'), (0x11177, 'X'), (0x11180, 'V'), @@ -5811,7 +5806,7 @@ def _seg_55(): (0x11334, 'X'), (0x11335, 'V'), (0x1133A, 'X'), - (0x1133C, 'V'), + (0x1133B, 'V'), (0x11345, 'X'), (0x11347, 'V'), (0x11349, 'X'), @@ -5827,16 +5822,16 @@ def _seg_55(): (0x1136D, 'X'), (0x11370, 'V'), (0x11375, 'X'), - ] - -def _seg_56(): - return [ (0x11400, 'V'), (0x1145A, 'X'), (0x1145B, 'V'), (0x1145C, 'X'), (0x1145D, 'V'), - (0x1145E, 'X'), + ] + +def _seg_56(): + return [ + (0x1145F, 'X'), (0x11480, 'V'), (0x114C8, 'X'), (0x114D0, 'V'), @@ -5856,11 +5851,13 @@ def 
_seg_56(): (0x116C0, 'V'), (0x116CA, 'X'), (0x11700, 'V'), - (0x1171A, 'X'), + (0x1171B, 'X'), (0x1171D, 'V'), (0x1172C, 'X'), (0x11730, 'V'), (0x11740, 'X'), + (0x11800, 'V'), + (0x1183C, 'X'), (0x118A0, 'M', u'𑣀'), (0x118A1, 'M', u'𑣁'), (0x118A2, 'M', u'𑣂'), @@ -5902,8 +5899,6 @@ def _seg_56(): (0x11A50, 'V'), (0x11A84, 'X'), (0x11A86, 'V'), - (0x11A9D, 'X'), - (0x11A9E, 'V'), (0x11AA3, 'X'), (0x11AC0, 'V'), (0x11AF9, 'X'), @@ -5931,14 +5926,28 @@ def _seg_56(): (0x11D3B, 'X'), (0x11D3C, 'V'), (0x11D3E, 'X'), - ] - -def _seg_57(): - return [ (0x11D3F, 'V'), (0x11D48, 'X'), (0x11D50, 'V'), (0x11D5A, 'X'), + (0x11D60, 'V'), + ] + +def _seg_57(): + return [ + (0x11D66, 'X'), + (0x11D67, 'V'), + (0x11D69, 'X'), + (0x11D6A, 'V'), + (0x11D8F, 'X'), + (0x11D90, 'V'), + (0x11D92, 'X'), + (0x11D93, 'V'), + (0x11D99, 'X'), + (0x11DA0, 'V'), + (0x11DAA, 'X'), + (0x11EE0, 'V'), + (0x11EF9, 'X'), (0x12000, 'V'), (0x1239A, 'X'), (0x12400, 'V'), @@ -5973,6 +5982,8 @@ def _seg_57(): (0x16B78, 'X'), (0x16B7D, 'V'), (0x16B90, 'X'), + (0x16E60, 'V'), + (0x16E9B, 'X'), (0x16F00, 'V'), (0x16F45, 'X'), (0x16F50, 'V'), @@ -5982,7 +5993,7 @@ def _seg_57(): (0x16FE0, 'V'), (0x16FE2, 'X'), (0x17000, 'V'), - (0x187ED, 'X'), + (0x187F2, 'X'), (0x18800, 'V'), (0x18AF3, 'X'), (0x1B000, 'V'), @@ -6024,21 +6035,23 @@ def _seg_57(): (0x1D1C1, 'V'), (0x1D1E9, 'X'), (0x1D200, 'V'), + ] + +def _seg_58(): + return [ (0x1D246, 'X'), + (0x1D2E0, 'V'), + (0x1D2F4, 'X'), (0x1D300, 'V'), (0x1D357, 'X'), (0x1D360, 'V'), - (0x1D372, 'X'), + (0x1D379, 'X'), (0x1D400, 'M', u'a'), (0x1D401, 'M', u'b'), (0x1D402, 'M', u'c'), (0x1D403, 'M', u'd'), (0x1D404, 'M', u'e'), (0x1D405, 'M', u'f'), - ] - -def _seg_58(): - return [ (0x1D406, 'M', u'g'), (0x1D407, 'M', u'h'), (0x1D408, 'M', u'i'), @@ -6126,6 +6139,10 @@ def _seg_58(): (0x1D45A, 'M', u'm'), (0x1D45B, 'M', u'n'), (0x1D45C, 'M', u'o'), + ] + +def _seg_59(): + return [ (0x1D45D, 'M', u'p'), (0x1D45E, 'M', u'q'), (0x1D45F, 'M', u'r'), @@ -6139,10 +6156,6 @@ def _seg_58(): (0x1D467, 'M', u'z'), (0x1D468, 'M', u'a'), (0x1D469, 'M', u'b'), - ] - -def _seg_59(): - return [ (0x1D46A, 'M', u'c'), (0x1D46B, 'M', u'd'), (0x1D46C, 'M', u'e'), @@ -6230,6 +6243,10 @@ def _seg_59(): (0x1D4C1, 'M', u'l'), (0x1D4C2, 'M', u'm'), (0x1D4C3, 'M', u'n'), + ] + +def _seg_60(): + return [ (0x1D4C4, 'X'), (0x1D4C5, 'M', u'p'), (0x1D4C6, 'M', u'q'), @@ -6243,10 +6260,6 @@ def _seg_59(): (0x1D4CE, 'M', u'y'), (0x1D4CF, 'M', u'z'), (0x1D4D0, 'M', u'a'), - ] - -def _seg_60(): - return [ (0x1D4D1, 'M', u'b'), (0x1D4D2, 'M', u'c'), (0x1D4D3, 'M', u'd'), @@ -6334,6 +6347,10 @@ def _seg_60(): (0x1D526, 'M', u'i'), (0x1D527, 'M', u'j'), (0x1D528, 'M', u'k'), + ] + +def _seg_61(): + return [ (0x1D529, 'M', u'l'), (0x1D52A, 'M', u'm'), (0x1D52B, 'M', u'n'), @@ -6347,10 +6364,6 @@ def _seg_60(): (0x1D533, 'M', u'v'), (0x1D534, 'M', u'w'), (0x1D535, 'M', u'x'), - ] - -def _seg_61(): - return [ (0x1D536, 'M', u'y'), (0x1D537, 'M', u'z'), (0x1D538, 'M', u'a'), @@ -6438,6 +6451,10 @@ def _seg_61(): (0x1D58C, 'M', u'g'), (0x1D58D, 'M', u'h'), (0x1D58E, 'M', u'i'), + ] + +def _seg_62(): + return [ (0x1D58F, 'M', u'j'), (0x1D590, 'M', u'k'), (0x1D591, 'M', u'l'), @@ -6451,10 +6468,6 @@ def _seg_61(): (0x1D599, 'M', u't'), (0x1D59A, 'M', u'u'), (0x1D59B, 'M', u'v'), - ] - -def _seg_62(): - return [ (0x1D59C, 'M', u'w'), (0x1D59D, 'M', u'x'), (0x1D59E, 'M', u'y'), @@ -6542,6 +6555,10 @@ def _seg_62(): (0x1D5F0, 'M', u'c'), (0x1D5F1, 'M', u'd'), (0x1D5F2, 'M', u'e'), + ] + +def _seg_63(): + return [ (0x1D5F3, 'M', u'f'), 
(0x1D5F4, 'M', u'g'), (0x1D5F5, 'M', u'h'), @@ -6555,10 +6572,6 @@ def _seg_62(): (0x1D5FD, 'M', u'p'), (0x1D5FE, 'M', u'q'), (0x1D5FF, 'M', u'r'), - ] - -def _seg_63(): - return [ (0x1D600, 'M', u's'), (0x1D601, 'M', u't'), (0x1D602, 'M', u'u'), @@ -6646,6 +6659,10 @@ def _seg_63(): (0x1D654, 'M', u'y'), (0x1D655, 'M', u'z'), (0x1D656, 'M', u'a'), + ] + +def _seg_64(): + return [ (0x1D657, 'M', u'b'), (0x1D658, 'M', u'c'), (0x1D659, 'M', u'd'), @@ -6659,10 +6676,6 @@ def _seg_63(): (0x1D661, 'M', u'l'), (0x1D662, 'M', u'm'), (0x1D663, 'M', u'n'), - ] - -def _seg_64(): - return [ (0x1D664, 'M', u'o'), (0x1D665, 'M', u'p'), (0x1D666, 'M', u'q'), @@ -6750,6 +6763,10 @@ def _seg_64(): (0x1D6B9, 'M', u'θ'), (0x1D6BA, 'M', u'σ'), (0x1D6BB, 'M', u'τ'), + ] + +def _seg_65(): + return [ (0x1D6BC, 'M', u'υ'), (0x1D6BD, 'M', u'φ'), (0x1D6BE, 'M', u'χ'), @@ -6763,10 +6780,6 @@ def _seg_64(): (0x1D6C6, 'M', u'ε'), (0x1D6C7, 'M', u'ζ'), (0x1D6C8, 'M', u'η'), - ] - -def _seg_65(): - return [ (0x1D6C9, 'M', u'θ'), (0x1D6CA, 'M', u'ι'), (0x1D6CB, 'M', u'κ'), @@ -6854,6 +6867,10 @@ def _seg_65(): (0x1D71F, 'M', u'δ'), (0x1D720, 'M', u'ε'), (0x1D721, 'M', u'ζ'), + ] + +def _seg_66(): + return [ (0x1D722, 'M', u'η'), (0x1D723, 'M', u'θ'), (0x1D724, 'M', u'ι'), @@ -6867,10 +6884,6 @@ def _seg_65(): (0x1D72C, 'M', u'ρ'), (0x1D72D, 'M', u'θ'), (0x1D72E, 'M', u'σ'), - ] - -def _seg_66(): - return [ (0x1D72F, 'M', u'τ'), (0x1D730, 'M', u'υ'), (0x1D731, 'M', u'φ'), @@ -6958,6 +6971,10 @@ def _seg_66(): (0x1D785, 'M', u'φ'), (0x1D786, 'M', u'χ'), (0x1D787, 'M', u'ψ'), + ] + +def _seg_67(): + return [ (0x1D788, 'M', u'ω'), (0x1D789, 'M', u'∂'), (0x1D78A, 'M', u'ε'), @@ -6971,10 +6988,6 @@ def _seg_66(): (0x1D792, 'M', u'γ'), (0x1D793, 'M', u'δ'), (0x1D794, 'M', u'ε'), - ] - -def _seg_67(): - return [ (0x1D795, 'M', u'ζ'), (0x1D796, 'M', u'η'), (0x1D797, 'M', u'θ'), @@ -7062,6 +7075,10 @@ def _seg_67(): (0x1D7EC, 'M', u'0'), (0x1D7ED, 'M', u'1'), (0x1D7EE, 'M', u'2'), + ] + +def _seg_68(): + return [ (0x1D7EF, 'M', u'3'), (0x1D7F0, 'M', u'4'), (0x1D7F1, 'M', u'5'), @@ -7075,10 +7092,6 @@ def _seg_67(): (0x1D7F9, 'M', u'3'), (0x1D7FA, 'M', u'4'), (0x1D7FB, 'M', u'5'), - ] - -def _seg_68(): - return [ (0x1D7FC, 'M', u'6'), (0x1D7FD, 'M', u'7'), (0x1D7FE, 'M', u'8'), @@ -7143,6 +7156,8 @@ def _seg_68(): (0x1E95A, 'X'), (0x1E95E, 'V'), (0x1E960, 'X'), + (0x1EC71, 'V'), + (0x1ECB5, 'X'), (0x1EE00, 'M', u'ا'), (0x1EE01, 'M', u'ب'), (0x1EE02, 'M', u'ج'), @@ -7164,6 +7179,10 @@ def _seg_68(): (0x1EE12, 'M', u'ق'), (0x1EE13, 'M', u'ر'), (0x1EE14, 'M', u'ش'), + ] + +def _seg_69(): + return [ (0x1EE15, 'M', u'ت'), (0x1EE16, 'M', u'ث'), (0x1EE17, 'M', u'خ'), @@ -7179,10 +7198,6 @@ def _seg_68(): (0x1EE21, 'M', u'ب'), (0x1EE22, 'M', u'ج'), (0x1EE23, 'X'), - ] - -def _seg_69(): - return [ (0x1EE24, 'M', u'ه'), (0x1EE25, 'X'), (0x1EE27, 'M', u'ح'), @@ -7268,6 +7283,10 @@ def _seg_69(): (0x1EE81, 'M', u'ب'), (0x1EE82, 'M', u'ج'), (0x1EE83, 'M', u'د'), + ] + +def _seg_70(): + return [ (0x1EE84, 'M', u'ه'), (0x1EE85, 'M', u'و'), (0x1EE86, 'M', u'ز'), @@ -7283,10 +7302,6 @@ def _seg_69(): (0x1EE90, 'M', u'ف'), (0x1EE91, 'M', u'ص'), (0x1EE92, 'M', u'ق'), - ] - -def _seg_70(): - return [ (0x1EE93, 'M', u'ر'), (0x1EE94, 'M', u'ش'), (0x1EE95, 'M', u'ت'), @@ -7372,6 +7387,10 @@ def _seg_70(): (0x1F122, '3', u'(s)'), (0x1F123, '3', u'(t)'), (0x1F124, '3', u'(u)'), + ] + +def _seg_71(): + return [ (0x1F125, '3', u'(v)'), (0x1F126, '3', u'(w)'), (0x1F127, '3', u'(x)'), @@ -7382,15 +7401,11 @@ def _seg_70(): (0x1F12C, 'M', u'r'), (0x1F12D, 'M', 
u'cd'), (0x1F12E, 'M', u'wz'), - (0x1F12F, 'X'), + (0x1F12F, 'V'), (0x1F130, 'M', u'a'), (0x1F131, 'M', u'b'), (0x1F132, 'M', u'c'), (0x1F133, 'M', u'd'), - ] - -def _seg_71(): - return [ (0x1F134, 'M', u'e'), (0x1F135, 'M', u'f'), (0x1F136, 'M', u'g'), @@ -7476,6 +7491,10 @@ def _seg_71(): (0x1F239, 'M', u'割'), (0x1F23A, 'M', u'営'), (0x1F23B, 'M', u'配'), + ] + +def _seg_72(): + return [ (0x1F23C, 'X'), (0x1F240, 'M', u'〔本〕'), (0x1F241, 'M', u'〔三〕'), @@ -7491,21 +7510,17 @@ def _seg_71(): (0x1F251, 'M', u'可'), (0x1F252, 'X'), (0x1F260, 'V'), - ] - -def _seg_72(): - return [ (0x1F266, 'X'), (0x1F300, 'V'), (0x1F6D5, 'X'), (0x1F6E0, 'V'), (0x1F6ED, 'X'), (0x1F6F0, 'V'), - (0x1F6F9, 'X'), + (0x1F6FA, 'X'), (0x1F700, 'V'), (0x1F774, 'X'), (0x1F780, 'V'), - (0x1F7D5, 'X'), + (0x1F7D9, 'X'), (0x1F800, 'V'), (0x1F80C, 'X'), (0x1F810, 'V'), @@ -7521,15 +7536,21 @@ def _seg_72(): (0x1F910, 'V'), (0x1F93F, 'X'), (0x1F940, 'V'), - (0x1F94D, 'X'), - (0x1F950, 'V'), - (0x1F96C, 'X'), - (0x1F980, 'V'), - (0x1F998, 'X'), + (0x1F971, 'X'), + (0x1F973, 'V'), + (0x1F977, 'X'), + (0x1F97A, 'V'), + (0x1F97B, 'X'), + (0x1F97C, 'V'), + (0x1F9A3, 'X'), + (0x1F9B0, 'V'), + (0x1F9BA, 'X'), (0x1F9C0, 'V'), - (0x1F9C1, 'X'), + (0x1F9C3, 'X'), (0x1F9D0, 'V'), - (0x1F9E7, 'X'), + (0x1FA00, 'X'), + (0x1FA60, 'V'), + (0x1FA6E, 'X'), (0x20000, 'V'), (0x2A6D7, 'X'), (0x2A700, 'V'), @@ -7574,6 +7595,10 @@ def _seg_72(): (0x2F81F, 'M', u'㓟'), (0x2F820, 'M', u'刻'), (0x2F821, 'M', u'剆'), + ] + +def _seg_73(): + return [ (0x2F822, 'M', u'割'), (0x2F823, 'M', u'剷'), (0x2F824, 'M', u'㔕'), @@ -7595,10 +7620,6 @@ def _seg_72(): (0x2F836, 'M', u'及'), (0x2F837, 'M', u'叟'), (0x2F838, 'M', u'𠭣'), - ] - -def _seg_73(): - return [ (0x2F839, 'M', u'叫'), (0x2F83A, 'M', u'叱'), (0x2F83B, 'M', u'吆'), @@ -7678,6 +7699,10 @@ def _seg_73(): (0x2F887, 'M', u'幩'), (0x2F888, 'M', u'㡢'), (0x2F889, 'M', u'𢆃'), + ] + +def _seg_74(): + return [ (0x2F88A, 'M', u'㡼'), (0x2F88B, 'M', u'庰'), (0x2F88C, 'M', u'庳'), @@ -7699,10 +7724,6 @@ def _seg_73(): (0x2F89E, 'M', u'志'), (0x2F89F, 'M', u'忹'), (0x2F8A0, 'M', u'悁'), - ] - -def _seg_74(): - return [ (0x2F8A1, 'M', u'㤺'), (0x2F8A2, 'M', u'㤜'), (0x2F8A3, 'M', u'悔'), @@ -7782,6 +7803,10 @@ def _seg_74(): (0x2F8ED, 'M', u'櫛'), (0x2F8EE, 'M', u'㰘'), (0x2F8EF, 'M', u'次'), + ] + +def _seg_75(): + return [ (0x2F8F0, 'M', u'𣢧'), (0x2F8F1, 'M', u'歔'), (0x2F8F2, 'M', u'㱎'), @@ -7803,10 +7828,6 @@ def _seg_74(): (0x2F902, 'M', u'流'), (0x2F903, 'M', u'浩'), (0x2F904, 'M', u'浸'), - ] - -def _seg_75(): - return [ (0x2F905, 'M', u'涅'), (0x2F906, 'M', u'𣴞'), (0x2F907, 'M', u'洴'), @@ -7886,6 +7907,10 @@ def _seg_75(): (0x2F953, 'M', u'祖'), (0x2F954, 'M', u'𥚚'), (0x2F955, 'M', u'𥛅'), + ] + +def _seg_76(): + return [ (0x2F956, 'M', u'福'), (0x2F957, 'M', u'秫'), (0x2F958, 'M', u'䄯'), @@ -7907,10 +7932,6 @@ def _seg_75(): (0x2F969, 'M', u'糣'), (0x2F96A, 'M', u'紀'), (0x2F96B, 'M', u'𥾆'), - ] - -def _seg_76(): - return [ (0x2F96C, 'M', u'絣'), (0x2F96D, 'M', u'䌁'), (0x2F96E, 'M', u'緇'), @@ -7990,6 +8011,10 @@ def _seg_76(): (0x2F9B8, 'M', u'蚈'), (0x2F9B9, 'M', u'蜎'), (0x2F9BA, 'M', u'蛢'), + ] + +def _seg_77(): + return [ (0x2F9BB, 'M', u'蝹'), (0x2F9BC, 'M', u'蜨'), (0x2F9BD, 'M', u'蝫'), @@ -8011,10 +8036,6 @@ def _seg_76(): (0x2F9CD, 'M', u'䚾'), (0x2F9CE, 'M', u'䛇'), (0x2F9CF, 'M', u'誠'), - ] - -def _seg_77(): - return [ (0x2F9D0, 'M', u'諭'), (0x2F9D1, 'M', u'變'), (0x2F9D2, 'M', u'豕'), @@ -8094,6 +8115,10 @@ def _seg_77(): (0x2FA1D, 'M', u'𪘀'), (0x2FA1E, 'X'), (0xE0100, 'I'), + ] + +def _seg_78(): + return [ (0xE01F0, 'X'), ] @@ 
-8176,4 +8201,5 @@ uts46data = tuple( + _seg_75() + _seg_76() + _seg_77() + + _seg_78() ) diff --git a/libs/ipaddress.py b/libs/ipaddress.py index 26d84ed76..16f03bfd8 100644 --- a/libs/ipaddress.py +++ b/libs/ipaddress.py @@ -1,84 +1,129 @@ -# Python 2.7 port of Python 3.4's ipaddress module. - -# List of compatibility changes: - -# Python 3 uses only new-style classes. -# s/class \(\w\+\):/class \1(object):/ - -# Use iterator versions of map and range: -from itertools import imap as map -range = xrange - -# Except that xrange only supports machine integers, not longs, so... -def long_range(start, end): - while start < end: - yield start - start += 1 - -# This backport uses bytearray instead of bytes, as bytes is the same -# as str in Python 2.7. -bytes = bytearray - -# Python 2 does not support exception chaining. -# s/ from None$// - -# When checking for instances of int, also allow Python 2's long. -_builtin_isinstance = isinstance - -def isinstance(val, types): - if types is int: - types = (int, long) - elif type(types) is tuple and int in types: - types += (long,) - return _builtin_isinstance(val, types) - -# functools.lru_cache is Python 3.2+ only. -# /@functools.lru_cache()/d - -# int().to_bytes is Python 3.2+ only. -# s/\(\w+\)\.to_bytes(/_int_to_bytes(\1, / -def _int_to_bytes(self, length, byteorder, signed=False): - assert byteorder == 'big' and signed is False - if self < 0 or self >= 256**length: - raise OverflowError() - return bytearray(('%0*x' % (length * 2, self)).decode('hex')) - -# int.from_bytes is Python 3.2+ only. -# s/int\.from_bytes(/_int_from_bytes(/g -def _int_from_bytes(what, byteorder, signed=False): - assert byteorder == 'big' and signed is False - return int(str(bytearray(what)).encode('hex'), 16) - -# Python 2.6 has no int.bit_length() +# Copyright 2007 Google Inc. +# Licensed to PSF under a Contributor Agreement. + +"""A fast, lightweight IPv4/IPv6 manipulation library in Python. + +This library is used to create/poke/manipulate IPv4 and IPv6 addresses +and networks. + +""" + +from __future__ import unicode_literals + + +import itertools +import struct + +__version__ = '1.0.22' + +# Compatibility functions +_compat_int_types = (int,) +try: + _compat_int_types = (int, long) +except NameError: + pass +try: + _compat_str = unicode +except NameError: + _compat_str = str + assert bytes != str +if b'\0'[0] == 0: # Python 3 semantics + def _compat_bytes_to_byte_vals(byt): + return byt +else: + def _compat_bytes_to_byte_vals(byt): + return [struct.unpack(b'!B', b)[0] for b in byt] +try: + _compat_int_from_byte_vals = int.from_bytes +except AttributeError: + def _compat_int_from_byte_vals(bytvals, endianess): + assert endianess == 'big' + res = 0 + for bv in bytvals: + assert isinstance(bv, _compat_int_types) + res = (res << 8) + bv + return res + + +def _compat_to_bytes(intval, length, endianess): + assert isinstance(intval, _compat_int_types) + assert endianess == 'big' + if length == 4: + if intval < 0 or intval >= 2 ** 32: + raise struct.error("integer out of range for 'I' format code") + return struct.pack(b'!I', intval) + elif length == 16: + if intval < 0 or intval >= 2 ** 128: + raise struct.error("integer out of range for 'QQ' format code") + return struct.pack(b'!QQ', intval >> 64, intval & 0xffffffffffffffff) + else: + raise NotImplementedError() + + if hasattr(int, 'bit_length'): - # Not `int.bit_length`, since it must also work for `long`. 
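
A note on the compatibility layer introduced above: the new _compat_* helpers replace the old backport's hand-rolled hex-string tricks with struct, which behaves identically under both interpreters and fails loudly on out-of-range values. A quick round trip in the same spirit (standalone sketch):

    import struct

    # Same idea as _compat_to_bytes(intval, 4, 'big') in the diff
    packed = struct.pack(b'!I', 0xC0A80101)          # 192.168.1.1
    assert packed == b'\xc0\xa8\x01\x01'

    # ...and the fallback inverse, as _compat_int_from_byte_vals builds it
    value = 0
    for byte in bytearray(packed):                   # ints on both 2 and 3
        value = (value << 8) | byte
    assert value == 0xC0A80101
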
- _int_bit_length = lambda i: i.bit_length() + # Not int.bit_length , since that won't work in 2.7 where long exists + def _compat_bit_length(i): + return i.bit_length() else: - _int_bit_length = lambda i: len(bin(abs(i))) - 2 + def _compat_bit_length(i): + for res in itertools.count(): + if i >> res == 0: + return res +def _compat_range(start, end, step=1): + assert step > 0 + i = start + while i < end: + yield i + i += step -# ---------------------------------------------------------------------------- +class _TotalOrderingMixin(object): + __slots__ = () -# Copyright 2007 Google Inc. -# Licensed to PSF under a Contributor Agreement. + # Helper that derives the other comparison operations from + # __lt__ and __eq__ + # We avoid functools.total_ordering because it doesn't handle + # NotImplemented correctly yet (http://bugs.python.org/issue10042) + def __eq__(self, other): + raise NotImplementedError -"""A fast, lightweight IPv4/IPv6 manipulation library in Python. + def __ne__(self, other): + equal = self.__eq__(other) + if equal is NotImplemented: + return NotImplemented + return not equal -This library is used to create/poke/manipulate IPv4 and IPv6 addresses -and networks. + def __lt__(self, other): + raise NotImplementedError -""" + def __le__(self, other): + less = self.__lt__(other) + if less is NotImplemented or not less: + return self.__eq__(other) + return less -__version__ = '1.0' + def __gt__(self, other): + less = self.__lt__(other) + if less is NotImplemented: + return NotImplemented + equal = self.__eq__(other) + if equal is NotImplemented: + return NotImplemented + return not (less or equal) + def __ge__(self, other): + less = self.__lt__(other) + if less is NotImplemented: + return NotImplemented + return not less -import functools IPV4LENGTH = 32 IPV6LENGTH = 128 + class AddressValueError(ValueError): """A Value Error related to the address.""" @@ -113,6 +158,12 @@ def ip_address(address): except (AddressValueError, NetmaskValueError): pass + if isinstance(address, bytes): + raise AddressValueError( + '%r does not appear to be an IPv4 or IPv6 address. ' + 'Did you pass in a bytes (str in Python 2) instead of' + ' a unicode object?' % address) + raise ValueError('%r does not appear to be an IPv4 or IPv6 address' % address) @@ -143,6 +194,12 @@ def ip_network(address, strict=True): except (AddressValueError, NetmaskValueError): pass + if isinstance(address, bytes): + raise AddressValueError( + '%r does not appear to be an IPv4 or IPv6 network. ' + 'Did you pass in a bytes (str in Python 2) instead of' + ' a unicode object?' % address) + raise ValueError('%r does not appear to be an IPv4 or IPv6 network' % address) @@ -197,8 +254,8 @@ def v4_int_to_packed(address): """ try: - return _int_to_bytes(address, 4, 'big') - except: + return _compat_to_bytes(address, 4, 'big') + except (struct.error, OverflowError): raise ValueError("Address negative or too large for IPv4") @@ -213,36 +270,37 @@ def v6_int_to_packed(address): """ try: - return _int_to_bytes(address, 16, 'big') - except: + return _compat_to_bytes(address, 16, 'big') + except (struct.error, OverflowError): raise ValueError("Address negative or too large for IPv6") def _split_optional_netmask(address): """Helper to split the netmask and raise AddressValueError if needed""" - addr = str(address).split('/') + addr = _compat_str(address).split('/') if len(addr) > 2: raise AddressValueError("Only one '/' permitted in %r" % address) return addr def _find_address_range(addresses): - """Find a sequence of IPv#Address. 
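
The relocated _TotalOrderingMixin deserves a note: subclasses supply only __eq__ and __lt__, and the mixin derives __ne__/__le__/__gt__/__ge__ while passing NotImplemented through untouched; as the comment in the diff says, this sidesteps functools.total_ordering, which at the time mishandled NotImplemented (bpo-10042). Returning NotImplemented rather than raising is what lets mixed-type comparisons degrade gracefully:

    class Meters(object):
        # Minimal illustration of the convention the mixin relies on.
        def __init__(self, v):
            self.v = v

        def __eq__(self, other):
            if not isinstance(other, Meters):
                return NotImplemented   # defer to the other operand
            return self.v == other.v

    # Python tries str.__eq__ next, then falls back to identity, so this
    # comparison is False instead of raising:
    assert (Meters(3) == "3") is False
    assert Meters(3) == Meters(3)
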
+ """Find a sequence of sorted deduplicated IPv#Address. Args: addresses: a list of IPv#Address objects. - Returns: + Yields: A tuple containing the first and last IP addresses in the sequence. """ - first = last = addresses[0] - for ip in addresses[1:]: - if ip._ip == last._ip + 1: - last = ip - else: - break - return (first, last) + it = iter(addresses) + first = last = next(it) + for ip in it: + if ip._ip != last._ip + 1: + yield first, last + first = ip + last = ip + yield first, last def _count_righthand_zero_bits(number, bits): @@ -258,11 +316,7 @@ def _count_righthand_zero_bits(number, bits): """ if number == 0: return bits - for i in range(bits): - if (number >> i) & 1: - return i - # All bits of interest were zero, even if there are more in the number - return bits + return min(bits, _compat_bit_length(~number & (number - 1))) def summarize_address_range(first, last): @@ -296,7 +350,7 @@ def summarize_address_range(first, last): raise TypeError('first and last must be IP addresses, not networks') if first.version != last.version: raise TypeError("%s and %s are not of the same version" % ( - first, last)) + first, last)) if first > last: raise ValueError('last IP address must be greater than first') @@ -312,16 +366,15 @@ def summarize_address_range(first, last): last_int = last._ip while first_int <= last_int: nbits = min(_count_righthand_zero_bits(first_int, ip_bits), - _int_bit_length(last_int - first_int + 1) - 1) - net = ip('%s/%d' % (first, ip_bits - nbits)) + _compat_bit_length(last_int - first_int + 1) - 1) + net = ip((first_int, ip_bits - nbits)) yield net first_int += 1 << nbits if first_int - 1 == ip._ALL_ONES: break - first = first.__class__(first_int) -def _collapse_addresses_recursive(addresses): +def _collapse_addresses_internal(addresses): """Loops through the addresses, collapsing concurrent netblocks. Example: @@ -331,7 +384,7 @@ def _collapse_addresses_recursive(addresses): ip3 = IPv4Network('192.0.2.128/26') ip4 = IPv4Network('192.0.2.192/26') - _collapse_addresses_recursive([ip1, ip2, ip3, ip4]) -> + _collapse_addresses_internal([ip1, ip2, ip3, ip4]) -> [IPv4Network('192.0.2.0/24')] This shouldn't be called directly; it is called via @@ -345,28 +398,29 @@ def _collapse_addresses_recursive(addresses): passed. """ - while True: - last_addr = None - ret_array = [] - optimized = False - - for cur_addr in addresses: - if not ret_array: - last_addr = cur_addr - ret_array.append(cur_addr) - elif (cur_addr.network_address >= last_addr.network_address and - cur_addr.broadcast_address <= last_addr.broadcast_address): - optimized = True - elif cur_addr == list(last_addr.supernet().subnets())[1]: - ret_array[-1] = last_addr = last_addr.supernet() - optimized = True - else: - last_addr = cur_addr - ret_array.append(cur_addr) - - addresses = ret_array - if not optimized: - return addresses + # First merge + to_merge = list(addresses) + subnets = {} + while to_merge: + net = to_merge.pop() + supernet = net.supernet() + existing = subnets.get(supernet) + if existing is None: + subnets[supernet] = net + elif existing != net: + # Merge consecutive subnets + del subnets[supernet] + to_merge.append(supernet) + # Then iterate over resulting networks, skipping subsumed subnets + last = None + for net in sorted(subnets.values()): + if last is not None: + # Since they are sorted, + # last.network_address <= net.network_address is a given. 
+ if last.broadcast_address >= net.broadcast_address: + continue + yield net + last = net def collapse_addresses(addresses): @@ -387,7 +441,6 @@ def collapse_addresses(addresses): TypeError: If passed a list of mixed version objects. """ - i = 0 addrs = [] ips = [] nets = [] @@ -397,12 +450,12 @@ def collapse_addresses(addresses): if isinstance(ip, _BaseAddress): if ips and ips[-1]._version != ip._version: raise TypeError("%s and %s are not of the same version" % ( - ip, ips[-1])) + ip, ips[-1])) ips.append(ip) elif ip._prefixlen == ip._max_prefixlen: if ips and ips[-1]._version != ip._version: raise TypeError("%s and %s are not of the same version" % ( - ip, ips[-1])) + ip, ips[-1])) try: ips.append(ip.ip) except AttributeError: @@ -410,20 +463,18 @@ def collapse_addresses(addresses): else: if nets and nets[-1]._version != ip._version: raise TypeError("%s and %s are not of the same version" % ( - ip, nets[-1])) + ip, nets[-1])) nets.append(ip) # sort and dedup ips = sorted(set(ips)) - nets = sorted(set(nets)) - while i < len(ips): - (first, last) = _find_address_range(ips[i:]) - i = ips.index(last) + 1 - addrs.extend(summarize_address_range(first, last)) + # find consecutive address ranges in the sorted sequence and summarize them + if ips: + for first, last in _find_address_range(ips): + addrs.extend(summarize_address_range(first, last)) - return iter(_collapse_addresses_recursive(sorted( - addrs + nets, key=_BaseNetwork._get_networks_key))) + return _collapse_addresses_internal(addrs + nets) def get_mixed_type_key(obj): @@ -451,43 +502,12 @@ def get_mixed_type_key(obj): return NotImplemented -class _TotalOrderingMixin(object): - # Helper that derives the other comparison operations from - # __lt__ and __eq__ - # We avoid functools.total_ordering because it doesn't handle - # NotImplemented correctly yet (http://bugs.python.org/issue10042) - def __eq__(self, other): - raise NotImplementedError - def __ne__(self, other): - equal = self.__eq__(other) - if equal is NotImplemented: - return NotImplemented - return not equal - def __lt__(self, other): - raise NotImplementedError - def __le__(self, other): - less = self.__lt__(other) - if less is NotImplemented or not less: - return self.__eq__(other) - return less - def __gt__(self, other): - less = self.__lt__(other) - if less is NotImplemented: - return NotImplemented - equal = self.__eq__(other) - if equal is NotImplemented: - return NotImplemented - return not (less or equal) - def __ge__(self, other): - less = self.__lt__(other) - if less is NotImplemented: - return NotImplemented - return not less - class _IPAddressBase(_TotalOrderingMixin): """The mother class.""" + __slots__ = () + @property def exploded(self): """Return the longhand version of the IP address as a string.""" @@ -496,7 +516,18 @@ class _IPAddressBase(_TotalOrderingMixin): @property def compressed(self): """Return the shorthand version of the IP address as a string.""" - return str(self) + return _compat_str(self) + + @property + def reverse_pointer(self): + """The name of the reverse DNS pointer for the IP address, e.g.: + >>> ipaddress.ip_address("127.0.0.1").reverse_pointer + '1.0.0.127.in-addr.arpa' + >>> ipaddress.ip_address("2001:db8::1").reverse_pointer + '1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa' + + """ + return self._reverse_pointer() @property def version(self): @@ -515,11 +546,15 @@ class _IPAddressBase(_TotalOrderingMixin): def _check_packed_address(self, address, expected_len): address_len = len(address) if address_len != 
expected_len: - msg = "%r (len %d != %d) is not permitted as an IPv%d address" + msg = ( + '%r (len %d != %d) is not permitted as an IPv%d address. ' + 'Did you pass in a bytes (str in Python 2) instead of' + ' a unicode object?') raise AddressValueError(msg % (address, address_len, expected_len, self._version)) - def _ip_int_from_prefix(self, prefixlen): + @classmethod + def _ip_int_from_prefix(cls, prefixlen): """Turn the prefix length into a bitwise netmask Args: @@ -529,13 +564,14 @@ class _IPAddressBase(_TotalOrderingMixin): An integer. """ - return self._ALL_ONES ^ (self._ALL_ONES >> prefixlen) + return cls._ALL_ONES ^ (cls._ALL_ONES >> prefixlen) - def _prefix_from_ip_int(self, ip_int): + @classmethod + def _prefix_from_ip_int(cls, ip_int): """Return prefix length from the bitwise netmask. Args: - ip_int: An integer, the netmask in axpanded bitwise format + ip_int: An integer, the netmask in expanded bitwise format Returns: An integer, the prefix length. @@ -544,22 +580,24 @@ class _IPAddressBase(_TotalOrderingMixin): ValueError: If the input intermingles zeroes & ones """ trailing_zeroes = _count_righthand_zero_bits(ip_int, - self._max_prefixlen) - prefixlen = self._max_prefixlen - trailing_zeroes + cls._max_prefixlen) + prefixlen = cls._max_prefixlen - trailing_zeroes leading_ones = ip_int >> trailing_zeroes all_ones = (1 << prefixlen) - 1 if leading_ones != all_ones: - byteslen = self._max_prefixlen // 8 - details = _int_to_bytes(ip_int, byteslen, 'big') + byteslen = cls._max_prefixlen // 8 + details = _compat_to_bytes(ip_int, byteslen, 'big') msg = 'Netmask pattern %r mixes zeroes & ones' raise ValueError(msg % details) return prefixlen - def _report_invalid_netmask(self, netmask_str): + @classmethod + def _report_invalid_netmask(cls, netmask_str): msg = '%r is not a valid netmask' % netmask_str raise NetmaskValueError(msg) - def _prefix_from_prefix_string(self, prefixlen_str): + @classmethod + def _prefix_from_prefix_string(cls, prefixlen_str): """Return prefix length from a numeric string Args: @@ -574,16 +612,17 @@ class _IPAddressBase(_TotalOrderingMixin): # int allows a leading +/- as well as surrounding whitespace, # so we ensure that isn't the case if not _BaseV4._DECIMAL_DIGITS.issuperset(prefixlen_str): - self._report_invalid_netmask(prefixlen_str) + cls._report_invalid_netmask(prefixlen_str) try: prefixlen = int(prefixlen_str) except ValueError: - self._report_invalid_netmask(prefixlen_str) - if not (0 <= prefixlen <= self._max_prefixlen): - self._report_invalid_netmask(prefixlen_str) + cls._report_invalid_netmask(prefixlen_str) + if not (0 <= prefixlen <= cls._max_prefixlen): + cls._report_invalid_netmask(prefixlen_str) return prefixlen - def _prefix_from_ip_string(self, ip_str): + @classmethod + def _prefix_from_ip_string(cls, ip_str): """Turn a netmask/hostmask string into a prefix length Args: @@ -597,24 +636,27 @@ class _IPAddressBase(_TotalOrderingMixin): """ # Parse the netmask/hostmask like an IP address. try: - ip_int = self._ip_int_from_string(ip_str) + ip_int = cls._ip_int_from_string(ip_str) except AddressValueError: - self._report_invalid_netmask(ip_str) + cls._report_invalid_netmask(ip_str) # Try matching a netmask (this would be /1*0*/ as a bitwise regexp). # Note that the two ambiguous cases (all-ones and all-zeroes) are # treated as netmasks. try: - return self._prefix_from_ip_int(ip_int) + return cls._prefix_from_ip_int(ip_int) except ValueError: pass # Invert the bits, and try matching a /0+1+/ hostmask instead. 
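
(The hunk continues below.) These _ip_int_from_prefix / _prefix_from_* helpers all become classmethods, presumably so the per-class netmask cache mentioned at the very end of this excerpt (_make_netmask()) can invoke them before any instance exists. The arithmetic itself is compact; for IPv4 and /24:

    ALL_ONES = (2 ** 32) - 1

    # _ip_int_from_prefix: /24 -> 255.255.255.0 as an integer
    netmask = ALL_ONES ^ (ALL_ONES >> 24)
    assert netmask == 0xFFFFFF00

    # _prefix_from_ip_string also accepts the hostmask spelling 0.0.0.255;
    # XOR with ALL_ONES maps it onto the same /1*0*/ netmask pattern.
    hostmask = 0x000000FF
    assert hostmask ^ ALL_ONES == netmask
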
- ip_int ^= self._ALL_ONES + ip_int ^= cls._ALL_ONES try: - return self._prefix_from_ip_int(ip_int) + return cls._prefix_from_ip_int(ip_int) except ValueError: - self._report_invalid_netmask(ip_str) + cls._report_invalid_netmask(ip_str) + + def __reduce__(self): + return self.__class__, (_compat_str(self),) class _BaseAddress(_IPAddressBase): @@ -625,28 +667,27 @@ class _BaseAddress(_IPAddressBase): used by single IP addresses. """ - def __init__(self, address): - if (not isinstance(address, bytes) - and '/' in str(address)): - raise AddressValueError("Unexpected '/' in %r" % address) + __slots__ = () def __int__(self): return self._ip def __eq__(self, other): try: - return (self._ip == other._ip - and self._version == other._version) + return (self._ip == other._ip and + self._version == other._version) except AttributeError: return NotImplemented def __lt__(self, other): - if self._version != other._version: - raise TypeError('%s and %s are not of the same version' % ( - self, other)) + if not isinstance(other, _IPAddressBase): + return NotImplemented if not isinstance(other, _BaseAddress): raise TypeError('%s and %s are not of the same type' % ( - self, other)) + self, other)) + if self._version != other._version: + raise TypeError('%s and %s are not of the same version' % ( + self, other)) if self._ip != other._ip: return self._ip < other._ip return False @@ -654,20 +695,20 @@ class _BaseAddress(_IPAddressBase): # Shorthand for Integer addition and subtraction. This is not # meant to ever support addition/subtraction of addresses. def __add__(self, other): - if not isinstance(other, int): + if not isinstance(other, _compat_int_types): return NotImplemented return self.__class__(int(self) + other) def __sub__(self, other): - if not isinstance(other, int): + if not isinstance(other, _compat_int_types): return NotImplemented return self.__class__(int(self) - other) def __repr__(self): - return '%s(%r)' % (self.__class__.__name__, str(self)) + return '%s(%r)' % (self.__class__.__name__, _compat_str(self)) def __str__(self): - return str(self._string_from_ip_int(self._ip)) + return _compat_str(self._string_from_ip_int(self._ip)) def __hash__(self): return hash(hex(int(self._ip))) @@ -675,6 +716,9 @@ class _BaseAddress(_IPAddressBase): def _get_address_key(self): return (self._version, self) + def __reduce__(self): + return self.__class__, (self._ip,) + class _BaseNetwork(_IPAddressBase): @@ -688,7 +732,7 @@ class _BaseNetwork(_IPAddressBase): self._cache = {} def __repr__(self): - return '%s(%r)' % (self.__class__.__name__, str(self)) + return '%s(%r)' % (self.__class__.__name__, _compat_str(self)) def __str__(self): return '%s/%d' % (self.network_address, self.prefixlen) @@ -702,13 +746,13 @@ class _BaseNetwork(_IPAddressBase): """ network = int(self.network_address) broadcast = int(self.broadcast_address) - for x in long_range(network + 1, broadcast): + for x in _compat_range(network + 1, broadcast): yield self._address_class(x) def __iter__(self): network = int(self.network_address) broadcast = int(self.broadcast_address) - for x in long_range(network, broadcast + 1): + for x in _compat_range(network, broadcast + 1): yield self._address_class(x) def __getitem__(self, n): @@ -716,21 +760,23 @@ class _BaseNetwork(_IPAddressBase): broadcast = int(self.broadcast_address) if n >= 0: if network + n > broadcast: - raise IndexError + raise IndexError('address out of range') return self._address_class(network + n) else: n += 1 if broadcast + n < network: - raise IndexError + raise 
IndexError('address out of range') return self._address_class(broadcast + n) def __lt__(self, other): - if self._version != other._version: - raise TypeError('%s and %s are not of the same version' % ( - self, other)) + if not isinstance(other, _IPAddressBase): + return NotImplemented if not isinstance(other, _BaseNetwork): raise TypeError('%s and %s are not of the same type' % ( - self, other)) + self, other)) + if self._version != other._version: + raise TypeError('%s and %s are not of the same version' % ( + self, other)) if self.network_address != other.network_address: return self.network_address < other.network_address if self.netmask != other.netmask: @@ -821,21 +867,21 @@ class _BaseNetwork(_IPAddressBase): addr1 = ip_network('192.0.2.0/28') addr2 = ip_network('192.0.2.1/32') - addr1.address_exclude(addr2) = + list(addr1.address_exclude(addr2)) = [IPv4Network('192.0.2.0/32'), IPv4Network('192.0.2.2/31'), - IPv4Network('192.0.2.4/30'), IPv4Network('192.0.2.8/29')] + IPv4Network('192.0.2.4/30'), IPv4Network('192.0.2.8/29')] or IPv6: addr1 = ip_network('2001:db8::1/32') addr2 = ip_network('2001:db8::1/128') - addr1.address_exclude(addr2) = + list(addr1.address_exclude(addr2)) = [ip_network('2001:db8::1/128'), - ip_network('2001:db8::2/127'), - ip_network('2001:db8::4/126'), - ip_network('2001:db8::8/125'), - ... - ip_network('2001:db8:8000::/33')] + ip_network('2001:db8::2/127'), + ip_network('2001:db8::4/126'), + ip_network('2001:db8::8/125'), + ... + ip_network('2001:db8:8000::/33')] Args: other: An IPv4Network or IPv6Network object of the same type. @@ -852,16 +898,15 @@ class _BaseNetwork(_IPAddressBase): """ if not self._version == other._version: raise TypeError("%s and %s are not of the same version" % ( - self, other)) + self, other)) if not isinstance(other, _BaseNetwork): raise TypeError("%s is not a network object" % other) - if not (other.network_address >= self.network_address and - other.broadcast_address <= self.broadcast_address): + if not other.subnet_of(self): raise ValueError('%s not contained in %s' % (other, self)) if other == self: - raise StopIteration + return # Make sure we're comparing the network of other. other = other.__class__('%s/%s' % (other.network_address, @@ -869,12 +914,10 @@ class _BaseNetwork(_IPAddressBase): s1, s2 = self.subnets() while s1 != other and s2 != other: - if (other.network_address >= s1.network_address and - other.broadcast_address <= s1.broadcast_address): + if other.subnet_of(s1): yield s2 s1, s2 = s1.subnets() - elif (other.network_address >= s2.network_address and - other.broadcast_address <= s2.broadcast_address): + elif other.subnet_of(s2): yield s1 s1, s2 = s2.subnets() else: @@ -927,7 +970,7 @@ class _BaseNetwork(_IPAddressBase): # does this need to raise a ValueError? 
if self._version != other._version: raise TypeError('%s and %s are not of the same type' % ( - self, other)) + self, other)) # self._version == other._version below here: if self.network_address < other.network_address: return -1 @@ -996,20 +1039,11 @@ class _BaseNetwork(_IPAddressBase): 'prefix length diff %d is invalid for netblock %s' % ( new_prefixlen, self)) - first = self.__class__('%s/%s' % - (self.network_address, - self._prefixlen + prefixlen_diff)) - - yield first - current = first - while True: - broadcast = current.broadcast_address - if broadcast == self.broadcast_address: - return - new_addr = self._address_class(int(broadcast) + 1) - current = self.__class__('%s/%s' % (new_addr, - new_prefixlen)) - + start = int(self.network_address) + end = int(self.broadcast_address) + 1 + step = (int(self.hostmask) + 1) >> prefixlen_diff + for new_addr in _compat_range(start, end, step): + current = self.__class__((new_addr, new_prefixlen)) yield current def supernet(self, prefixlen_diff=1, new_prefix=None): @@ -1043,15 +1077,14 @@ class _BaseNetwork(_IPAddressBase): raise ValueError('cannot set prefixlen_diff and new_prefix') prefixlen_diff = self._prefixlen - new_prefix - if self.prefixlen - prefixlen_diff < 0: + new_prefixlen = self.prefixlen - prefixlen_diff + if new_prefixlen < 0: raise ValueError( 'current prefixlen is %d, cannot have a prefixlen_diff of %d' % (self.prefixlen, prefixlen_diff)) - # TODO (pmoody): optimize this. - t = self.__class__('%s/%d' % (self.network_address, - self.prefixlen - prefixlen_diff), - strict=False) - return t.__class__('%s/%d' % (t.network_address, t.prefixlen)) + return self.__class__(( + int(self.network_address) & (int(self.netmask) << prefixlen_diff), + new_prefixlen)) @property def is_multicast(self): @@ -1065,6 +1098,26 @@ class _BaseNetwork(_IPAddressBase): return (self.network_address.is_multicast and self.broadcast_address.is_multicast) + @staticmethod + def _is_subnet_of(a, b): + try: + # Always false if one is v4 and the other is v6. + if a._version != b._version: + raise TypeError("%s and %s are not of the same version" % (a, b)) + return (b.network_address <= a.network_address and + b.broadcast_address >= a.broadcast_address) + except AttributeError: + raise TypeError("Unable to test subnet containment " + "between %s and %s" % (a, b)) + + def subnet_of(self, other): + """Return True if this network is a subnet of other.""" + return self._is_subnet_of(self, other) + + def supernet_of(self, other): + """Return True if this network is a supernet of other.""" + return self._is_subnet_of(other, self) + @property def is_reserved(self): """Test if the address is otherwise IETF reserved. @@ -1145,21 +1198,49 @@ class _BaseV4(object): """ + __slots__ = () + _version = 4 # Equivalent to 255.255.255.255 or 32 bits of 1's. - _ALL_ONES = (2**IPV4LENGTH) - 1 + _ALL_ONES = (2 ** IPV4LENGTH) - 1 _DECIMAL_DIGITS = frozenset('0123456789') # the valid octets for host and netmasks. only useful for IPv4. - _valid_mask_octets = frozenset((255, 254, 252, 248, 240, 224, 192, 128, 0)) + _valid_mask_octets = frozenset([255, 254, 252, 248, 240, 224, 192, 128, 0]) - def __init__(self, address): - self._version = 4 - self._max_prefixlen = IPV4LENGTH + _max_prefixlen = IPV4LENGTH + # There are only a handful of valid v4 netmasks, so we cache them all + # when constructed (see _make_netmask()). 
+ _netmask_cache = {} def _explode_shorthand_ip_string(self): - return str(self) + return _compat_str(self) + + @classmethod + def _make_netmask(cls, arg): + """Make a (netmask, prefix_len) tuple from the given argument. - def _ip_int_from_string(self, ip_str): + Argument can be: + - an integer (the prefix length) + - a string representing the prefix length (e.g. "24") + - a string representing the prefix netmask (e.g. "255.255.255.0") + """ + if arg not in cls._netmask_cache: + if isinstance(arg, _compat_int_types): + prefixlen = arg + else: + try: + # Check for a netmask in prefix length form + prefixlen = cls._prefix_from_prefix_string(arg) + except NetmaskValueError: + # Check for a netmask or hostmask in dotted-quad form. + # This may raise NetmaskValueError. + prefixlen = cls._prefix_from_ip_string(arg) + netmask = IPv4Address(cls._ip_int_from_prefix(prefixlen)) + cls._netmask_cache[arg] = netmask, prefixlen + return cls._netmask_cache[arg] + + @classmethod + def _ip_int_from_string(cls, ip_str): """Turn the given IP string into an integer for comparison. Args: @@ -1180,11 +1261,13 @@ class _BaseV4(object): raise AddressValueError("Expected 4 octets in %r" % ip_str) try: - return _int_from_bytes(map(self._parse_octet, octets), 'big') + return _compat_int_from_byte_vals( + map(cls._parse_octet, octets), 'big') except ValueError as exc: raise AddressValueError("%s in %r" % (exc, ip_str)) - def _parse_octet(self, octet_str): + @classmethod + def _parse_octet(cls, octet_str): """Convert a decimal octet into an integer. Args: @@ -1200,7 +1283,7 @@ class _BaseV4(object): if not octet_str: raise ValueError("Empty octet not permitted") # Whitelist the characters, since int() allows a lot of bizarre stuff. - if not self._DECIMAL_DIGITS.issuperset(octet_str): + if not cls._DECIMAL_DIGITS.issuperset(octet_str): msg = "Only decimal digits permitted in %r" raise ValueError(msg % octet_str) # We do the length check second, since the invalid character error @@ -1220,7 +1303,8 @@ class _BaseV4(object): raise ValueError("Octet %d (> 255) not permitted" % octet_int) return octet_int - def _string_from_ip_int(self, ip_int): + @classmethod + def _string_from_ip_int(cls, ip_int): """Turns a 32-bit integer into dotted decimal notation. Args: @@ -1230,38 +1314,10 @@ class _BaseV4(object): The IP address as a string in dotted decimal notation. """ - return '.'.join(map(str, _int_to_bytes(ip_int, 4, 'big'))) - - def _is_valid_netmask(self, netmask): - """Verify that the netmask is valid. - - Args: - netmask: A string, either a prefix or dotted decimal - netmask. - - Returns: - A boolean, True if the prefix represents a valid IPv4 - netmask. - - """ - mask = netmask.split('.') - if len(mask) == 4: - try: - for x in mask: - if int(x) not in self._valid_mask_octets: - return False - except ValueError: - # Found something that isn't an integer or isn't valid - return False - for idx, y in enumerate(mask): - if idx > 0 and y > mask[idx - 1]: - return False - return True - try: - netmask = int(netmask) - except ValueError: - return False - return 0 <= netmask <= self._max_prefixlen + return '.'.join(_compat_str(struct.unpack(b'!B', b)[0] + if isinstance(b, bytes) + else b) + for b in _compat_to_bytes(ip_int, 4, 'big')) def _is_hostmask(self, ip_str): """Test if the IP string is a hostmask (rather than a netmask). @@ -1284,6 +1340,15 @@ class _BaseV4(object): return True return False + def _reverse_pointer(self): + """Return the reverse DNS pointer name for the IPv4 address. 
+ + This implements the method described in RFC1035 3.5. + + """ + reverse_octets = _compat_str(self).split('.')[::-1] + return '.'.join(reverse_octets) + '.in-addr.arpa' + @property def max_prefixlen(self): return self._max_prefixlen @@ -1297,6 +1362,8 @@ class IPv4Address(_BaseV4, _BaseAddress): """Represent and manipulate single IPv4 Addresses.""" + __slots__ = ('_ip', '__weakref__') + def __init__(self, address): """ @@ -1313,11 +1380,8 @@ class IPv4Address(_BaseV4, _BaseAddress): AddressValueError: If ipaddress isn't a valid IPv4 address. """ - _BaseAddress.__init__(self, address) - _BaseV4.__init__(self, address) - # Efficient constructor from integer. - if isinstance(address, int): + if isinstance(address, _compat_int_types): self._check_int_address(address) self._ip = address return @@ -1325,12 +1389,15 @@ class IPv4Address(_BaseV4, _BaseAddress): # Constructing from a packed address if isinstance(address, bytes): self._check_packed_address(address, 4) - self._ip = _int_from_bytes(address, 'big') + bvs = _compat_bytes_to_byte_vals(address) + self._ip = _compat_int_from_byte_vals(bvs, 'big') return # Assume input argument to be string or any object representation # which converts into a formatted IP string. - addr_str = str(address) + addr_str = _compat_str(address) + if '/' in addr_str: + raise AddressValueError("Unexpected '/' in %r" % address) self._ip = self._ip_int_from_string(addr_str) @property @@ -1347,8 +1414,7 @@ class IPv4Address(_BaseV4, _BaseAddress): reserved IPv4 Network range. """ - reserved_network = IPv4Network('240.0.0.0/4') - return self in reserved_network + return self in self._constants._reserved_network @property def is_private(self): @@ -1359,21 +1425,13 @@ class IPv4Address(_BaseV4, _BaseAddress): iana-ipv4-special-registry. """ - return (self in IPv4Network('0.0.0.0/8') or - self in IPv4Network('10.0.0.0/8') or - self in IPv4Network('127.0.0.0/8') or - self in IPv4Network('169.254.0.0/16') or - self in IPv4Network('172.16.0.0/12') or - self in IPv4Network('192.0.0.0/29') or - self in IPv4Network('192.0.0.170/31') or - self in IPv4Network('192.0.2.0/24') or - self in IPv4Network('192.168.0.0/16') or - self in IPv4Network('198.18.0.0/15') or - self in IPv4Network('198.51.100.0/24') or - self in IPv4Network('203.0.113.0/24') or - self in IPv4Network('240.0.0.0/4') or - self in IPv4Network('255.255.255.255/32')) + return any(self in net for net in self._constants._private_networks) + @property + def is_global(self): + return ( + self not in self._constants._public_network and + not self.is_private) @property def is_multicast(self): @@ -1384,8 +1442,7 @@ class IPv4Address(_BaseV4, _BaseAddress): See RFC 3171 for details. """ - multicast_network = IPv4Network('224.0.0.0/4') - return self in multicast_network + return self in self._constants._multicast_network @property def is_unspecified(self): @@ -1396,8 +1453,7 @@ class IPv4Address(_BaseV4, _BaseAddress): RFC 5735 3. """ - unspecified_address = IPv4Address('0.0.0.0') - return self == unspecified_address + return self == self._constants._unspecified_address @property def is_loopback(self): @@ -1407,8 +1463,7 @@ class IPv4Address(_BaseV4, _BaseAddress): A boolean, True if the address is a loopback per RFC 3330. """ - loopback_network = IPv4Network('127.0.0.0/8') - return self in loopback_network + return self in self._constants._loopback_network @property def is_link_local(self): @@ -1418,19 +1473,30 @@ class IPv4Address(_BaseV4, _BaseAddress): A boolean, True if the address is link-local per RFC 3927. 
""" - linklocal_network = IPv4Network('169.254.0.0/16') - return self in linklocal_network + return self in self._constants._linklocal_network class IPv4Interface(IPv4Address): def __init__(self, address): - if isinstance(address, (bytes, int)): + if isinstance(address, (bytes, _compat_int_types)): IPv4Address.__init__(self, address) self.network = IPv4Network(self._ip) self._prefixlen = self._max_prefixlen return + if isinstance(address, tuple): + IPv4Address.__init__(self, address[0]) + if len(address) > 1: + self._prefixlen = int(address[1]) + else: + self._prefixlen = self._max_prefixlen + + self.network = IPv4Network(address, strict=False) + self.netmask = self.network.netmask + self.hostmask = self.network.hostmask + return + addr = _split_optional_netmask(address) IPv4Address.__init__(self, addr[0]) @@ -1461,7 +1527,8 @@ class IPv4Interface(IPv4Address): if address_less is NotImplemented: return NotImplemented try: - return self.network < other.network + return (self.network < other.network or + self.network == other.network and address_less) except AttributeError: # We *do* allow addresses and interfaces to be sorted. The # unassociated address is considered less than all interfaces. @@ -1470,6 +1537,8 @@ class IPv4Interface(IPv4Address): def __hash__(self): return self._ip ^ self._prefixlen ^ int(self.network.network_address) + __reduce__ = _IPAddressBase.__reduce__ + @property def ip(self): return IPv4Address(self._ip) @@ -1542,24 +1611,31 @@ class IPv4Network(_BaseV4, _BaseNetwork): supplied. """ - - _BaseV4.__init__(self, address) _BaseNetwork.__init__(self, address) - # Constructing from a packed address - if isinstance(address, bytes): + # Constructing from a packed address or integer + if isinstance(address, (_compat_int_types, bytes)): self.network_address = IPv4Address(address) - self._prefixlen = self._max_prefixlen - self.netmask = IPv4Address(self._ALL_ONES) - #fixme: address/network test here + self.netmask, self._prefixlen = self._make_netmask( + self._max_prefixlen) + # fixme: address/network test here. return - # Efficient constructor from integer. - if isinstance(address, int): - self.network_address = IPv4Address(address) - self._prefixlen = self._max_prefixlen - self.netmask = IPv4Address(self._ALL_ONES) - #fixme: address/network test here. + if isinstance(address, tuple): + if len(address) > 1: + arg = address[1] + else: + # We weren't given an address[1] + arg = self._max_prefixlen + self.network_address = IPv4Address(address[0]) + self.netmask, self._prefixlen = self._make_netmask(arg) + packed = int(self.network_address) + if packed & int(self.netmask) != packed: + if strict: + raise ValueError('%s has host bits set' % self) + else: + self.network_address = IPv4Address(packed & + int(self.netmask)) return # Assume input argument to be string or any object representation @@ -1568,20 +1644,14 @@ class IPv4Network(_BaseV4, _BaseNetwork): self.network_address = IPv4Address(self._ip_int_from_string(addr[0])) if len(addr) == 2: - try: - # Check for a netmask in prefix length form - self._prefixlen = self._prefix_from_prefix_string(addr[1]) - except NetmaskValueError: - # Check for a netmask or hostmask in dotted-quad form. - # This may raise NetmaskValueError. 
- self._prefixlen = self._prefix_from_ip_string(addr[1]) + arg = addr[1] else: - self._prefixlen = self._max_prefixlen - self.netmask = IPv4Address(self._ip_int_from_prefix(self._prefixlen)) + arg = self._max_prefixlen + self.netmask, self._prefixlen = self._make_netmask(arg) if strict: if (IPv4Address(int(self.network_address) & int(self.netmask)) != - self.network_address): + self.network_address): raise ValueError('%s has host bits set' % self) self.network_address = IPv4Address(int(self.network_address) & int(self.netmask)) @@ -1599,10 +1669,44 @@ class IPv4Network(_BaseV4, _BaseNetwork): """ return (not (self.network_address in IPv4Network('100.64.0.0/10') and - self.broadcast_address in IPv4Network('100.64.0.0/10')) and + self.broadcast_address in IPv4Network('100.64.0.0/10')) and not self.is_private) +class _IPv4Constants(object): + + _linklocal_network = IPv4Network('169.254.0.0/16') + + _loopback_network = IPv4Network('127.0.0.0/8') + + _multicast_network = IPv4Network('224.0.0.0/4') + + _public_network = IPv4Network('100.64.0.0/10') + + _private_networks = [ + IPv4Network('0.0.0.0/8'), + IPv4Network('10.0.0.0/8'), + IPv4Network('127.0.0.0/8'), + IPv4Network('169.254.0.0/16'), + IPv4Network('172.16.0.0/12'), + IPv4Network('192.0.0.0/29'), + IPv4Network('192.0.0.170/31'), + IPv4Network('192.0.2.0/24'), + IPv4Network('192.168.0.0/16'), + IPv4Network('198.18.0.0/15'), + IPv4Network('198.51.100.0/24'), + IPv4Network('203.0.113.0/24'), + IPv4Network('240.0.0.0/4'), + IPv4Network('255.255.255.255/32'), + ] + + _reserved_network = IPv4Network('240.0.0.0/4') + + _unspecified_address = IPv4Address('0.0.0.0') + + +IPv4Address._constants = _IPv4Constants + class _BaseV6(object): @@ -1613,15 +1717,37 @@ class _BaseV6(object): """ - _ALL_ONES = (2**IPV6LENGTH) - 1 + __slots__ = () + _version = 6 + _ALL_ONES = (2 ** IPV6LENGTH) - 1 _HEXTET_COUNT = 8 _HEX_DIGITS = frozenset('0123456789ABCDEFabcdef') + _max_prefixlen = IPV6LENGTH - def __init__(self, address): - self._version = 6 - self._max_prefixlen = IPV6LENGTH + # There are only a bunch of valid v6 netmasks, so we cache them all + # when constructed (see _make_netmask()). + _netmask_cache = {} + + @classmethod + def _make_netmask(cls, arg): + """Make a (netmask, prefix_len) tuple from the given argument. - def _ip_int_from_string(self, ip_str): + Argument can be: + - an integer (the prefix length) + - a string representing the prefix length (e.g. "24") + - a string representing the prefix netmask (e.g. "255.255.255.0") + """ + if arg not in cls._netmask_cache: + if isinstance(arg, _compat_int_types): + prefixlen = arg + else: + prefixlen = cls._prefix_from_prefix_string(arg) + netmask = IPv6Address(cls._ip_int_from_prefix(prefixlen)) + cls._netmask_cache[arg] = netmask, prefixlen + return cls._netmask_cache[arg] + + @classmethod + def _ip_int_from_string(cls, ip_str): """Turn an IPv6 ip_str into an integer. Args: @@ -1657,15 +1783,16 @@ class _BaseV6(object): # An IPv6 address can't have more than 8 colons (9 parts). # The extra colon comes from using the "::" notation for a single # leading or trailing zero part. - _max_parts = self._HEXTET_COUNT + 1 + _max_parts = cls._HEXTET_COUNT + 1 if len(parts) > _max_parts: - msg = "At most %d colons permitted in %r" % (_max_parts-1, ip_str) + msg = "At most %d colons permitted in %r" % ( + _max_parts - 1, ip_str) raise AddressValueError(msg) # Disregarding the endpoints, find '::' with nothing in between. # This indicates that a run of zeroes has been skipped. 
skip_index = None - for i in range(1, len(parts) - 1): + for i in _compat_range(1, len(parts) - 1): if not parts[i]: if skip_index is not None: # Can't have more than one '::' @@ -1689,17 +1816,17 @@ class _BaseV6(object): if parts_lo: msg = "Trailing ':' only permitted as part of '::' in %r" raise AddressValueError(msg % ip_str) # :$ requires ::$ - parts_skipped = self._HEXTET_COUNT - (parts_hi + parts_lo) + parts_skipped = cls._HEXTET_COUNT - (parts_hi + parts_lo) if parts_skipped < 1: msg = "Expected at most %d other parts with '::' in %r" - raise AddressValueError(msg % (self._HEXTET_COUNT-1, ip_str)) + raise AddressValueError(msg % (cls._HEXTET_COUNT - 1, ip_str)) else: # Otherwise, allocate the entire address to parts_hi. The # endpoints could still be empty, but _parse_hextet() will check # for that. - if len(parts) != self._HEXTET_COUNT: + if len(parts) != cls._HEXTET_COUNT: msg = "Exactly %d parts expected without '::' in %r" - raise AddressValueError(msg % (self._HEXTET_COUNT, ip_str)) + raise AddressValueError(msg % (cls._HEXTET_COUNT, ip_str)) if not parts[0]: msg = "Leading ':' only permitted as part of '::' in %r" raise AddressValueError(msg % ip_str) # ^: requires ^:: @@ -1715,16 +1842,17 @@ class _BaseV6(object): ip_int = 0 for i in range(parts_hi): ip_int <<= 16 - ip_int |= self._parse_hextet(parts[i]) + ip_int |= cls._parse_hextet(parts[i]) ip_int <<= 16 * parts_skipped for i in range(-parts_lo, 0): ip_int <<= 16 - ip_int |= self._parse_hextet(parts[i]) + ip_int |= cls._parse_hextet(parts[i]) return ip_int except ValueError as exc: raise AddressValueError("%s in %r" % (exc, ip_str)) - def _parse_hextet(self, hextet_str): + @classmethod + def _parse_hextet(cls, hextet_str): """Convert an IPv6 hextet string into an integer. Args: @@ -1739,7 +1867,7 @@ class _BaseV6(object): """ # Whitelist the characters, since int() allows a lot of bizarre stuff. - if not self._HEX_DIGITS.issuperset(hextet_str): + if not cls._HEX_DIGITS.issuperset(hextet_str): raise ValueError("Only hex digits permitted in %r" % hextet_str) # We do the length check second, since the invalid character error # is likely to be more informative for the user @@ -1749,7 +1877,8 @@ class _BaseV6(object): # Length check means we can skip checking the integer value return int(hextet_str, 16) - def _compress_hextets(self, hextets): + @classmethod + def _compress_hextets(cls, hextets): """Compresses a list of hextets. Compresses a list of strings, replacing the longest continuous @@ -1796,7 +1925,8 @@ class _BaseV6(object): return hextets - def _string_from_ip_int(self, ip_int=None): + @classmethod + def _string_from_ip_int(cls, ip_int=None): """Turns a 128-bit integer into hexadecimal notation. 
Args: @@ -1810,15 +1940,15 @@ class _BaseV6(object): """ if ip_int is None: - ip_int = int(self._ip) + ip_int = int(cls._ip) - if ip_int > self._ALL_ONES: + if ip_int > cls._ALL_ONES: raise ValueError('IPv6 address is too large') hex_str = '%032x' % ip_int - hextets = ['%x' % int(hex_str[x:x+4], 16) for x in range(0, 32, 4)] + hextets = ['%x' % int(hex_str[x:x + 4], 16) for x in range(0, 32, 4)] - hextets = self._compress_hextets(hextets) + hextets = cls._compress_hextets(hextets) return ':'.join(hextets) def _explode_shorthand_ip_string(self): @@ -1832,19 +1962,28 @@ class _BaseV6(object): """ if isinstance(self, IPv6Network): - ip_str = str(self.network_address) + ip_str = _compat_str(self.network_address) elif isinstance(self, IPv6Interface): - ip_str = str(self.ip) + ip_str = _compat_str(self.ip) else: - ip_str = str(self) + ip_str = _compat_str(self) ip_int = self._ip_int_from_string(ip_str) hex_str = '%032x' % ip_int - parts = [hex_str[x:x+4] for x in range(0, 32, 4)] + parts = [hex_str[x:x + 4] for x in range(0, 32, 4)] if isinstance(self, (_BaseNetwork, IPv6Interface)): return '%s/%d' % (':'.join(parts), self._prefixlen) return ':'.join(parts) + def _reverse_pointer(self): + """Return the reverse DNS pointer name for the IPv6 address. + + This implements the method described in RFC3596 2.5. + + """ + reverse_chars = self.exploded[::-1].replace(':', '') + return '.'.join(reverse_chars) + '.ip6.arpa' + @property def max_prefixlen(self): return self._max_prefixlen @@ -1858,6 +1997,8 @@ class IPv6Address(_BaseV6, _BaseAddress): """Represent and manipulate single IPv6 Addresses.""" + __slots__ = ('_ip', '__weakref__') + def __init__(self, address): """Instantiate a new IPv6 address object. @@ -1875,11 +2016,8 @@ class IPv6Address(_BaseV6, _BaseAddress): AddressValueError: If address isn't a valid IPv6 address. """ - _BaseAddress.__init__(self, address) - _BaseV6.__init__(self, address) - # Efficient constructor from integer. - if isinstance(address, int): + if isinstance(address, _compat_int_types): self._check_int_address(address) self._ip = address return @@ -1887,12 +2025,15 @@ class IPv6Address(_BaseV6, _BaseAddress): # Constructing from a packed address if isinstance(address, bytes): self._check_packed_address(address, 16) - self._ip = _int_from_bytes(address, 'big') + bvs = _compat_bytes_to_byte_vals(address) + self._ip = _compat_int_from_byte_vals(bvs, 'big') return # Assume input argument to be string or any object representation # which converts into a formatted IP string. - addr_str = str(address) + addr_str = _compat_str(address) + if '/' in addr_str: + raise AddressValueError("Unexpected '/' in %r" % address) self._ip = self._ip_int_from_string(addr_str) @property @@ -1909,8 +2050,7 @@ class IPv6Address(_BaseV6, _BaseAddress): See RFC 2373 2.7 for details. """ - multicast_network = IPv6Network('ff00::/8') - return self in multicast_network + return self in self._constants._multicast_network @property def is_reserved(self): @@ -1921,16 +2061,7 @@ class IPv6Address(_BaseV6, _BaseAddress): reserved IPv6 Network ranges. 
""" - reserved_networks = [IPv6Network('::/8'), IPv6Network('100::/8'), - IPv6Network('200::/7'), IPv6Network('400::/6'), - IPv6Network('800::/5'), IPv6Network('1000::/4'), - IPv6Network('4000::/3'), IPv6Network('6000::/3'), - IPv6Network('8000::/3'), IPv6Network('A000::/3'), - IPv6Network('C000::/3'), IPv6Network('E000::/4'), - IPv6Network('F000::/5'), IPv6Network('F800::/6'), - IPv6Network('FE00::/9')] - - return any(self in x for x in reserved_networks) + return any(self in x for x in self._constants._reserved_networks) @property def is_link_local(self): @@ -1940,8 +2071,7 @@ class IPv6Address(_BaseV6, _BaseAddress): A boolean, True if the address is reserved per RFC 4291. """ - linklocal_network = IPv6Network('fe80::/10') - return self in linklocal_network + return self in self._constants._linklocal_network @property def is_site_local(self): @@ -1955,8 +2085,7 @@ class IPv6Address(_BaseV6, _BaseAddress): A boolean, True if the address is reserved per RFC 3513 2.5.6. """ - sitelocal_network = IPv6Network('fec0::/10') - return self in sitelocal_network + return self in self._constants._sitelocal_network @property def is_private(self): @@ -1967,16 +2096,7 @@ class IPv6Address(_BaseV6, _BaseAddress): iana-ipv6-special-registry. """ - return (self in IPv6Network('::1/128') or - self in IPv6Network('::/128') or - self in IPv6Network('::ffff:0:0/96') or - self in IPv6Network('100::/64') or - self in IPv6Network('2001::/23') or - self in IPv6Network('2001:2::/48') or - self in IPv6Network('2001:db8::/32') or - self in IPv6Network('2001:10::/28') or - self in IPv6Network('fc00::/7') or - self in IPv6Network('fe80::/10')) + return any(self in net for net in self._constants._private_networks) @property def is_global(self): @@ -2056,11 +2176,21 @@ class IPv6Address(_BaseV6, _BaseAddress): class IPv6Interface(IPv6Address): def __init__(self, address): - if isinstance(address, (bytes, int)): + if isinstance(address, (bytes, _compat_int_types)): IPv6Address.__init__(self, address) self.network = IPv6Network(self._ip) self._prefixlen = self._max_prefixlen return + if isinstance(address, tuple): + IPv6Address.__init__(self, address[0]) + if len(address) > 1: + self._prefixlen = int(address[1]) + else: + self._prefixlen = self._max_prefixlen + self.network = IPv6Network(address, strict=False) + self.netmask = self.network.netmask + self.hostmask = self.network.hostmask + return addr = _split_optional_netmask(address) IPv6Address.__init__(self, addr[0]) @@ -2090,7 +2220,8 @@ class IPv6Interface(IPv6Address): if address_less is NotImplemented: return NotImplemented try: - return self.network < other.network + return (self.network < other.network or + self.network == other.network and address_less) except AttributeError: # We *do* allow addresses and interfaces to be sorted. The # unassociated address is considered less than all interfaces. @@ -2099,6 +2230,8 @@ class IPv6Interface(IPv6Address): def __hash__(self): return self._ip ^ self._prefixlen ^ int(self.network.network_address) + __reduce__ = _IPAddressBase.__reduce__ + @property def ip(self): return IPv6Address(self._ip) @@ -2175,21 +2308,29 @@ class IPv6Network(_BaseV6, _BaseNetwork): supplied. """ - _BaseV6.__init__(self, address) _BaseNetwork.__init__(self, address) - # Efficient constructor from integer. 
- if isinstance(address, int): + # Efficient constructor from integer or packed address + if isinstance(address, (bytes, _compat_int_types)): self.network_address = IPv6Address(address) - self._prefixlen = self._max_prefixlen - self.netmask = IPv6Address(self._ALL_ONES) + self.netmask, self._prefixlen = self._make_netmask( + self._max_prefixlen) return - # Constructing from a packed address - if isinstance(address, bytes): - self.network_address = IPv6Address(address) - self._prefixlen = self._max_prefixlen - self.netmask = IPv6Address(self._ALL_ONES) + if isinstance(address, tuple): + if len(address) > 1: + arg = address[1] + else: + arg = self._max_prefixlen + self.netmask, self._prefixlen = self._make_netmask(arg) + self.network_address = IPv6Address(address[0]) + packed = int(self.network_address) + if packed & int(self.netmask) != packed: + if strict: + raise ValueError('%s has host bits set' % self) + else: + self.network_address = IPv6Address(packed & + int(self.netmask)) return # Assume input argument to be string or any object representation @@ -2199,15 +2340,14 @@ class IPv6Network(_BaseV6, _BaseNetwork): self.network_address = IPv6Address(self._ip_int_from_string(addr[0])) if len(addr) == 2: - # This may raise NetmaskValueError - self._prefixlen = self._prefix_from_prefix_string(addr[1]) + arg = addr[1] else: - self._prefixlen = self._max_prefixlen + arg = self._max_prefixlen + self.netmask, self._prefixlen = self._make_netmask(arg) - self.netmask = IPv6Address(self._ip_int_from_prefix(self._prefixlen)) if strict: if (IPv6Address(int(self.network_address) & int(self.netmask)) != - self.network_address): + self.network_address): raise ValueError('%s has host bits set' % self) self.network_address = IPv6Address(int(self.network_address) & int(self.netmask)) @@ -2215,6 +2355,18 @@ class IPv6Network(_BaseV6, _BaseNetwork): if self._prefixlen == (self._max_prefixlen - 1): self.hosts = self.__iter__ + def hosts(self): + """Generate Iterator over usable hosts in a network. + + This is like __iter__ except it doesn't return the + Subnet-Router anycast address. + + """ + network = int(self.network_address) + broadcast = int(self.broadcast_address) + for x in _compat_range(network + 1, broadcast + 1): + yield self._address_class(x) + @property def is_site_local(self): """Test if the address is reserved for site-local. 
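The rewritten IPv6Network constructor above mirrors the IPv4Network changes earlier in this diff: a (address, prefix) tuple is now accepted alongside strings, integers and packed bytes, netmasks come from the shared _make_netmask() cache, and the new hosts() override skips only the Subnet-Router anycast address. A minimal usage sketch against the patched backport (illustrative only, not part of the commit; the import name and the u'' literals are assumptions, the latter because the Python 2 path treats a plain str as a packed address):

# Sketch: constructor forms handled by the new code paths above.
import ipaddress  # the vendored backport this hunk patches

# A (address, prefix) tuple is equivalent to the usual string form:
net = ipaddress.IPv6Network((u'2001:db8::', 32))
assert net == ipaddress.IPv6Network(u'2001:db8::/32')

# strict=False masks host bits away instead of raising ValueError:
loose = ipaddress.IPv6Network((u'2001:db8::1', 32), strict=False)
assert loose.network_address == ipaddress.IPv6Address(u'2001:db8::')

# hosts() now yields every address except the Subnet-Router anycast:
tiny = ipaddress.IPv6Network(u'2001:db8::/126')
assert [str(h) for h in tiny.hosts()] == ['2001:db8::1', '2001:db8::2', '2001:db8::3']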
@@ -2229,3 +2381,39 @@ class IPv6Network(_BaseV6, _BaseNetwork): """ return (self.network_address.is_site_local and self.broadcast_address.is_site_local) + + +class _IPv6Constants(object): + + _linklocal_network = IPv6Network('fe80::/10') + + _multicast_network = IPv6Network('ff00::/8') + + _private_networks = [ + IPv6Network('::1/128'), + IPv6Network('::/128'), + IPv6Network('::ffff:0:0/96'), + IPv6Network('100::/64'), + IPv6Network('2001::/23'), + IPv6Network('2001:2::/48'), + IPv6Network('2001:db8::/32'), + IPv6Network('2001:10::/28'), + IPv6Network('fc00::/7'), + IPv6Network('fe80::/10'), + ] + + _reserved_networks = [ + IPv6Network('::/8'), IPv6Network('100::/8'), + IPv6Network('200::/7'), IPv6Network('400::/6'), + IPv6Network('800::/5'), IPv6Network('1000::/4'), + IPv6Network('4000::/3'), IPv6Network('6000::/3'), + IPv6Network('8000::/3'), IPv6Network('A000::/3'), + IPv6Network('C000::/3'), IPv6Network('E000::/4'), + IPv6Network('F000::/5'), IPv6Network('F800::/6'), + IPv6Network('FE00::/9'), + ] + + _sitelocal_network = IPv6Network('fec0::/10') + + +IPv6Address._constants = _IPv6Constants diff --git a/libs/libfilebot/__init__.py b/libs/libfilebot/__init__.py index c734afaef..4a81ddff2 100644 --- a/libs/libfilebot/__init__.py +++ b/libs/libfilebot/__init__.py @@ -1,5 +1,6 @@ # coding=utf-8 -from main import get_filebot_attrs +from __future__ import absolute_import +from .main import get_filebot_attrs __all__ = ["get_filebot_attrs"] diff --git a/libs/libfilebot/lib.py b/libs/libfilebot/lib.py index ac7e469b0..7820bdc26 100644 --- a/libs/libfilebot/lib.py +++ b/libs/libfilebot/lib.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os import sys diff --git a/libs/libfilebot/main.py b/libs/libfilebot/main.py index 9a4e685eb..3e1333e0b 100644 --- a/libs/libfilebot/main.py +++ b/libs/libfilebot/main.py @@ -1,5 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import print_function import subprocess import sys import traceback @@ -10,7 +12,7 @@ import types import os from pipes import quote -from lib import find_executable +from .lib import find_executable mswindows = False if sys.platform == "win32": @@ -87,7 +89,7 @@ def get_filebot_attrs(fn): args_func, match_func = XATTR_MAP.get(sys.platform, XATTR_MAP["default"]) args = args_func(fn) - if isinstance(args, types.ListType): + if isinstance(args, list): try: env = dict(os.environ) if not mswindows: @@ -132,4 +134,4 @@ def get_filebot_attrs(fn): if __name__ == "__main__": - print get_filebot_attrs(sys.argv[1]) + print(get_filebot_attrs(sys.argv[1])) diff --git a/libs/pretty/__init__.py b/libs/pretty/__init__.py index 748623b6a..424d1537c 100644 --- a/libs/pretty/__init__.py +++ b/libs/pretty/__init__.py @@ -8,10 +8,16 @@ __copyright__ = "Copyright 2010, S Anand" __license__ = "WTFPL" from datetime import datetime +from six import PY3 def _df(seconds, denominator=1, text='', past=True): - if past: return str((seconds + denominator/2)/ denominator) + text + ' ago' - else: return 'in ' + str((seconds + denominator/2)/ denominator) + text + if PY3: + result = str(round(seconds / denominator)) + else: + result = str((seconds + denominator/2)/ denominator) + + if past: return result + text + ' ago' + else: return 'in ' + result + text def date(time=False, asdays=False, short=False): '''Returns a pretty formatted date. 
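One behavioural note on the pretty._df change just above: the old Python 2 expression (seconds + denominator/2) / denominator rounded halves upward through integer arithmetic, while the new PY3 branch delegates to round(), which rounds exact halves to the nearest even integer. A small sketch of the difference, illustrative only and written for Python 3:

# The two branches of _df agree except on exact .5 boundaries.
seconds, denominator = 9000, 3600  # 2.5 hours

py2_branch = (seconds + denominator // 2) // denominator  # old behaviour: 3
py3_branch = round(seconds / denominator)                 # new behaviour: 2 (banker's rounding)

assert py2_branch == 3 and py3_branch == 2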
diff --git a/libs/pyprobe/__init__.py b/libs/pyprobe/__init__.py index b8a1da01a..14e25f36e 100644 --- a/libs/pyprobe/__init__.py +++ b/libs/pyprobe/__init__.py @@ -1,2 +1,2 @@ -from pyprobe import VideoFileParser +from .pyprobe import VideoFileParser diff --git a/libs/pyprobe/ffprobeparsers.py b/libs/pyprobe/ffprobeparsers.py index 64ad05d10..b3627816c 100644 --- a/libs/pyprobe/ffprobeparsers.py +++ b/libs/pyprobe/ffprobeparsers.py @@ -1,6 +1,7 @@ +from __future__ import absolute_import from os import path -from baseparser import BaseParser +from .baseparser import BaseParser class StreamParser(BaseParser): diff --git a/libs/pyprobe/pyprobe.py b/libs/pyprobe/pyprobe.py index bb63fb0e6..b300965d5 100644 --- a/libs/pyprobe/pyprobe.py +++ b/libs/pyprobe/pyprobe.py @@ -1,9 +1,11 @@ +from __future__ import absolute_import +from six import PY3 import json import subprocess from os import path from sys import getfilesystemencoding -import ffprobeparsers +from . import ffprobeparsers class VideoFileParser: @@ -174,16 +176,27 @@ class VideoFileParser: IOError: ffprobe execution failed """ - command = [parser] + commandArgs + [inputFile.encode(getfilesystemencoding())] - try: - completedProcess = subprocess.check_output( - command, stderr=subprocess.STDOUT - ) - except subprocess.CalledProcessError as e: - raise IOError( - "Error occurred during execution - " + e.output + if PY3: + command = [parser] + commandArgs + [inputFile] + completedProcess = subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8" ) - return completedProcess + if completedProcess.returncode != 0: + raise IOError( + "Error occurred during execution - " + completedProcess.stderr + ) + return completedProcess.stdout + else: + command = [parser] + commandArgs + [inputFile.encode(getfilesystemencoding())] + try: + completedProcess = subprocess.check_output( + command, stderr=subprocess.STDOUT + ) + except subprocess.CalledProcessError as e: + raise IOError( + "Error occurred during execution - " + e.output + ) + return completedProcess @staticmethod def _checkExecutable(executable): diff --git a/libs/pysubs2/common.py b/libs/pysubs2/common.py index 2f95ccf44..4688e5df4 100644 --- a/libs/pysubs2/common.py +++ b/libs/pysubs2/common.py @@ -17,7 +17,7 @@ class Color(_Color): return _Color.__new__(cls, r, g, b, a) #: Version of the pysubs2 library. -VERSION = "0.2.3" +VERSION = "0.2.4" PY3 = sys.version_info.major == 3 diff --git a/libs/pysubs2/formats.py b/libs/pysubs2/formats.py index 5c25a6e96..869a3b6c7 100644 --- a/libs/pysubs2/formats.py +++ b/libs/pysubs2/formats.py @@ -4,6 +4,7 @@ from .subrip import SubripFormat from .jsonformat import JSONFormat from .substation import SubstationFormat from .mpl2 import MPL2Format +from .tmp import TmpFormat from .exceptions import * #: Dict mapping file extensions to format identifiers. @@ -13,6 +14,7 @@ FILE_EXTENSION_TO_FORMAT_IDENTIFIER = { ".ssa": "ssa", ".sub": "microdvd", ".json": "json", + ".txt": "tmp", } #: Dict mapping format identifiers to implementations (FormatBase subclasses). 
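With this pair of formats.py hunks (the extension map above and the class map just below), the new TMP reader added in libs/pysubs2/tmp.py later in this diff becomes reachable through the ordinary pysubs2 entry points. A hypothetical round trip, with the file name and contents invented for illustration:

# Sketch: TMP lines are 'H:MM:SS:text'; the parser synthesizes end times.
import pysubs2

with open('example.txt', 'w') as f:
    f.write('0:00:01:Hello|world\n')

subs = pysubs2.load('example.txt')      # content autodetects as 'tmp'
assert subs[0].start == 1000            # 0:00:01 in milliseconds
assert subs[0].text == 'Hello\\Nworld'  # '|' becomes an SSA line break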
@@ -23,6 +25,7 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
 "microdvd": MicroDVDFormat,
 "json": JSONFormat,
 "mpl2": MPL2Format,
+ "tmp": TmpFormat,
 }
 def get_format_class(format_):
diff --git a/libs/pysubs2/ssafile.py b/libs/pysubs2/ssafile.py
index c6a668439..390a31b54 100644
--- a/libs/pysubs2/ssafile.py
+++ b/libs/pysubs2/ssafile.py
@@ -66,7 +66,14 @@ class SSAFile(MutableSequence):
 be detected from the file, in which case you don't
 need to specify it here (when given, this argument
 overrides autodetection).
- kwargs: Extra options for the parser.
+ keep_unknown_html_tags (bool): This affects SubRip only (SRT),
+ for other formats this argument is ignored.
+ By default, HTML tags are converted to equivalent SubStation tags
+ (e.g. ``<i>`` to ``{\\i1}``) and any remaining tags are removed
+ to keep the text clean. Set this parameter to ``True``
+ if you want to pass through these tags (e.g. ``<sub>``).
+ This is useful if your output format is SRT and your player
+ supports these tags.
 Returns:
 SSAFile
@@ -86,6 +93,7 @@ class SSAFile(MutableSequence):
 Example:
 >>> subs1 = pysubs2.load("subrip-subtitles.srt")
 >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976)
+ >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True)
 """
 with open(path, encoding=encoding) as fp:
diff --git a/libs/pysubs2/ssastyle.py b/libs/pysubs2/ssastyle.py
index 2fcadc7ed..eb59b74b5 100644
--- a/libs/pysubs2/ssastyle.py
+++ b/libs/pysubs2/ssastyle.py
@@ -56,7 +56,7 @@ class SSAStyle(object):
 self.encoding = 1 #: Charset
 for k, v in fields.items():
- if k in self.FIELDS and v is not None:
+ if k in self.FIELDS:
 setattr(self, k, v)
 else:
 raise ValueError("SSAStyle has no field named %r" % k)
diff --git a/libs/pysubs2/subrip.py b/libs/pysubs2/subrip.py
index fea4eade6..70cb96fe5 100644
--- a/libs/pysubs2/subrip.py
+++ b/libs/pysubs2/subrip.py
@@ -31,7 +31,7 @@ class SubripFormat(FormatBase):
 return "srt"
 @classmethod
- def from_file(cls, subs, fp, format_, **kwargs):
+ def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
 timestamps = [] # (start, end)
 following_lines = [] # contains lists of lines following each timestamp
@@ -56,15 +56,15 @@ class SubripFormat(FormatBase):
 # Handle the general case.
 s = "".join(lines).strip()
 s = re.sub(r"\n+ *\d+ *$", "", s) # strip number of next subtitle
- s = re.sub(r"< *i *>", r"{\i1}", s)
- s = re.sub(r"< */ *i *>", r"{\i0}", s)
- s = re.sub(r"< *s *>", r"{\s1}", s)
- s = re.sub(r"< */ *s *>", r"{\s0}", s)
- s = re.sub(r"< *u *>", "{\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
- s = re.sub(r"< */ *u *>", "{\\u0}", s)
- s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
- s = re.sub(r"\r", "", s) # convert newlines
- s = re.sub(r"\n", r"\N", s) # convert newlines
+ s = re.sub(r"< *i *>", r"{\\i1}", s)
+ s = re.sub(r"< */ *i *>", r"{\\i0}", s)
+ s = re.sub(r"< *s *>", r"{\\s1}", s)
+ s = re.sub(r"< */ *s *>", r"{\\s0}", s)
+ s = re.sub(r"< *u *>", "{\\\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
+ s = re.sub(r"< */ *u *>", "{\\\\u0}", s)
+ if not keep_unknown_html_tags:
+ s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
+ s = re.sub(r"\n", r"\\N", s) # convert newlines
 return s
 subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
diff --git a/libs/pysubs2/substation.py b/libs/pysubs2/substation.py
index f810a4776..8563f8a0d 100644
--- a/libs/pysubs2/substation.py
+++ b/libs/pysubs2/substation.py
@@ -145,7 +145,12 @@ class SubstationFormat(FormatBase):
 def string_to_field(f, v):
 if f in {"start", "end"}:
- return timestamp_to_ms(TIMESTAMP.match(v).groups())
+ if v.startswith("-"):
+ # handle negative timestamps
+ v = v[1:]
+ return -timestamp_to_ms(TIMESTAMP.match(v).groups())
+ else:
+ return timestamp_to_ms(TIMESTAMP.match(v).groups())
 elif "color" in f:
 if format_ == "ass":
 return ass_rgba_to_color(v)
@@ -184,22 +189,22 @@ class SubstationFormat(FormatBase):
 elif inside_info_section or inside_aegisub_section:
 if line.startswith(";"): continue # skip comments
 try:
- k, v = line.split(": ", 1)
+ k, v = line.split(":", 1)
 if inside_info_section:
- subs.info[k] = v
+ subs.info[k] = v.strip()
 elif inside_aegisub_section:
- subs.aegisub_project[k] = v
+ subs.aegisub_project[k] = v.strip()
 except ValueError:
 pass
 elif line.startswith("Style:"):
- _, rest = line.split(": ", 1)
+ _, rest = line.split(":", 1)
 buf = rest.strip().split(",")
 name, raw_fields = buf[0], buf[1:] # splat workaround for Python 2.7
 field_dict = {f: string_to_field(f, v) for f, v in zip(STYLE_FIELDS[format_], raw_fields)}
 sty = SSAStyle(**field_dict)
 subs.styles[name] = sty
 elif line.startswith("Dialogue:") or line.startswith("Comment:"):
- ev_type, rest = line.split(": ", 1)
+ ev_type, rest = line.split(":", 1)
 raw_fields = rest.strip().split(",", len(EVENT_FIELDS[format_])-1)
 field_dict = {f: string_to_field(f, v) for f, v in zip(EVENT_FIELDS[format_], raw_fields)}
 field_dict["type"] = ev_type
diff --git a/libs/pysubs2/time.py b/libs/pysubs2/time.py
index 46d349f85..24e9ec077 100644
--- a/libs/pysubs2/time.py
+++ b/libs/pysubs2/time.py
@@ -49,6 +49,20 @@ def timestamp_to_ms(groups):
 ms += h * 3600000
 return ms
+def tmptimestamp_to_ms(groups):
+ """
+ Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.
+
+ Example:
+ >>> tmptimestamp_to_ms(TMPTIMESTAMP.match("0:00:01").groups())
+ 1000
+
+ """
+ h, m, s = map(int, groups)
+ ms = s * 1000
+ ms += m * 60000
+ ms += h * 3600000
+ return ms
 def times_to_ms(h=0, m=0, s=0, ms=0):
 """
 Convert hours, minutes, seconds to milliseconds.
diff --git a/libs/pysubs2/tmp.py b/libs/pysubs2/tmp.py
new file mode 100644
index 000000000..aae55202c
--- /dev/null
+++ b/libs/pysubs2/tmp.py
@@ -0,0 +1,88 @@
+from __future__ import print_function, unicode_literals
+
+import re
+from .formatbase import FormatBase
+from .ssaevent import SSAEvent
+from .ssastyle import SSAStyle
+from .substation import parse_tags
+from .time import ms_to_times, make_time, tmptimestamp_to_ms
+
+#: Pattern that matches TMP timestamp
+TMPTIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})")
+#: Pattern that matches TMP line
+TMP_LINE = re.compile(r"(\d{1,2}:\d{2}:\d{2}):(.+)")
+
+#: Largest timestamp allowed in TMP, i.e. 99:59:59.
+MAX_REPRESENTABLE_TIME = make_time(h=100) - 1
+
+def ms_to_timestamp(ms):
+ """Convert ms to 'HH:MM:SS'"""
+ # XXX throw on overflow/underflow?
+ if ms < 0: ms = 0
+ if ms > MAX_REPRESENTABLE_TIME: ms = MAX_REPRESENTABLE_TIME
+ h, m, s, ms = ms_to_times(ms)
+ return "%02d:%02d:%02d" % (h, m, s)
+
+
+class TmpFormat(FormatBase):
+ @classmethod
+ def guess_format(cls, text):
+ if "[Script Info]" in text or "[V4+ Styles]" in text:
+ # disambiguation vs. SSA/ASS
+ return None
+
+ for line in text.splitlines():
+ if TMP_LINE.match(line) and len(TMP_LINE.findall(line)) == 1:
+ return "tmp"
+
+ @classmethod
+ def from_file(cls, subs, fp, format_, **kwargs):
+ timestamps = [] # (start, end)
+ lines = [] # text following each timestamp
+
+ for line in fp:
+ match = TMP_LINE.match(line)
+ if not match:
+ continue
+
+ start, text = match.groups()
+ start = tmptimestamp_to_ms(TMPTIMESTAMP.match(start).groups())
+ # calculate end time as start time + 500 milliseconds + 67 milliseconds per character (15 chars per second)
+ end = start + 500 + (len(text) * 67)
+ timestamps.append((start, end))
+ lines.append(text)
+
+ def prepare_text(lines):
+ lines = lines.replace("|", r"\N") # convert newlines
+ lines = re.sub(r"< *u *>", "{\\\\u1}", lines) # not r" for Python 2.7 compat, triggers unicodeescape
+ lines = re.sub(r"< */? *[a-zA-Z][^>]*>", "", lines) # strip other HTML tags
+ return lines
+
+ subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
+ for (start, end), lines in zip(timestamps, lines)]
+
+ @classmethod
+ def to_file(cls, subs, fp, format_, **kwargs):
+ def prepare_text(text, style):
+ body = []
+ for fragment, sty in parse_tags(text, style, subs.styles):
+ fragment = fragment.replace(r"\h", " ")
+ fragment = fragment.replace(r"\n", "\n")
+ fragment = fragment.replace(r"\N", "\n")
+ if sty.italic: fragment = "<i>%s</i>" % fragment
+ if sty.underline: fragment = "<u>%s</u>" % fragment
+ if sty.strikeout: fragment = "<s>%s</s>" % fragment
+ body.append(fragment)
+
+ return re.sub("\n+", "\n", "".join(body).strip())
+
+ visible_lines = (line for line in subs if not line.is_comment)
+
+ for i, line in enumerate(visible_lines, 1):
+ start = ms_to_timestamp(line.start)
+ #end = ms_to_timestamp(line.end)
+ text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
+
+ #print("%d" % i, file=fp) # Python 2.7 compat
+ print(start + ":" + text, end="\n", file=fp)
+ #print(text, end="\n\n", file=fp)
diff --git a/libs/pysubs2/txt_generic.py b/libs/pysubs2/txt_generic.py
deleted file mode 100644
index 70bf3e31c..000000000
--- a/libs/pysubs2/txt_generic.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# coding=utf-8
-
-from __future__ import print_function, division, unicode_literals
-import re
-from numbers import Number
-
-from pysubs2.time import times_to_ms
-from .formatbase import FormatBase
-from .ssaevent import SSAEvent
-from .ssastyle import SSAStyle
-
-
-# thanks to http://otsaloma.io/gaupol/doc/api/aeidon.files.mpl2_source.html
-MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*?)$")
-
-
-class TXTGenericFormat(FormatBase):
- @classmethod
- def guess_format(cls, text):
- if MPL2_FORMAT.match(text):
- return "mpl2"
-
-
-class MPL2Format(FormatBase):
- @classmethod
- def guess_format(cls, text):
- return TXTGenericFormat.guess_format(text)
-
- @classmethod
- def from_file(cls, subs, fp, format_, **kwargs):
- def prepare_text(lines):
- out = []
- for s in lines.split("|"):
- if s.startswith("/"):
- out.append(r"{\i1}%s{\i0}" % s[1:])
- continue
- out.append(s)
- return "\n".join(out)
-
- subs.events = [SSAEvent(start=times_to_ms(s=float(start) / 10), end=times_to_ms(s=float(end) / 10),
- 
text=prepare_text(text)) for start, end, text in MPL2_FORMAT.findall(fp.getvalue())] - - @classmethod - def to_file(cls, subs, fp, format_, **kwargs): - raise NotImplemented diff --git a/libs/requests/__init__.py b/libs/requests/__init__.py index bc168ee53..9a899df67 100644 --- a/libs/requests/__init__.py +++ b/libs/requests/__init__.py @@ -57,10 +57,10 @@ def check_compatibility(urllib3_version, chardet_version): # Check urllib3 for compatibility. major, minor, patch = urllib3_version # noqa: F811 major, minor, patch = int(major), int(minor), int(patch) - # urllib3 >= 1.21.1, <= 1.24 + # urllib3 >= 1.21.1, <= 1.25 assert major == 1 assert minor >= 21 - assert minor <= 24 + assert minor <= 25 # Check chardet for compatibility. major, minor, patch = chardet_version.split('.')[:3] diff --git a/libs/requests/__version__.py b/libs/requests/__version__.py index f5b5d0367..9844f740a 100644 --- a/libs/requests/__version__.py +++ b/libs/requests/__version__.py @@ -5,10 +5,10 @@ __title__ = 'requests' __description__ = 'Python HTTP for Humans.' __url__ = 'http://python-requests.org' -__version__ = '2.21.0' -__build__ = 0x022100 +__version__ = '2.22.0' +__build__ = 0x022200 __author__ = 'Kenneth Reitz' __author_email__ = '[email protected]' __license__ = 'Apache 2.0' -__copyright__ = 'Copyright 2018 Kenneth Reitz' +__copyright__ = 'Copyright 2019 Kenneth Reitz' __cake__ = u'\u2728 \U0001f370 \u2728' diff --git a/libs/requests/api.py b/libs/requests/api.py index abada96d4..ef71d0759 100644 --- a/libs/requests/api.py +++ b/libs/requests/api.py @@ -19,7 +19,7 @@ def request(method, url, **kwargs): :param method: method for the new :class:`Request` object. :param url: URL for the new :class:`Request` object. :param params: (optional) Dictionary, list of tuples or bytes to send - in the body of the :class:`Request`. + in the query string for the :class:`Request`. :param data: (optional) Dictionary, list of tuples, bytes, or file-like object to send in the body of the :class:`Request`. :param json: (optional) A JSON serializable Python object to send in the body of the :class:`Request`. @@ -65,7 +65,7 @@ def get(url, params=None, **kwargs): :param url: URL for the new :class:`Request` object. :param params: (optional) Dictionary, list of tuples or bytes to send - in the body of the :class:`Request`. + in the query string for the :class:`Request`. :param \*\*kwargs: Optional arguments that ``request`` takes. :return: :class:`Response <Response>` object :rtype: requests.Response diff --git a/libs/simpleconfigparser/__init__.py b/libs/simpleconfigparser/__init__.py index 1a515024c..959ebda6a 100644 --- a/libs/simpleconfigparser/__init__.py +++ b/libs/simpleconfigparser/__init__.py @@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -# Bazarr patch to use custom ConfigParser2: -from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError -#try: -# from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError -#except ImportError: -# from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError +try: + from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError +except ImportError: + from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError class simpleconfigparser(configparser): diff --git a/libs/six.py b/libs/six.py index 6bf4fd381..89b2188fd 100644 --- a/libs/six.py +++ b/libs/six.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2017 Benjamin Peterson +# Copyright (c) 2010-2018 Benjamin Peterson # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ import sys import types __author__ = "Benjamin Peterson <[email protected]>" -__version__ = "1.11.0" +__version__ = "1.12.0" # Useful for very coarse version differentiation. @@ -844,10 +844,71 @@ def add_metaclass(metaclass): orig_vars.pop(slots_var) orig_vars.pop('__dict__', None) orig_vars.pop('__weakref__', None) + if hasattr(cls, '__qualname__'): + orig_vars['__qualname__'] = cls.__qualname__ return metaclass(cls.__name__, cls.__bases__, orig_vars) return wrapper +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, text_type): + return s.encode(encoding, errors) + elif isinstance(s, binary_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + if PY2 and isinstance(s, text_type): + s = s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + s = s.decode(encoding, errors) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. + + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + + def python_2_unicode_compatible(klass): """ A decorator that defines __unicode__ and __str__ methods under Python 2. diff --git a/libs/soupsieve/__init__.py b/libs/soupsieve/__init__.py new file mode 100644 index 000000000..ebd3a4a43 --- /dev/null +++ b/libs/soupsieve/__init__.py @@ -0,0 +1,127 @@ +""" +Soup Sieve. + +A CSS selector filter for BeautifulSoup4. 
+ +MIT License + +Copyright (c) 2018 Isaac Muse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from __future__ import unicode_literals +from .__meta__ import __version__, __version_info__ # noqa: F401 +from . import css_parser as cp +from . import css_match as cm +from . import css_types as ct +from .util import DEBUG, deprecated, SelectorSyntaxError # noqa: F401 + +__all__ = ( + 'DEBUG', 'SelectorSyntaxError', 'SoupSieve', + 'closest', 'comments', 'compile', 'filter', 'icomments', + 'iselect', 'match', 'select', 'select_one' +) + +SoupSieve = cm.SoupSieve + + +def compile(pattern, namespaces=None, flags=0, **kwargs): # noqa: A001 + """Compile CSS pattern.""" + + if namespaces is not None: + namespaces = ct.Namespaces(**namespaces) + + custom = kwargs.get('custom') + if custom is not None: + custom = ct.CustomSelectors(**custom) + + if isinstance(pattern, SoupSieve): + if flags: + raise ValueError("Cannot process 'flags' argument on a compiled selector list") + elif namespaces is not None: + raise ValueError("Cannot process 'namespaces' argument on a compiled selector list") + elif custom is not None: + raise ValueError("Cannot process 'custom' argument on a compiled selector list") + return pattern + + return cp._cached_css_compile(pattern, namespaces, custom, flags) + + +def purge(): + """Purge cached patterns.""" + + cp._purge_cache() + + +def closest(select, tag, namespaces=None, flags=0, **kwargs): + """Match closest ancestor.""" + + return compile(select, namespaces, flags, **kwargs).closest(tag) + + +def match(select, tag, namespaces=None, flags=0, **kwargs): + """Match node.""" + + return compile(select, namespaces, flags, **kwargs).match(tag) + + +def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001 + """Filter list of nodes.""" + + return compile(select, namespaces, flags, **kwargs).filter(iterable) + + +@deprecated("'comments' is not related to CSS selectors and will be removed in the future.") +def comments(tag, limit=0, flags=0, **kwargs): + """Get comments only.""" + + return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)] + + +@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") +def icomments(tag, limit=0, flags=0, **kwargs): + """Iterate comments only.""" + + for comment in cm.CommentsMatch(tag).get_comments(limit): + yield comment + + +def select_one(select, tag, namespaces=None, flags=0, **kwargs): + """Select a single tag.""" + + return compile(select, namespaces, flags, **kwargs).select_one(tag) + + 
+def select(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
+    """Select the specified tags."""
+
+    return compile(select, namespaces, flags, **kwargs).select(tag, limit)
+
+
+def iselect(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
+    """Iterate the specified tags."""
+
+    for el in compile(select, namespaces, flags, **kwargs).iselect(tag, limit):
+        yield el
+
+
+def escape(ident):
+    """Escape identifier."""
+
+    return cp.escape(ident)
diff --git a/libs/soupsieve/__meta__.py b/libs/soupsieve/__meta__.py
new file mode 100644
index 000000000..35a80664b
--- /dev/null
+++ b/libs/soupsieve/__meta__.py
@@ -0,0 +1,190 @@
+"""Meta related things."""
+from __future__ import unicode_literals
+from collections import namedtuple
+import re
+
+RE_VER = re.compile(
+    r'''(?x)
+    (?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))?
+    (?:(?P<type>a|b|rc)(?P<pre>\d+))?
+    (?:\.post(?P<post>\d+))?
+    (?:\.dev(?P<dev>\d+))?
+    '''
+)
+
+REL_MAP = {
+    ".dev": "",
+    ".dev-alpha": "a",
+    ".dev-beta": "b",
+    ".dev-candidate": "rc",
+    "alpha": "a",
+    "beta": "b",
+    "candidate": "rc",
+    "final": ""
+}
+
+DEV_STATUS = {
+    ".dev": "2 - Pre-Alpha",
+    ".dev-alpha": "2 - Pre-Alpha",
+    ".dev-beta": "2 - Pre-Alpha",
+    ".dev-candidate": "2 - Pre-Alpha",
+    "alpha": "3 - Alpha",
+    "beta": "4 - Beta",
+    "candidate": "4 - Beta",
+    "final": "5 - Production/Stable"
+}
+
+PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'}
+
+
+class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])):
+    """
+    Get the version (PEP 440).
+
+    A biased approach to the PEP 440 semantic version.
+
+    Provides a tuple structure which is sorted for comparisons `v1 > v2` etc.
+    (major, minor, micro, release type, pre-release build, post-release build, development release build)
+    Release types are named in such a way that they compare in the proper order.
+    Accessors are provided to check whether a build is a development, pre-release, or post-release build,
+    and to get the development status for setup files.
+
+    How it works (currently):
+
+    - You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`.
+    - To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`.
+      The dot is used to ensure all development specifiers are sorted before `alpha`.
+      You can specify a `dev` number for development builds, but do not have to as implicit development releases
+      are allowed.
+    - You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not
+      allow implicit prereleases.
+    - You can optionally set `post` to a value greater than zero to make the build a post release. While post releases
+      are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be
+      noted that we do not allow `post0` even though PEP 440 does not restrict this. This project specifically
+      does not allow implicit post releases.
+    - It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`.
+ + Acceptable version releases: + + ``` + Version(1, 0, 0, "final") 1.0 + Version(1, 2, 0, "final") 1.2 + Version(1, 2, 3, "final") 1.2.3 + Version(1, 2, 0, ".dev-alpha", pre=4) 1.2a4 + Version(1, 2, 0, ".dev-beta", pre=4) 1.2b4 + Version(1, 2, 0, ".dev-candidate", pre=4) 1.2rc4 + Version(1, 2, 0, "final", post=1) 1.2.post1 + Version(1, 2, 3, ".dev") 1.2.3.dev0 + Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1 + ``` + + """ + + def __new__(cls, major, minor, micro, release="final", pre=0, post=0, dev=0): + """Validate version info.""" + + # Ensure all parts are positive integers. + for value in (major, minor, micro, pre, post): + if not (isinstance(value, int) and value >= 0): + raise ValueError("All version parts except 'release' should be integers.") + + if release not in REL_MAP: + raise ValueError("'{}' is not a valid release type.".format(release)) + + # Ensure valid pre-release (we do not allow implicit pre-releases). + if ".dev-candidate" < release < "final": + if pre == 0: + raise ValueError("Implicit pre-releases not allowed.") + elif dev: + raise ValueError("Version is not a development release.") + elif post: + raise ValueError("Post-releases are not allowed with pre-releases.") + + # Ensure valid development or development/pre release + elif release < "alpha": + if release > ".dev" and pre == 0: + raise ValueError("Implicit pre-release not allowed.") + elif post: + raise ValueError("Post-releases are not allowed with pre-releases.") + + # Ensure a valid normal release + else: + if pre: + raise ValueError("Version is not a pre-release.") + elif dev: + raise ValueError("Version is not a development release.") + + return super(Version, cls).__new__(cls, major, minor, micro, release, pre, post, dev) + + def _is_pre(self): + """Is prerelease.""" + + return self.pre > 0 + + def _is_dev(self): + """Is development.""" + + return bool(self.release < "alpha") + + def _is_post(self): + """Is post.""" + + return self.post > 0 + + def _get_dev_status(self): # pragma: no cover + """Get development status string.""" + + return DEV_STATUS[self.release] + + def _get_canonical(self): + """Get the canonical output string.""" + + # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.. 
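+        # Illustrative expected outputs (a sketch following the rules above):
+        #   Version(1, 0, 0, "final")._get_canonical()            -> '1.0'
+        #   Version(1, 2, 0, "beta", pre=4)._get_canonical()      -> '1.2b4'
+        #   Version(1, 2, 3, "final", post=1)._get_canonical()    -> '1.2.3.post1'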
+ if self.micro == 0: + ver = "{}.{}".format(self.major, self.minor) + else: + ver = "{}.{}.{}".format(self.major, self.minor, self.micro) + if self._is_pre(): + ver += '{}{}'.format(REL_MAP[self.release], self.pre) + if self._is_post(): + ver += ".post{}".format(self.post) + if self._is_dev(): + ver += ".dev{}".format(self.dev) + + return ver + + +def parse_version(ver, pre=False): + """Parse version into a comparable Version tuple.""" + + m = RE_VER.match(ver) + + # Handle major, minor, micro + major = int(m.group('major')) + minor = int(m.group('minor')) if m.group('minor') else 0 + micro = int(m.group('micro')) if m.group('micro') else 0 + + # Handle pre releases + if m.group('type'): + release = PRE_REL_MAP[m.group('type')] + pre = int(m.group('pre')) + else: + release = "final" + pre = 0 + + # Handle development releases + dev = m.group('dev') if m.group('dev') else 0 + if m.group('dev'): + dev = int(m.group('dev')) + release = '.dev-' + release if pre else '.dev' + else: + dev = 0 + + # Handle post + post = int(m.group('post')) if m.group('post') else 0 + + return Version(major, minor, micro, release, pre, post, dev) + + +__version_info__ = Version(1, 9, 3, "final") +__version__ = __version_info__._get_canonical() diff --git a/libs/soupsieve/css_match.py b/libs/soupsieve/css_match.py new file mode 100644 index 000000000..aa3be671e --- /dev/null +++ b/libs/soupsieve/css_match.py @@ -0,0 +1,1472 @@ +"""CSS matcher.""" +from __future__ import unicode_literals +from datetime import datetime +from . import util +import re +from .import css_types as ct +import unicodedata + +# Empty tag pattern (whitespace okay) +RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') + +RE_NOT_WS = re.compile('[^ \t\r\n\f]+') + +# Relationships +REL_PARENT = ' ' +REL_CLOSE_PARENT = '>' +REL_SIBLING = '~' +REL_CLOSE_SIBLING = '+' + +# Relationships for :has() (forward looking) +REL_HAS_PARENT = ': ' +REL_HAS_CLOSE_PARENT = ':>' +REL_HAS_SIBLING = ':~' +REL_HAS_CLOSE_SIBLING = ':+' + +NS_XHTML = 'http://www.w3.org/1999/xhtml' +NS_XML = 'http://www.w3.org/XML/1998/namespace' + +DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL +RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE + +DIR_MAP = { + 'ltr': ct.SEL_DIR_LTR, + 'rtl': ct.SEL_DIR_RTL, + 'auto': 0 +} + +RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") +RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') +RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') +RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') +RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') +RE_DATETIME = re.compile( + r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' +) + +MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November +FEB = 2 +SHORT_MONTH = 30 +LONG_MONTH = 31 +FEB_MONTH = 28 +FEB_LEAP_MONTH = 29 +DAYS_IN_WEEK = 7 + + +class _FakeParent(object): + """ + Fake parent class. + + When we have a fragment with no `BeautifulSoup` document object, + we can't evaluate `nth` selectors properly. Create a temporary + fake parent so we can traverse the root element as a child. + """ + + def __init__(self, element): + """Initialize.""" + + self.contents = [element] + + def __len__(self): + """Length.""" + + return len(self.contents) + + +class _DocumentNav(object): + """Navigate a Beautiful Soup document.""" + + @classmethod + def assert_valid_input(cls, tag): + """Check if valid input tag or document.""" + + # Fail on unexpected types. 
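+        # (Illustrative: passing raw markup such as the plain string
+        # '<div></div>' fails this check; only a parsed `bs4.Tag` or
+        # `BeautifulSoup` object is accepted.)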
+        if not cls.is_tag(tag):
+            raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
+
+    @staticmethod
+    def is_doc(obj):
+        """Is `BeautifulSoup` object."""
+
+        import bs4
+        return isinstance(obj, bs4.BeautifulSoup)
+
+    @staticmethod
+    def is_tag(obj):
+        """Is tag."""
+
+        import bs4
+        return isinstance(obj, bs4.Tag)
+
+    @staticmethod
+    def is_comment(obj):
+        """Is comment."""
+
+        import bs4
+        return isinstance(obj, bs4.Comment)
+
+    @staticmethod
+    def is_declaration(obj):  # pragma: no cover
+        """Is declaration."""
+
+        import bs4
+        return isinstance(obj, bs4.Declaration)
+
+    @staticmethod
+    def is_cdata(obj):
+        """Is CDATA."""
+
+        import bs4
+        return isinstance(obj, bs4.CData)
+
+    @staticmethod
+    def is_processing_instruction(obj):  # pragma: no cover
+        """Is processing instruction."""
+
+        import bs4
+        return isinstance(obj, bs4.ProcessingInstruction)
+
+    @staticmethod
+    def is_navigable_string(obj):
+        """Is navigable string."""
+
+        import bs4
+        return isinstance(obj, bs4.NavigableString)
+
+    @staticmethod
+    def is_special_string(obj):
+        """Is special string."""
+
+        import bs4
+        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
+
+    @classmethod
+    def is_content_string(cls, obj):
+        """Check if node is content string."""
+
+        return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
+
+    @staticmethod
+    def create_fake_parent(el):
+        """Create fake parent for a given element."""
+
+        return _FakeParent(el)
+
+    @staticmethod
+    def is_xml_tree(el):
+        """Check if element (or document) is from an XML tree."""
+
+        return el._is_xml
+
+    def is_iframe(self, el):
+        """Check if element is an `iframe`."""
+
+        return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)
+
+    def is_root(self, el):
+        """
+        Return whether element is a root element.
+
+        We check that the element is the root of the tree (which we have already pre-calculated),
+        and we check if it is the root element under an `iframe`.
+ """ + + root = self.root and self.root is el + if not root: + parent = self.get_parent(el) + root = parent is not None and self.is_html and self.is_iframe(parent) + return root + + def get_contents(self, el, no_iframe=False): + """Get contents or contents in reverse.""" + if not no_iframe or not self.is_iframe(el): + for content in el.contents: + yield content + + def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False): + """Get children.""" + + if not no_iframe or not self.is_iframe(el): + last = len(el.contents) - 1 + if start is None: + index = last if reverse else 0 + else: + index = start + end = -1 if reverse else last + 1 + incr = -1 if reverse else 1 + + if 0 <= index <= last: + while index != end: + node = el.contents[index] + index += incr + if not tags or self.is_tag(node): + yield node + + def get_descendants(self, el, tags=True, no_iframe=False): + """Get descendants.""" + + if not no_iframe or not self.is_iframe(el): + next_good = None + for child in el.descendants: + + if next_good is not None: + if child is not next_good: + continue + next_good = None + + is_tag = self.is_tag(child) + + if no_iframe and is_tag and self.is_iframe(child): + if child.next_sibling is not None: + next_good = child.next_sibling + else: + last_child = child + while self.is_tag(last_child) and last_child.contents: + last_child = last_child.contents[-1] + next_good = last_child.next_element + yield child + if next_good is None: + break + # Coverage isn't seeing this even though it's executed + continue # pragma: no cover + + if not tags or is_tag: + yield child + + def get_parent(self, el, no_iframe=False): + """Get parent.""" + + parent = el.parent + if no_iframe and parent is not None and self.is_iframe(parent): + parent = None + return parent + + @staticmethod + def get_tag_name(el): + """Get tag.""" + + return el.name + + @staticmethod + def get_prefix_name(el): + """Get prefix.""" + + return el.prefix + + @staticmethod + def get_uri(el): + """Get namespace `URI`.""" + + return el.namespace + + @classmethod + def get_next(cls, el, tags=True): + """Get next sibling tag.""" + + sibling = el.next_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.next_sibling + return sibling + + @classmethod + def get_previous(cls, el, tags=True): + """Get previous sibling tag.""" + + sibling = el.previous_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.previous_sibling + return sibling + + @staticmethod + def has_html_ns(el): + """ + Check if element has an HTML namespace. + + This is a bit different than whether a element is treated as having an HTML namespace, + like we do in the case of `is_html_tag`. 
+ """ + + ns = getattr(el, 'namespace') if el else None + return ns and ns == NS_XHTML + + @staticmethod + def split_namespace(el, attr_name): + """Return namespace and attribute name without the prefix.""" + + return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) + + @staticmethod + def get_attribute_by_name(el, name, default=None): + """Get attribute by name.""" + + value = default + if el._is_xml: + try: + value = el.attrs[name] + except KeyError: + pass + else: + for k, v in el.attrs.items(): + if util.lower(k) == name: + value = v + break + return value + + @staticmethod + def iter_attributes(el): + """Iterate attributes.""" + + for k, v in el.attrs.items(): + yield k, v + + @classmethod + def get_classes(cls, el): + """Get classes.""" + + classes = cls.get_attribute_by_name(el, 'class', []) + if isinstance(classes, util.ustr): + classes = RE_NOT_WS.findall(classes) + return classes + + def get_text(self, el, no_iframe=False): + """Get text.""" + + return ''.join( + [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] + ) + + +class Inputs(object): + """Class for parsing and validating input items.""" + + @staticmethod + def validate_day(year, month, day): + """Validate day.""" + + max_days = LONG_MONTH + if month == FEB: + max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH + elif month in MONTHS_30: + max_days = SHORT_MONTH + return 1 <= day <= max_days + + @staticmethod + def validate_week(year, week): + """Validate week.""" + + max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1] + if max_week == 1: + max_week = 53 + return 1 <= week <= max_week + + @staticmethod + def validate_month(month): + """Validate month.""" + + return 1 <= month <= 12 + + @staticmethod + def validate_year(year): + """Validate year.""" + + return 1 <= year + + @staticmethod + def validate_hour(hour): + """Validate hour.""" + + return 0 <= hour <= 23 + + @staticmethod + def validate_minutes(minutes): + """Validate minutes.""" + + return 0 <= minutes <= 59 + + @classmethod + def parse_value(cls, itype, value): + """Parse the input value.""" + + parsed = None + if itype == "date": + m = RE_DATE.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + day = int(m.group('day'), 10) + if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day): + parsed = (year, month, day) + elif itype == "month": + m = RE_MONTH.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + if cls.validate_year(year) and cls.validate_month(month): + parsed = (year, month) + elif itype == "week": + m = RE_WEEK.match(value) + if m: + year = int(m.group('year'), 10) + week = int(m.group('week'), 10) + if cls.validate_year(year) and cls.validate_week(year, week): + parsed = (year, week) + elif itype == "time": + m = RE_TIME.match(value) + if m: + hour = int(m.group('hour'), 10) + minutes = int(m.group('minutes'), 10) + if cls.validate_hour(hour) and cls.validate_minutes(minutes): + parsed = (hour, minutes) + elif itype == "datetime-local": + m = RE_DATETIME.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + day = int(m.group('day'), 10) + hour = int(m.group('hour'), 10) + minutes = int(m.group('minutes'), 10) + if ( + cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and + 
cls.validate_hour(hour) and cls.validate_minutes(minutes) + ): + parsed = (year, month, day, hour, minutes) + elif itype in ("number", "range"): + m = RE_NUM.match(value) + if m: + parsed = float(m.group('value')) + return parsed + + +class _Match(object): + """Perform CSS matching.""" + + def __init__(self, selectors, scope, namespaces, flags): + """Initialize.""" + + self.assert_valid_input(scope) + self.tag = scope + self.cached_meta_lang = [] + self.cached_default_forms = [] + self.cached_indeterminate_forms = [] + self.selectors = selectors + self.namespaces = {} if namespaces is None else namespaces + self.flags = flags + self.iframe_restrict = False + + # Find the root element for the whole tree + doc = scope + parent = self.get_parent(doc) + while parent: + doc = parent + parent = self.get_parent(doc) + root = None + if not self.is_doc(doc): + root = doc + else: + for child in self.get_children(doc): + root = child + break + + self.root = root + self.scope = scope if scope is not doc else root + self.has_html_namespace = self.has_html_ns(root) + + # A document can be both XML and HTML (XHTML) + self.is_xml = self.is_xml_tree(doc) + self.is_html = not self.is_xml or self.has_html_namespace + + def supports_namespaces(self): + """Check if namespaces are supported in the HTML type.""" + + return self.is_xml or self.has_html_namespace + + def get_tag_ns(self, el): + """Get tag namespace.""" + + if self.supports_namespaces(): + namespace = '' + ns = self.get_uri(el) + if ns: + namespace = ns + else: + namespace = NS_XHTML + return namespace + + def is_html_tag(self, el): + """Check if tag is in HTML namespace.""" + + return self.get_tag_ns(el) == NS_XHTML + + def get_tag(self, el): + """Get tag.""" + + name = self.get_tag_name(el) + return util.lower(name) if name is not None and not self.is_xml else name + + def get_prefix(self, el): + """Get prefix.""" + + prefix = self.get_prefix_name(el) + return util.lower(prefix) if prefix is not None and not self.is_xml else prefix + + def find_bidi(self, el): + """Get directionality from element text.""" + + for node in self.get_children(el, tags=False): + + # Analyze child text nodes + if self.is_tag(node): + + # Avoid analyzing certain elements specified in the specification. + direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) + if ( + self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or + not self.is_html_tag(node) or + direction is not None + ): + continue # pragma: no cover + + # Check directionality of this node's text + value = self.find_bidi(node) + if value is not None: + return value + + # Direction could not be determined + continue # pragma: no cover + + # Skip `doctype` comments, etc. + if self.is_special_string(node): + continue + + # Analyze text nodes for directionality. + for c in node: + bidi = unicodedata.bidirectional(c) + if bidi in ('AL', 'R', 'L'): + return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL + return None + + def match_attribute_name(self, el, attr, prefix): + """Match attribute name and return value if it exists.""" + + value = None + if self.supports_namespaces(): + value = None + # If we have not defined namespaces, we can't very well find them, so don't bother trying. 
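+        # (Illustrative, assuming a namespace mapping was handed to `compile`:
+        # for `[xlink|href]` with {'xlink': 'http://www.w3.org/1999/xlink'},
+        # `ns` resolves to that URI below; for `[*|href]` the attribute may sit
+        # in any namespace or none.)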
+ if prefix: + ns = self.namespaces.get(prefix) + if ns is None and prefix != '*': + return None + else: + ns = None + + for k, v in self.iter_attributes(el): + + # Get attribute parts + namespace, name = self.split_namespace(el, k) + + # Can't match a prefix attribute as we haven't specified one to match + # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. + if ns is None: + if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): + value = v + break + # Coverage is not finding this even though it is executed. + # Adding a print statement before this (and erasing coverage) causes coverage to find the line. + # Ignore the false positive message. + continue # pragma: no cover + + # We can't match our desired prefix attribute as the attribute doesn't have a prefix + if namespace is None or ns != namespace and prefix != '*': + continue + + # The attribute doesn't match. + if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): + continue + + value = v + break + else: + for k, v in self.iter_attributes(el): + if util.lower(attr) != util.lower(k): + continue + value = v + break + return value + + def match_namespace(self, el, tag): + """Match the namespace of the element.""" + + match = True + namespace = self.get_tag_ns(el) + default_namespace = self.namespaces.get('') + tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix, None) + # We must match the default namespace if one is not provided + if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): + match = False + # If we specified `|tag`, we must not have a namespace. + elif (tag.prefix is not None and tag.prefix == '' and namespace): + match = False + # Verify prefix matches + elif ( + tag.prefix and + tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) + ): + match = False + return match + + def match_attributes(self, el, attributes): + """Match attributes.""" + + match = True + if attributes: + for a in attributes: + value = self.match_attribute_name(el, a.attribute, a.prefix) + pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern + if isinstance(value, list): + value = ' '.join(value) + if value is None: + match = False + break + elif pattern is None: + continue + elif pattern.match(value) is None: + match = False + break + return match + + def match_tagname(self, el, tag): + """Match tag name.""" + + name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) + return not ( + name is not None and + name not in (self.get_tag(el), '*') + ) + + def match_tag(self, el, tag): + """Match the tag.""" + + match = True + if tag is not None: + # Verify namespace + if not self.match_namespace(el, tag): + match = False + if not self.match_tagname(el, tag): + match = False + return match + + def match_past_relations(self, el, relation): + """Match past relationship.""" + + found = False + if relation[0].rel_type == REL_PARENT: + parent = self.get_parent(el, no_iframe=self.iframe_restrict) + while not found and parent: + found = self.match_selectors(parent, relation) + parent = self.get_parent(parent, no_iframe=self.iframe_restrict) + elif relation[0].rel_type == REL_CLOSE_PARENT: + parent = self.get_parent(el, no_iframe=self.iframe_restrict) + if parent: + found = self.match_selectors(parent, relation) + elif relation[0].rel_type == REL_SIBLING: + sibling = self.get_previous(el) + while not found and sibling: + found = 
self.match_selectors(sibling, relation) + sibling = self.get_previous(sibling) + elif relation[0].rel_type == REL_CLOSE_SIBLING: + sibling = self.get_previous(el) + if sibling and self.is_tag(sibling): + found = self.match_selectors(sibling, relation) + return found + + def match_future_child(self, parent, relation, recursive=False): + """Match future child.""" + + match = False + children = self.get_descendants if recursive else self.get_children + for child in children(parent, no_iframe=self.iframe_restrict): + match = self.match_selectors(child, relation) + if match: + break + return match + + def match_future_relations(self, el, relation): + """Match future relationship.""" + + found = False + if relation[0].rel_type == REL_HAS_PARENT: + found = self.match_future_child(el, relation, True) + elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: + found = self.match_future_child(el, relation) + elif relation[0].rel_type == REL_HAS_SIBLING: + sibling = self.get_next(el) + while not found and sibling: + found = self.match_selectors(sibling, relation) + sibling = self.get_next(sibling) + elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: + sibling = self.get_next(el) + if sibling and self.is_tag(sibling): + found = self.match_selectors(sibling, relation) + return found + + def match_relations(self, el, relation): + """Match relationship to other elements.""" + + found = False + + if relation[0].rel_type.startswith(':'): + found = self.match_future_relations(el, relation) + else: + found = self.match_past_relations(el, relation) + + return found + + def match_id(self, el, ids): + """Match element's ID.""" + + found = True + for i in ids: + if i != self.get_attribute_by_name(el, 'id', ''): + found = False + break + return found + + def match_classes(self, el, classes): + """Match element's classes.""" + + current_classes = self.get_classes(el) + found = True + for c in classes: + if c not in current_classes: + found = False + break + return found + + def match_root(self, el): + """Match element as root.""" + + is_root = self.is_root(el) + if is_root: + sibling = self.get_previous(el, tags=False) + while is_root and sibling is not None: + if ( + self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or + self.is_cdata(sibling) + ): + is_root = False + else: + sibling = self.get_previous(sibling, tags=False) + if is_root: + sibling = self.get_next(el, tags=False) + while is_root and sibling is not None: + if ( + self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or + self.is_cdata(sibling) + ): + is_root = False + else: + sibling = self.get_next(sibling, tags=False) + return is_root + + def match_scope(self, el): + """Match element as scope.""" + + return self.scope is el + + def match_nth_tag_type(self, el, child): + """Match tag type for `nth` matches.""" + + return( + (self.get_tag(child) == self.get_tag(el)) and + (self.get_tag_ns(child) == self.get_tag_ns(el)) + ) + + def match_nth(self, el, nth): + """Match `nth` elements.""" + + matched = True + + for n in nth: + matched = False + if n.selectors and not self.match_selectors(el, n.selectors): + break + parent = self.get_parent(el) + if parent is None: + parent = self.create_fake_parent(el) + last = n.last + last_index = len(parent) - 1 + index = last_index if last else 0 + relative_index = 0 + a = n.a + b = n.b + var = n.n + count = 0 + count_incr = 1 + factor = -1 if last else 1 + idx = last_idx = a * count + b if var else a + + # We can only adjust bounds within a variable index + if var: 
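+            # (Sketch of the two cases: `var` is true for `an+b` forms such as
+            # `:nth-child(2n+1)`, whose index advances with `count`; a bare
+            # integer like `:nth-child(3)` skips these bound adjustments.)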
+ # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. + # Otherwise, increment to try to get in bounds. + adjust = None + while idx < 1 or idx > last_index: + if idx < 0: + diff_low = 0 - idx + if adjust is not None and adjust == 1: + break + adjust = -1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = 0 - idx + if diff >= diff_low: + break + else: + diff_high = idx - last_index + if adjust is not None and adjust == -1: + break + adjust = 1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = idx - last_index + if diff >= diff_high: + break + diff_high = diff + + # If a < 0, our count is working backwards, so floor the index by increasing the count. + # Find the count that yields the lowest, in bound value and use that. + # Lastly reverse count increment so that we'll increase our index. + lowest = count + if a < 0: + while idx >= 1: + lowest = count + count += count_incr + idx = last_idx = a * count + b if var else a + count_incr = -1 + count = lowest + idx = last_idx = a * count + b if var else a + + # Evaluate elements while our calculated nth index is still in range + while 1 <= idx <= last_index + 1: + child = None + # Evaluate while our child index is still in range. + for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False): + index += factor + if not self.is_tag(child): + continue + # Handle `of S` in `nth-child` + if n.selectors and not self.match_selectors(child, n.selectors): + continue + # Handle `of-type` + if n.of_type and not self.match_nth_tag_type(el, child): + continue + relative_index += 1 + if relative_index == idx: + if child is el: + matched = True + else: + break + if child is el: + break + if child is el: + break + last_idx = idx + count += count_incr + if count < 0: + # Count is counting down and has now ventured into invalid territory. 
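+                        # (e.g. with `:nth-child(-n+3)`, a = -1: once idx has
+                        # walked 1, 2, 3 the decrementing count turns negative
+                        # and no further in-bound index is possible.)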
+                        break
+                    idx = a * count + b if var else a
+                    if last_idx == idx:
+                        break
+            if not matched:
+                break
+        return matched
+
+    def match_empty(self, el):
+        """Check if element is empty (if requested)."""
+
+        is_empty = True
+        for child in self.get_children(el, tags=False):
+            if self.is_tag(child):
+                is_empty = False
+                break
+            elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
+                is_empty = False
+                break
+        return is_empty
+
+    def match_subselectors(self, el, selectors):
+        """Match selectors."""
+
+        match = True
+        for sel in selectors:
+            if not self.match_selectors(el, sel):
+                match = False
+        return match
+
+    def match_contains(self, el, contains):
+        """Match element if it contains text."""
+
+        match = True
+        content = None
+        for contain_list in contains:
+            if content is None:
+                content = self.get_text(el, no_iframe=self.is_html)
+            found = False
+            for text in contain_list.text:
+                if text in content:
+                    found = True
+                    break
+            if not found:
+                match = False
+        return match
+
+    def match_default(self, el):
+        """Match default."""
+
+        match = False
+
+        # Find this input's form
+        form = None
+        parent = self.get_parent(el, no_iframe=True)
+        while parent and form is None:
+            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
+                form = parent
+            else:
+                parent = self.get_parent(parent, no_iframe=True)
+
+        # Look in form cache to see if we've already located its default button
+        found_form = False
+        for f, t in self.cached_default_forms:
+            if f is form:
+                found_form = True
+                if t is el:
+                    match = True
+                break
+
+        # We didn't have the form cached, so look for its default button
+        if not found_form:
+            for child in self.get_descendants(form, no_iframe=True):
+                name = self.get_tag(child)
+                # Can't do nested forms (haven't figured out why we never hit this)
+                if name == 'form':  # pragma: no cover
+                    break
+                if name in ('input', 'button'):
+                    v = self.get_attribute_by_name(child, 'type', '')
+                    if v and util.lower(v) == 'submit':
+                        self.cached_default_forms.append([form, child])
+                        if el is child:
+                            match = True
+                        break
+        return match
+
+    def match_indeterminate(self, el):
+        """Match indeterminate."""
+
+        match = False
+        name = self.get_attribute_by_name(el, 'name')
+
+        def get_parent_form(el):
+            """Find this input's form."""
+            form = None
+            parent = self.get_parent(el, no_iframe=True)
+            while form is None:
+                if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
+                    form = parent
+                    break
+                last_parent = parent
+                parent = self.get_parent(parent, no_iframe=True)
+                if parent is None:
+                    form = last_parent
+                    break
+            return form
+
+        form = get_parent_form(el)
+
+        # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
+        found_form = False
+        for f, n, i in self.cached_indeterminate_forms:
+            if f is form and n == name:
+                found_form = True
+                if i is True:
+                    match = True
+                break
+
+        # We didn't have the form cached, so validate that the radio button is indeterminate
+        if not found_form:
+            checked = False
+            for child in self.get_descendants(form, no_iframe=True):
+                if child is el:
+                    continue
+                tag_name = self.get_tag(child)
+                if tag_name == 'input':
+                    is_radio = False
+                    check = False
+                    has_name = False
+                    for k, v in self.iter_attributes(child):
+                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
+                            is_radio = True
+                        elif util.lower(k) == 'name' and v == name:
+                            has_name = True
+                        elif util.lower(k) == 'checked':
+                            check = True
+                        if is_radio and check and has_name and get_parent_form(child) is form:
+                            checked = True
+                            break
+                if checked:
+                    break
+            if not
checked: + match = True + self.cached_indeterminate_forms.append([form, name, match]) + + return match + + def match_lang(self, el, langs): + """Match languages.""" + + match = False + has_ns = self.supports_namespaces() + root = self.root + has_html_namespace = self.has_html_namespace + + # Walk parents looking for `lang` (HTML) or `xml:lang` XML property. + parent = el + found_lang = None + last = None + while not found_lang: + has_html_ns = self.has_html_ns(parent) + for k, v in self.iter_attributes(parent): + attr_ns, attr = self.split_namespace(parent, k) + if ( + ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or + ( + has_ns and not has_html_ns and attr_ns == NS_XML and + (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang' + ) + ): + found_lang = v + break + last = parent + parent = self.get_parent(parent, no_iframe=self.is_html) + + if parent is None: + root = last + has_html_namespace = self.has_html_ns(root) + parent = last + break + + # Use cached meta language. + if not found_lang and self.cached_meta_lang: + for cache in self.cached_meta_lang: + if root is cache[0]: + found_lang = cache[1] + + # If we couldn't find a language, and the document is HTML, look to meta to determine language. + if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): + # Find head + found = False + for tag in ('html', 'head'): + found = False + for child in self.get_children(parent, no_iframe=self.is_html): + if self.get_tag(child) == tag and self.is_html_tag(child): + found = True + parent = child + break + if not found: # pragma: no cover + break + + # Search meta tags + if found: + for child in parent: + if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): + c_lang = False + content = None + for k, v in self.iter_attributes(child): + if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': + c_lang = True + if util.lower(k) == 'content': + content = v + if c_lang and content: + found_lang = content + self.cached_meta_lang.append((root, found_lang)) + break + if found_lang: + break + if not found_lang: + self.cached_meta_lang.append((root, False)) + + # If we determined a language, compare. + if found_lang: + for patterns in langs: + match = False + for pattern in patterns: + if pattern.match(found_lang): + match = True + if not match: + break + + return match + + def match_dir(self, el, directionality): + """Check directionality.""" + + # If we have to match both left and right, we can't match either. + if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: + return False + + if el is None or not self.is_html_tag(el): + return False + + # Element has defined direction of left to right or right to left + direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) + if direction not in (None, 0): + return direction == directionality + + # Element is the document element (the root) and no direction assigned, assume left to right. + is_root = self.is_root(el) + if is_root and direction is None: + return ct.SEL_DIR_LTR == directionality + + # If `input[type=telephone]` and no direction is assigned, assume left to right. 
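+        # (Illustrative: <input type="tel"> with no `dir` attribute matches
+        # `:dir(ltr)` here, while <input type="text" dir="auto" value="مرحبا">
+        # falls through to the value sniffing below and matches `:dir(rtl)`.)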
+        name = self.get_tag(el)
+        is_input = name == 'input'
+        is_textarea = name == 'textarea'
+        is_bdi = name == 'bdi'
+        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
+        if is_input and itype == 'tel' and direction is None:
+            return ct.SEL_DIR_LTR == directionality
+
+        # Auto handling for text inputs
+        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
+            if is_textarea:
+                value = []
+                for node in self.get_contents(el, no_iframe=True):
+                    if self.is_content_string(node):
+                        value.append(node)
+                value = ''.join(value)
+            else:
+                value = self.get_attribute_by_name(el, 'value', '')
+            if value:
+                for c in value:
+                    bidi = unicodedata.bidirectional(c)
+                    if bidi in ('AL', 'R', 'L'):
+                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
+                        return direction == directionality
+                # Assume left to right
+                return ct.SEL_DIR_LTR == directionality
+            elif is_root:
+                return ct.SEL_DIR_LTR == directionality
+            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
+
+        # Auto handling for `bdi` and other non text inputs.
+        if (is_bdi and direction is None) or direction == 0:
+            direction = self.find_bidi(el)
+            if direction is not None:
+                return direction == directionality
+            elif is_root:
+                return ct.SEL_DIR_LTR == directionality
+            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
+
+        # Match the parent's direction
+        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
+
+    def match_range(self, el, condition):
+        """
+        Match range.
+
+        Behavior is modeled after what we see in browsers. Browsers seem to evaluate
+        if the value is out of range, and if not, it is in range. So a missing value
+        will not evaluate out of range; therefore, value is in range. Personally, I
+        feel like this should evaluate as neither in nor out of range.
+        """
+
+        out_of_range = False
+
+        itype = self.get_attribute_by_name(el, 'type').lower()
+        mn = self.get_attribute_by_name(el, 'min', None)
+        if mn is not None:
+            mn = Inputs.parse_value(itype, mn)
+        mx = self.get_attribute_by_name(el, 'max', None)
+        if mx is not None:
+            mx = Inputs.parse_value(itype, mx)
+
+        # There is no valid min or max, so we cannot evaluate a range
+        if mn is None and mx is None:
+            return False
+
+        value = self.get_attribute_by_name(el, 'value', None)
+        if value is not None:
+            value = Inputs.parse_value(itype, value)
+        if value is not None:
+            if itype in ("date", "datetime-local", "month", "week", "number", "range"):
+                if mn is not None and value < mn:
+                    out_of_range = True
+                if not out_of_range and mx is not None and value > mx:
+                    out_of_range = True
+            elif itype == "time":
+                if mn is not None and mx is not None and mn > mx:
+                    # Time is periodic, so this is a reversed/discontinuous range
+                    if value < mn and value > mx:
+                        out_of_range = True
+                else:
+                    if mn is not None and value < mn:
+                        out_of_range = True
+                    if not out_of_range and mx is not None and value > mx:
+                        out_of_range = True
+
+        return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
+
+    def match_defined(self, el):
+        """
+        Match defined.
+
+        `:defined` is related to custom elements in a browser.
+
+        - If the document is XML (not XHTML), all tags will match.
+        - Tags that are not custom (don't have a hyphen) are marked defined.
+        - If the tag has a prefix (with or without a namespace), it will not match.
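+
+        Illustrative results under the rules above: a plain `div` matches
+        `:defined`, while an unprefixed custom element name such as
+        `my-widget` (it contains a hyphen) does not.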
+
+        This of course requires the parser to provide us with the proper prefix and namespace info;
+        if it doesn't, there is nothing we can do.
+        """
+
+        name = self.get_tag(el)
+        return (
+            name.find('-') == -1 or
+            name.find(':') != -1 or
+            self.get_prefix(el) is not None
+        )
+
+    def match_selectors(self, el, selectors):
+        """Check if element matches one of the selectors."""
+
+        match = False
+        is_not = selectors.is_not
+        is_html = selectors.is_html
+
+        # Internal selector lists that use the HTML flag will automatically get the `html` namespace.
+        if is_html:
+            namespaces = self.namespaces
+            iframe_restrict = self.iframe_restrict
+            self.namespaces = {'html': NS_XHTML}
+            self.iframe_restrict = True
+
+        if not is_html or self.is_html:
+            for selector in selectors:
+                match = is_not
+                # We have an un-matchable situation (like `:focus`, as you can't focus an element in this environment)
+                if isinstance(selector, ct.SelectorNull):
+                    continue
+                # Verify tag matches
+                if not self.match_tag(el, selector.tag):
+                    continue
+                # Verify tag is defined
+                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
+                    continue
+                # Verify element is root
+                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
+                    continue
+                # Verify element is scope
+                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
+                    continue
+                # Verify `nth` matches
+                if not self.match_nth(el, selector.nth):
+                    continue
+                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
+                    continue
+                # Verify id matches
+                if selector.ids and not self.match_id(el, selector.ids):
+                    continue
+                # Verify classes match
+                if selector.classes and not self.match_classes(el, selector.classes):
+                    continue
+                # Verify attribute(s) match
+                if not self.match_attributes(el, selector.attributes):
+                    continue
+                # Verify ranges
+                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
+                    continue
+                # Verify language patterns
+                if selector.lang and not self.match_lang(el, selector.lang):
+                    continue
+                # Verify pseudo selector patterns
+                if selector.selectors and not self.match_subselectors(el, selector.selectors):
+                    continue
+                # Verify relationship selectors
+                if selector.relation and not self.match_relations(el, selector.relation):
+                    continue
+                # Validate that the current default selector match corresponds to the first submit button in the form
+                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
+                    continue
+                # Validate that the unset radio button is among radio buttons with the same name in a form that are
+                # also not set.
+                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
+                    continue
+                # Validate element directionality
+                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
+                    continue
+                # Validate that the tag contains the specified text.
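+                # (Illustrative: `p:contains(world)` is rejected here for a
+                # <p> whose descendant text lacks the substring "world".)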
+ if not self.match_contains(el, selector.contains): + continue + match = not is_not + break + + # Restore actual namespaces being used for external selector lists + if is_html: + self.namespaces = namespaces + self.iframe_restrict = iframe_restrict + + return match + + def select(self, limit=0): + """Match all tags under the targeted tag.""" + + if limit < 1: + limit = None + + for child in self.get_descendants(self.tag): + if self.match(child): + yield child + if limit is not None: + limit -= 1 + if limit < 1: + break + + def closest(self): + """Match closest ancestor.""" + + current = self.tag + closest = None + while closest is None and current is not None: + if self.match(current): + closest = current + else: + current = self.get_parent(current) + return closest + + def filter(self): # noqa A001 + """Filter tag's children.""" + + return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] + + def match(self, el): + """Match.""" + + return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) + + +class CSSMatch(_DocumentNav, _Match): + """The Beautiful Soup CSS match class.""" + + +class CommentsMatch(_DocumentNav): + """Comments matcher.""" + + def __init__(self, el): + """Initialize.""" + + self.assert_valid_input(el) + self.tag = el + + def get_comments(self, limit=0): + """Get comments.""" + + if limit < 1: + limit = None + + for child in self.get_descendants(self.tag, tags=False): + if self.is_comment(child): + yield child + if limit is not None: + limit -= 1 + if limit < 1: + break + + +class SoupSieve(ct.Immutable): + """Compiled Soup Sieve selector matching object.""" + + __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") + + def __init__(self, pattern, selectors, namespaces, custom, flags): + """Initialize.""" + + super(SoupSieve, self).__init__( + pattern=pattern, + selectors=selectors, + namespaces=namespaces, + custom=custom, + flags=flags + ) + + def match(self, tag): + """Match.""" + + return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) + + def closest(self, tag): + """Match closest ancestor.""" + + return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() + + def filter(self, iterable): # noqa A001 + """ + Filter. + + `CSSMatch` can cache certain searches for tags of the same document, + so if we are given a tag, all tags are from the same document, + and we can take advantage of the optimization. + + Any other kind of iterable could have tags from different documents or detached tags, + so for those, we use a new `CSSMatch` for each item in the iterable. 
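+
+        (Illustrative, with a hypothetical `soup`: `sv.compile('p.menu').filter(soup.div)`
+        keeps only the direct children of `soup.div` that match the selector.)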
+ """ + + if CSSMatch.is_tag(iterable): + return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() + else: + return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] + + @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.") + def comments(self, tag, limit=0): + """Get comments only.""" + + return [comment for comment in CommentsMatch(tag).get_comments(limit)] + + @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") + def icomments(self, tag, limit=0): + """Iterate comments only.""" + + for comment in CommentsMatch(tag).get_comments(limit): + yield comment + + def select_one(self, tag): + """Select a single tag.""" + + tags = self.select(tag, limit=1) + return tags[0] if tags else None + + def select(self, tag, limit=0): + """Select the specified tags.""" + + return list(self.iselect(tag, limit)) + + def iselect(self, tag, limit=0): + """Iterate the specified tags.""" + + for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): + yield el + + def __repr__(self): # pragma: no cover + """Representation.""" + + return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format( + self.pattern, + self.namespaces, + self.custom, + self.flags + ) + + __str__ = __repr__ + + +ct.pickle_register(SoupSieve) diff --git a/libs/soupsieve/css_parser.py b/libs/soupsieve/css_parser.py new file mode 100644 index 000000000..ed8368a53 --- /dev/null +++ b/libs/soupsieve/css_parser.py @@ -0,0 +1,1218 @@ +"""CSS selector parser.""" +from __future__ import unicode_literals +import re +from . import util +from . import css_match as cm +from . import css_types as ct +from .util import SelectorSyntaxError + +UNICODE_REPLACEMENT_CHAR = 0xFFFD + +# Simple pseudo classes that take no parameters +PSEUDO_SIMPLE = { + ":any-link", + ":empty", + ":first-child", + ":first-of-type", + ":in-range", + ":out-of-range", + ":last-child", + ":last-of-type", + ":link", + ":only-child", + ":only-of-type", + ":root", + ':checked', + ':default', + ':disabled', + ':enabled', + ':indeterminate', + ':optional', + ':placeholder-shown', + ':read-only', + ':read-write', + ':required', + ':scope', + ':defined' +} + +# Supported, simple pseudo classes that match nothing in the Soup Sieve environment +PSEUDO_SIMPLE_NO_MATCH = { + ':active', + ':current', + ':focus', + ':focus-visible', + ':focus-within', + ':future', + ':host', + ':hover', + ':local-link', + ':past', + ':paused', + ':playing', + ':target', + ':target-within', + ':user-invalid', + ':visited' +} + +# Complex pseudo classes that take selector lists +PSEUDO_COMPLEX = { + ':contains', + ':has', + ':is', + ':matches', + ':not', + ':where' +} + +PSEUDO_COMPLEX_NO_MATCH = { + ':current', + ':host', + ':host-context' +} + +# Complex pseudo classes that take very specific parameters and are handled special +PSEUDO_SPECIAL = { + ':dir', + ':lang', + ':nth-child', + ':nth-last-child', + ':nth-last-of-type', + ':nth-of-type' +} + +PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL + +# Sub-patterns parts +# Whitespace +NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])' +WS = r'(?:[ \t]|{})'.format(NEWLINE) +# Comments +COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)' +# Whitespace with comments included +WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS) +# CSS escapes +CSS_ESCAPES = 
r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS) +CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE) +# CSS Identifier +IDENTIFIER = r''' +(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--) +(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*) +'''.format(esc=CSS_ESCAPES) +# `nth` content +NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC) +# Value: quoted string or identifier +VALUE = r''' +(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+) +'''.format(nl=NEWLINE, ident=IDENTIFIER) +# Attribute value comparison. `!=` is handled special as it is non-standard. +ATTR = r''' +(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] +'''.format(ws=WSC, value=VALUE) + +# Selector patterns +# IDs (`#id`) +PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER) +# Classes (`.class`) +PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER) +# Prefix:Tag (`prefix|tag`) +PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER) +# Attributes (`[attr]`, `[attr=value]`, etc.) +PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR) +# Pseudo class (`:pseudo-class`, `:pseudo-class(`) +PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER) +# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. +PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER) +# Custom pseudo class (`:--custom-pseudo`) +PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER) +# Closing pseudo group (`)`) +PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC) +# Pseudo element (`::pseudo-element`) +PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS) +# At rule (`@page`, etc.) (not supported) +PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER) +# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.) +PAT_PSEUDO_NTH_CHILD = r''' +(?P<pseudo_nth_child>{name} +(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*)) +'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH) +# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.) 
+PAT_PSEUDO_NTH_TYPE = r'''
+(?P<pseudo_nth_type>{name}
+(?P<nth_type>{nth}|even|odd)){ws}*\)
+'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
+# Pseudo class language (`:lang("*-de", en)`)
+PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
+    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
+)
+# Pseudo class direction (`:dir(ltr)`)
+PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
+# Combining characters (`>`, `~`, ` `, `+`, `,`)
+PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
+# Extra: Contains (`:contains(text)`)
+PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
+    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
+)
+
+# Regular expressions
+# CSS escape pattern
+RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WSC), re.I)
+RE_CSS_STR_ESC = re.compile(
+    r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
+)
+# Pattern to break up `nth` specifiers
+RE_NTH = re.compile(
+    r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
+    re.I
+)
+# Pattern to iterate multiple values.
+RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
+# Whitespace checks
+RE_WS = re.compile(WS)
+RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
+RE_WS_END = re.compile('{}*$'.format(WSC))
+RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
+
+# Constants
+# List split token
+COMMA_COMBINATOR = ','
+# Relation token for descendant
+WS_COMBINATOR = " "
+
+# Parse flags
+FLG_PSEUDO = 0x01
+FLG_NOT = 0x02
+FLG_RELATIVE = 0x04
+FLG_DEFAULT = 0x08
+FLG_HTML = 0x10
+FLG_INDETERMINATE = 0x20
+FLG_OPEN = 0x40
+FLG_IN_RANGE = 0x80
+FLG_OUT_OF_RANGE = 0x100
+
+# Maximum cached patterns to store
+_MAXCACHE = 500
+
+
+@util.lru_cache(maxsize=_MAXCACHE)
+def _cached_css_compile(pattern, namespaces, custom, flags):
+    """Cached CSS compile."""
+
+    custom_selectors = process_custom(custom)
+    return cm.SoupSieve(
+        pattern,
+        CSSParser(pattern, custom=custom_selectors, flags=flags).process_selectors(),
+        namespaces,
+        custom,
+        flags
+    )
+
+
+def _purge_cache():
+    """Purge the cache."""
+
+    _cached_css_compile.cache_clear()
+
+
+def process_custom(custom):
+    """Process custom."""
+
+    custom_selectors = {}
+    if custom is not None:
+        for key, value in custom.items():
+            name = util.lower(key)
+            if RE_CUSTOM.match(name) is None:
+                raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name))
+            if name in custom_selectors:
+                raise KeyError("The custom selector '{}' has already been registered".format(name))
+            custom_selectors[css_unescape(name)] = value
+    return custom_selectors
+
+
+def css_unescape(content, string=False):
+    """
+    Unescape CSS value.
+
+    String values may span multiple lines by escaping the newline.
+ """ + + def replace(m): + """Replace with the appropriate substitute.""" + + if m.group(1): + codepoint = int(m.group(1)[1:], 16) + if codepoint == 0: + codepoint = UNICODE_REPLACEMENT_CHAR + value = util.uchr(codepoint) + elif m.group(2): + value = m.group(2)[1:] + elif m.group(3): + value = '\ufffd' + else: + value = '' + + return value + + return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) + + +def escape(ident): + """Escape identifier.""" + + string = [] + length = len(ident) + start_dash = length > 0 and ident[0] == '-' + if length == 1 and start_dash: + # Need to escape identifier that is a single `-` with no other characters + string.append('\\{}'.format(ident)) + else: + for index, c in enumerate(ident): + codepoint = util.uord(c) + if codepoint == 0x00: + string.append('\ufffd') + elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: + string.append('\\{:x} '.format(codepoint)) + elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39): + string.append('\\{:x} '.format(codepoint)) + elif ( + codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or + (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A) + ): + string.append(c) + else: + string.append('\\{}'.format(c)) + return ''.join(string) + + +class SelectorPattern(object): + """Selector pattern.""" + + def __init__(self, name, pattern): + """Initialize.""" + + self.name = name + self.re_pattern = re.compile(pattern, re.I | re.X | re.U) + + def get_name(self): + """Get name.""" + + return self.name + + def enabled(self, flags): + """Enabled.""" + + return True + + def match(self, selector, index): + """Match the selector.""" + + return self.re_pattern.match(selector, index) + + +class SpecialPseudoPattern(SelectorPattern): + """Selector pattern.""" + + def __init__(self, patterns): + """Initialize.""" + + self.patterns = {} + for p in patterns: + name = p[0] + pattern = SelectorPattern(name, p[2]) + for pseudo in p[1]: + self.patterns[pseudo] = pattern + + self.matched_name = None + self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) + + def get_name(self): + """Get name.""" + + return self.matched_name.get_name() + + def enabled(self, flags): + """Enabled.""" + + return True + + def match(self, selector, index): + """Match the selector.""" + + pseudo = None + m = self.re_pseudo_name.match(selector, index) + if m: + name = util.lower(css_unescape(m.group('name'))) + pattern = self.patterns.get(name) + if pattern: + pseudo = pattern.match(selector, index) + if pseudo: + self.matched_name = pattern + + return pseudo + + +class _Selector(object): + """ + Intermediate selector class. + + This stores selector data for a compound selector as we are acquiring them. + Once we are done collecting the data for a compound selector, we freeze + the data in an object that can be pickled and hashed. 
+    """
+
+    def __init__(self, **kwargs):
+        """Initialize."""
+
+        self.tag = kwargs.get('tag', None)
+        self.ids = kwargs.get('ids', [])
+        self.classes = kwargs.get('classes', [])
+        self.attributes = kwargs.get('attributes', [])
+        self.nth = kwargs.get('nth', [])
+        self.selectors = kwargs.get('selectors', [])
+        self.relations = kwargs.get('relations', [])
+        self.rel_type = kwargs.get('rel_type', None)
+        self.contains = kwargs.get('contains', [])
+        self.lang = kwargs.get('lang', [])
+        self.flags = kwargs.get('flags', 0)
+        self.no_match = kwargs.get('no_match', False)
+
+    def _freeze_relations(self, relations):
+        """Freeze relation."""
+
+        if relations:
+            sel = relations[0]
+            sel.relations.extend(relations[1:])
+            return ct.SelectorList([sel.freeze()])
+        else:
+            return ct.SelectorList()
+
+    def freeze(self):
+        """Freeze self."""
+
+        if self.no_match:
+            return ct.SelectorNull()
+        else:
+            return ct.Selector(
+                self.tag,
+                tuple(self.ids),
+                tuple(self.classes),
+                tuple(self.attributes),
+                tuple(self.nth),
+                tuple(self.selectors),
+                self._freeze_relations(self.relations),
+                self.rel_type,
+                tuple(self.contains),
+                tuple(self.lang),
+                self.flags
+            )
+
+    def __str__(self):  # pragma: no cover
+        """String representation."""
+
+        return (
+            '_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, '
+            'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})'
+        ).format(
+            self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors,
+            self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match
+        )
+
+    __repr__ = __str__
+
+
+class CSSParser(object):
+    """Parse CSS selectors."""
+
+    css_tokens = (
+        SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
+        SpecialPseudoPattern(
+            (
+                ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
+                ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
+                ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
+                ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
+                ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
+            )
+        ),
+        SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
+        SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
+        SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
+        SelectorPattern("at_rule", PAT_AT_RULE),
+        SelectorPattern("id", PAT_ID),
+        SelectorPattern("class", PAT_CLASS),
+        SelectorPattern("tag", PAT_TAG),
+        SelectorPattern("attribute", PAT_ATTR),
+        SelectorPattern("combine", PAT_COMBINE)
+    )
+
+    def __init__(self, selector, custom=None, flags=0):
+        """Initialize."""
+
+        self.pattern = selector.replace('\x00', '\ufffd')
+        self.flags = flags
+        self.debug = self.flags & util.DEBUG
+        self.custom = {} if custom is None else custom
+
+    def parse_attribute_selector(self, sel, m, has_selector):
+        """Create attribute selector from the returned regex match."""
+
+        inverse = False
+        op = m.group('cmp')
+        case = util.lower(m.group('case')) if m.group('case') else None
+        parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
+        ns = ''
+        is_type = False
+        pattern2 = None
+        if len(parts) > 1:
+            ns = parts[0]
+            attr = parts[1]
+        else:
+            attr = parts[0]
+        if case:
+            flags = re.I if case == 'i' else 0
+        elif util.lower(attr) == 'type':
+            flags = re.I
+            is_type = True
+        else:
+            flags = 0
+
+        if op:
+            if m.group('value').startswith(('"', "'")):
+                value = css_unescape(m.group('value')[1:-1], True)
+            else:
+                value = css_unescape(m.group('value'))
+        else:
+            value = None
+        if not op:
+            # Attribute name
+            pattern = None
+        elif op.startswith('^'):
+            # Value starts with
+            pattern = re.compile(r'^%s.*' % re.escape(value), flags)
+        elif op.startswith('$'):
+            # Value ends with
+            pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
+        elif op.startswith('*'):
+            # Value contains
+            pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
+        elif op.startswith('~'):
+            # Value contains word within space separated list
+            # `~=` should match nothing if it is empty or contains whitespace,
+            # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
+            value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
+            pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
+        elif op.startswith('|'):
+            # Value starts with word in dash separated list
+            pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
+        else:
+            # Value matches
+            pattern = re.compile(r'^%s$' % re.escape(value), flags)
+        if op.startswith('!'):
+            # Equivalent to `:not([attr=value])`
+            inverse = True
+        if is_type and pattern:
+            pattern2 = re.compile(pattern.pattern)
+
+        # Append the attribute selector
+        sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
+        if inverse:
+            # If we are using `!=`, we need to nest the pattern under a `:not()`.
+            sub_sel = _Selector()
+            sub_sel.attributes.append(sel_attr)
+            not_list = ct.SelectorList([sub_sel.freeze()], True, False)
+            sel.selectors.append(not_list)
+        else:
+            sel.attributes.append(sel_attr)
+
+        has_selector = True
+        return has_selector
+
+    def parse_tag_pattern(self, sel, m, has_selector):
+        """Parse tag pattern from regex match."""
+
+        parts = [css_unescape(x) for x in m.group(0).split('|')]
+        if len(parts) > 1:
+            prefix = parts[0]
+            tag = parts[1]
+        else:
+            tag = parts[0]
+            prefix = None
+        sel.tag = ct.SelectorTag(tag, prefix)
+        has_selector = True
+        return has_selector
+
+    def parse_pseudo_class_custom(self, sel, m, has_selector):
+        """
+        Parse custom pseudo class alias.
+
+        Compile custom selectors as we need them. When compiling a custom selector,
+        set it to `None` in the dictionary so we can avoid an infinite loop.
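+
+        A custom selector that (directly or indirectly) references itself is
+        therefore reported as an undefined custom selector rather than recursing forever.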
+        """
+
+        pseudo = util.lower(css_unescape(m.group('name')))
+        selector = self.custom.get(pseudo)
+        if selector is None:
+            raise SelectorSyntaxError(
+                "Undefined custom selector '{}' found at position {}".format(pseudo, m.end(0)),
+                self.pattern,
+                m.end(0)
+            )
+
+        if not isinstance(selector, ct.SelectorList):
+            self.custom[pseudo] = None
+            selector = CSSParser(
+                selector, custom=self.custom, flags=self.flags
+            ).process_selectors(flags=FLG_PSEUDO)
+            self.custom[pseudo] = selector
+
+        sel.selectors.append(selector)
+        has_selector = True
+        return has_selector
+
+    def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html):
+        """Parse pseudo class."""
+
+        complex_pseudo = False
+        pseudo = util.lower(css_unescape(m.group('name')))
+        if m.group('open'):
+            complex_pseudo = True
+        if complex_pseudo and pseudo in PSEUDO_COMPLEX:
+            has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
+        elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
+            if pseudo == ':root':
+                sel.flags |= ct.SEL_ROOT
+            elif pseudo == ':defined':
+                sel.flags |= ct.SEL_DEFINED
+                is_html = True
+            elif pseudo == ':scope':
+                sel.flags |= ct.SEL_SCOPE
+            elif pseudo == ':empty':
+                sel.flags |= ct.SEL_EMPTY
+            elif pseudo in (':link', ':any-link'):
+                sel.selectors.append(CSS_LINK)
+            elif pseudo == ':checked':
+                sel.selectors.append(CSS_CHECKED)
+            elif pseudo == ':default':
+                sel.selectors.append(CSS_DEFAULT)
+            elif pseudo == ':indeterminate':
+                sel.selectors.append(CSS_INDETERMINATE)
+            elif pseudo == ":disabled":
+                sel.selectors.append(CSS_DISABLED)
+            elif pseudo == ":enabled":
+                sel.selectors.append(CSS_ENABLED)
+            elif pseudo == ":required":
+                sel.selectors.append(CSS_REQUIRED)
+            elif pseudo == ":optional":
+                sel.selectors.append(CSS_OPTIONAL)
+            elif pseudo == ":read-only":
+                sel.selectors.append(CSS_READ_ONLY)
+            elif pseudo == ":read-write":
+                sel.selectors.append(CSS_READ_WRITE)
+            elif pseudo == ":in-range":
+                sel.selectors.append(CSS_IN_RANGE)
+            elif pseudo == ":out-of-range":
+                sel.selectors.append(CSS_OUT_OF_RANGE)
+            elif pseudo == ":placeholder-shown":
+                sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
+            elif pseudo == ':first-child':
+                sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
+            elif pseudo == ':last-child':
+                sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
+            elif pseudo == ':first-of-type':
+                sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
+            elif pseudo == ':last-of-type':
+                sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
+            elif pseudo == ':only-child':
+                sel.nth.extend(
+                    [
+                        ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
+                        ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
+                    ]
+                )
+            elif pseudo == ':only-of-type':
+                sel.nth.extend(
+                    [
+                        ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
+                        ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
+                    ]
+                )
+            has_selector = True
+        elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
+            self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
+            sel.no_match = True
+            has_selector = True
+        elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
+            sel.no_match = True
+            has_selector = True
+        elif pseudo in PSEUDO_SUPPORTED:
+            raise SelectorSyntaxError(
+                "Invalid syntax for pseudo class '{}'".format(pseudo),
+                self.pattern,
+                m.start(0)
+            )
+        else:
+            raise NotImplementedError(
+                "'{}' pseudo-class is not implemented at this time".format(pseudo)
+            )
+
+        return has_selector, is_html
+
+    def parse_pseudo_nth(self, sel, m, has_selector, iselector):
+        """Parse `nth` pseudo."""
+
+        mdict = m.groupdict()
+        if mdict.get('pseudo_nth_child'):
+            postfix = '_child'
+        else:
+            postfix = '_type'
+        mdict['name'] = util.lower(css_unescape(mdict['name']))
+        content = util.lower(mdict.get('nth' + postfix))
+        if content == 'even':
+            # 2n
+            s1 = 2
+            s2 = 0
+            var = True
+        elif content == 'odd':
+            # 2n+1
+            s1 = 2
+            s2 = 1
+            var = True
+        else:
+            nth_parts = RE_NTH.match(content)
+            s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
+            a = nth_parts.group('a')
+            var = a.endswith('n')
+            if a.startswith('n'):
+                s1 += '1'
+            elif var:
+                s1 += a[:-1]
+            else:
+                s1 += a
+            s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
+            if nth_parts.group('b'):
+                s2 += nth_parts.group('b')
+            else:
+                s2 = '0'
+            s1 = int(s1, 10)
+            s2 = int(s2, 10)
+
+        pseudo_sel = mdict['name']
+        if postfix == '_child':
+            if m.group('of'):
+                # Parse the rest of `of S`.
+                nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
+            else:
+                # Use default `*|*` for `of S`.
+                nth_sel = CSS_NTH_OF_S_DEFAULT
+            if pseudo_sel == ':nth-child':
+                sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
+            elif pseudo_sel == ':nth-last-child':
+                sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
+        else:
+            if pseudo_sel == ':nth-of-type':
+                sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
+            elif pseudo_sel == ':nth-last-of-type':
+                sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
+        has_selector = True
+        return has_selector
+
+    def parse_pseudo_open(self, sel, name, has_selector, iselector, index):
+        """Parse pseudo with opening bracket."""
+
+        flags = FLG_PSEUDO | FLG_OPEN
+        if name == ':not':
+            flags |= FLG_NOT
+        if name == ':has':
+            flags |= FLG_RELATIVE
+
+        sel.selectors.append(self.parse_selectors(iselector, index, flags))
+        has_selector = True
+        return has_selector
+
+    def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index):
+        """Parse combinator tokens."""
+
+        combinator = m.group('relation').strip()
+        if not combinator:
+            combinator = WS_COMBINATOR
+        if combinator == COMMA_COMBINATOR:
+            if not has_selector:
+                # If we've not captured any selector parts, the comma is either at the beginning of the pattern
+                # or following another comma, both of which are unexpected. Commas must split selectors.
+                raise SelectorSyntaxError(
+                    "The combinator '{}' at position {} must have a selector before it".format(combinator, index),
+                    self.pattern,
+                    index
+                )
+            sel.rel_type = rel_type
+            selectors[-1].relations.append(sel)
+            rel_type = ":" + WS_COMBINATOR
+            selectors.append(_Selector())
+        else:
+            if has_selector:
+                # End the current selector and associate the leading combinator with this selector.
+                sel.rel_type = rel_type
+                selectors[-1].relations.append(sel)
+            elif rel_type[1:] != WS_COMBINATOR:
+                # It's impossible to have two whitespace combinators after each other as the patterns
+                # will gobble up trailing whitespace. It is also impossible to have a whitespace
+                # combinator after any other kind for the same reason. But we could have
+                # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
+                # then we've hit the multiple combinator case, so we should fail.
+                raise SelectorSyntaxError(
+                    'Multiple combinators found at position {}'.format(index),
+                    self.pattern,
+                    index
+                )
+            # Set the leading combinator for the next selector.
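+            # For example, in `:has(> p)` the `>` captured here becomes the leading
+            # rel_type `:>`, which is attached to the following `p` selector when the
+            # sub-pattern closes.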
+            rel_type = ':' + combinator
+        sel = _Selector()
+
+        has_selector = False
+        return has_selector, sel, rel_type
+
+    def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index):
+        """Parse combinator tokens."""
+
+        combinator = m.group('relation').strip()
+        if not combinator:
+            combinator = WS_COMBINATOR
+        if not has_selector:
+            raise SelectorSyntaxError(
+                "The combinator '{}' at position {} must have a selector before it".format(combinator, index),
+                self.pattern,
+                index
+            )
+
+        if combinator == COMMA_COMBINATOR:
+            if not sel.tag and not is_pseudo:
+                # Implied `*`
+                sel.tag = ct.SelectorTag('*', None)
+            sel.relations.extend(relations)
+            selectors.append(sel)
+            del relations[:]
+        else:
+            sel.relations.extend(relations)
+            sel.rel_type = combinator
+            del relations[:]
+            relations.append(sel)
+        sel = _Selector()
+
+        has_selector = False
+        return has_selector, sel
+
+    def parse_class_id(self, sel, m, has_selector):
+        """Parse HTML classes and ids."""
+
+        selector = m.group(0)
+        if selector.startswith('.'):
+            sel.classes.append(css_unescape(selector[1:]))
+        else:
+            sel.ids.append(css_unescape(selector[1:]))
+        has_selector = True
+        return has_selector
+
+    def parse_pseudo_contains(self, sel, m, has_selector):
+        """Parse contains."""
+
+        values = m.group('values')
+        patterns = []
+        for token in RE_VALUES.finditer(values):
+            if token.group('split'):
+                continue
+            value = token.group('value')
+            if value.startswith(("'", '"')):
+                value = css_unescape(value[1:-1], True)
+            else:
+                value = css_unescape(value)
+            patterns.append(value)
+        sel.contains.append(ct.SelectorContains(tuple(patterns)))
+        has_selector = True
+        return has_selector
+
+    def parse_pseudo_lang(self, sel, m, has_selector):
+        """Parse pseudo language."""
+
+        values = m.group('values')
+        patterns = []
+        for token in RE_VALUES.finditer(values):
+            if token.group('split'):
+                continue
+            value = token.group('value')
+            if value.startswith(('"', "'")):
+                parts = css_unescape(value[1:-1], True).split('-')
+            else:
+                parts = css_unescape(value).split('-')
+
+            new_parts = []
+            first = True
+            # Note: raw strings keep `\b` a regex word boundary rather than a backspace.
+            for part in parts:
+                if part == '*' and first:
+                    new_parts.append(r'(?!x\b)[a-z0-9]+?')
+                elif part != '*':
+                    new_parts.append(('' if first else r'(-(?!x\b)[a-z0-9]+)*?\-') + re.escape(part))
+                if first:
+                    first = False
+            patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
+        sel.lang.append(ct.SelectorLang(patterns))
+        has_selector = True
+
+        return has_selector
+
+    def parse_pseudo_dir(self, sel, m, has_selector):
+        """Parse pseudo direction."""
+
+        value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
+        sel.flags |= value
+        has_selector = True
+        return has_selector
+
+    def parse_selectors(self, iselector, index=0, flags=0):
+        """Parse selectors."""
+
+        sel = _Selector()
+        selectors = []
+        has_selector = False
+        closed = False
+        relations = []
+        rel_type = ":" + WS_COMBINATOR
+        is_open = bool(flags & FLG_OPEN)
+        is_pseudo = bool(flags & FLG_PSEUDO)
+        is_relative = bool(flags & FLG_RELATIVE)
+        is_not = bool(flags & FLG_NOT)
+        is_html = bool(flags & FLG_HTML)
+        is_default = bool(flags & FLG_DEFAULT)
+        is_indeterminate = bool(flags & FLG_INDETERMINATE)
+        is_in_range = bool(flags & FLG_IN_RANGE)
+        is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
+
+        if self.debug:  # pragma: no cover
+            if is_pseudo:
+                print('    is_pseudo: True')
+            if is_open:
+                print('    is_open: True')
+            if is_relative:
+                print('    is_relative: True')
+            if is_not:
+                print('    is_not: True')
+            if is_html:
+                print('    is_html: True')
+            if is_default:
+                print('    is_default: True')
+            if is_indeterminate:
+                print('    is_indeterminate: True')
+            if is_in_range:
+                print('    is_in_range: True')
+            if is_out_of_range:
+                print('    is_out_of_range: True')
+
+        if is_relative:
+            selectors.append(_Selector())
+
+        try:
+            while True:
+                key, m = next(iselector)
+
+                # Handle parts
+                if key == "at_rule":
+                    raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
+                elif key == 'pseudo_class_custom':
+                    has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
+                elif key == 'pseudo_class':
+                    has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
+                elif key == 'pseudo_element':
+                    raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
+                elif key == 'pseudo_contains':
+                    has_selector = self.parse_pseudo_contains(sel, m, has_selector)
+                elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
+                    has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
+                elif key == 'pseudo_lang':
+                    has_selector = self.parse_pseudo_lang(sel, m, has_selector)
+                elif key == 'pseudo_dir':
+                    has_selector = self.parse_pseudo_dir(sel, m, has_selector)
+                    # Currently only supports HTML
+                    is_html = True
+                elif key == 'pseudo_close':
+                    if not has_selector:
+                        raise SelectorSyntaxError(
+                            "Expected a selector at position {}".format(m.start(0)),
+                            self.pattern,
+                            m.start(0)
+                        )
+                    if is_open:
+                        closed = True
+                        break
+                    else:
+                        raise SelectorSyntaxError(
+                            "Unmatched pseudo-class close at position {}".format(m.start(0)),
+                            self.pattern,
+                            m.start(0)
+                        )
+                elif key == 'combine':
+                    if is_relative:
+                        has_selector, sel, rel_type = self.parse_has_combinator(
+                            sel, m, has_selector, selectors, rel_type, index
+                        )
+                    else:
+                        has_selector, sel = self.parse_combinator(
+                            sel, m, has_selector, selectors, relations, is_pseudo, index
+                        )
+                elif key == 'attribute':
+                    has_selector = self.parse_attribute_selector(sel, m, has_selector)
+                elif key == 'tag':
+                    if has_selector:
+                        raise SelectorSyntaxError(
+                            "Tag name found at position {} instead of at the start".format(m.start(0)),
+                            self.pattern,
+                            m.start(0)
+                        )
+                    has_selector = self.parse_tag_pattern(sel, m, has_selector)
+                elif key in ('class', 'id'):
+                    has_selector = self.parse_class_id(sel, m, has_selector)
+
+                index = m.end(0)
+        except StopIteration:
+            pass
+
+        if is_open and not closed:
+            raise SelectorSyntaxError(
+                "Unclosed pseudo-class at position {}".format(index),
+                self.pattern,
+                index
+            )
+
+        if has_selector:
+            if not sel.tag and not is_pseudo:
+                # Implied `*`
+                sel.tag = ct.SelectorTag('*', None)
+            if is_relative:
+                sel.rel_type = rel_type
+                selectors[-1].relations.append(sel)
+            else:
+                sel.relations.extend(relations)
+                del relations[:]
+                selectors.append(sel)
+        else:
+            # We will always need to finish a selector when `:has()` is used as it leads with combining.
+            raise SelectorSyntaxError(
+                'Expected a selector at position {}'.format(index),
+                self.pattern,
+                index
+            )
+
+        # Some patterns require additional logic, such as default. We try to make these the
+        # last pattern, and append the appropriate flag to that selector which communicates
+        # to the matcher what additional logic is required.
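+        # For example, `:default` compiles with FLG_DEFAULT, so SEL_DEFAULT below
+        # lands on the last sub-selector (the `html|form ... [type="submit"]` pattern).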
+        if is_default:
+            selectors[-1].flags |= ct.SEL_DEFAULT
+        if is_indeterminate:
+            selectors[-1].flags |= ct.SEL_INDETERMINATE
+        if is_in_range:
+            selectors[-1].flags |= ct.SEL_IN_RANGE
+        if is_out_of_range:
+            selectors[-1].flags |= ct.SEL_OUT_OF_RANGE
+
+        return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
+
+    def selector_iter(self, pattern):
+        """Iterate selector tokens."""
+
+        # Ignore whitespace and comments at start and end of pattern
+        m = RE_WS_BEGIN.search(pattern)
+        index = m.end(0) if m else 0
+        m = RE_WS_END.search(pattern)
+        end = (m.start(0) - 1) if m else (len(pattern) - 1)
+
+        if self.debug:  # pragma: no cover
+            print('## PARSING: {!r}'.format(pattern))
+        while index <= end:
+            m = None
+            for v in self.css_tokens:
+                if not v.enabled(self.flags):  # pragma: no cover
+                    continue
+                m = v.match(pattern, index)
+                if m:
+                    name = v.get_name()
+                    if self.debug:  # pragma: no cover
+                        print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0)))
+                    index = m.end(0)
+                    yield name, m
+                    break
+            if m is None:
+                c = pattern[index]
+                # If the character represents the start of one of the known selector types,
+                # throw an exception mentioning that the known selector type is in error;
+                # otherwise, report the invalid character.
+                if c == '[':
+                    msg = "Malformed attribute selector at position {}".format(index)
+                elif c == '.':
+                    msg = "Malformed class selector at position {}".format(index)
+                elif c == '#':
+                    msg = "Malformed id selector at position {}".format(index)
+                elif c == ':':
+                    msg = "Malformed pseudo-class selector at position {}".format(index)
+                else:
+                    msg = "Invalid character {!r} at position {}".format(c, index)
+                raise SelectorSyntaxError(msg, self.pattern, index)
+        if self.debug:  # pragma: no cover
+            print('## END PARSING')
+
+    def process_selectors(self, index=0, flags=0):
+        """Process selectors."""
+
+        return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
+
+
+# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern)
+# A few patterns are order dependent as they use patterns previously compiled.
+
+# CSS pattern for `:link` and `:any-link`
+CSS_LINK = CSSParser(
+    'html|*:is(a, area, link)[href]'
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:checked`
+CSS_CHECKED = CSSParser(
+    '''
+    html|*:is(input[type=checkbox], input[type=radio])[checked],
+    html|select > html|option[selected]
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:default` (must compile CSS_CHECKED first)
+CSS_DEFAULT = CSSParser(
+    '''
+    :checked,
+
+    /*
+    This pattern must be at the end.
+    Special logic is applied to the last selector.
+    */
+    html|form html|*:is(button, input)[type="submit"]
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
+# CSS pattern for `:indeterminate`
+CSS_INDETERMINATE = CSSParser(
+    '''
+    html|input[type="checkbox"][indeterminate],
+    html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
+    html|progress:not([value]),
+
+    /*
+    This pattern must be at the end.
+    Special logic is applied to the last selector.
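+    Per the FLG_INDETERMINATE flag, the matcher applies the radio-group check:
+    a named, unchecked radio only matches if no radio in its group is checked.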
+    */
+    html|input[type="radio"][name][name!='']:not([checked])
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
+# CSS pattern for `:disabled`
+CSS_DISABLED = CSSParser(
+    '''
+    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option)[disabled],
+    html|optgroup[disabled] > html|option,
+    html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
+    html|fieldset[disabled] >
+        html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:enabled`
+CSS_ENABLED = CSSParser(
+    '''
+    html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option):not(:disabled)
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:required`
+CSS_REQUIRED = CSSParser(
+    'html|*:is(input, textarea, select)[required]'
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:optional`
+CSS_OPTIONAL = CSSParser(
+    'html|*:is(input, textarea, select):not([required])'
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:placeholder-shown`
+CSS_PLACEHOLDER_SHOWN = CSSParser(
+    '''
+    html|*:is(
+        input:is(
+            :not([type]),
+            [type=""],
+            [type=text],
+            [type=search],
+            [type=url],
+            [type=tel],
+            [type=email],
+            [type=password],
+            [type=number]
+        ),
+        textarea
+    )[placeholder][placeholder!='']
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern default for `:nth-child` "of S" feature
+CSS_NTH_OF_S_DEFAULT = CSSParser(
+    '*|*'
+).process_selectors(flags=FLG_PSEUDO)
+# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
+CSS_READ_WRITE = CSSParser(
+    '''
+    html|*:is(
+        textarea,
+        input:is(
+            :not([type]),
+            [type=""],
+            [type=text],
+            [type=search],
+            [type=url],
+            [type=tel],
+            [type=email],
+            [type=number],
+            [type=password],
+            [type=date],
+            [type=datetime-local],
+            [type=month],
+            [type=time],
+            [type=week]
+        )
+    ):not([readonly], :disabled),
+    html|*:is([contenteditable=""], [contenteditable="true" i])
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:read-only`
+CSS_READ_ONLY = CSSParser(
+    '''
+    html|*:not(:read-write)
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
+# CSS pattern for `:in-range`
+CSS_IN_RANGE = CSSParser(
+    '''
+    html|input:is(
+        [type="date"],
+        [type="month"],
+        [type="week"],
+        [type="time"],
+        [type="datetime-local"],
+        [type="number"],
+        [type="range"]
+    ):is(
+        [min],
+        [max]
+    )
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
+# CSS pattern for `:out-of-range`
+CSS_OUT_OF_RANGE = CSSParser(
+    '''
+    html|input:is(
+        [type="date"],
+        [type="month"],
+        [type="week"],
+        [type="time"],
+        [type="datetime-local"],
+        [type="number"],
+        [type="range"]
+    ):is(
+        [min],
+        [max]
+    )
+    '''
+).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)
diff --git a/libs/soupsieve/css_types.py b/libs/soupsieve/css_types.py
new file mode 100644
index 000000000..d426287aa
--- /dev/null
+++ b/libs/soupsieve/css_types.py
@@ -0,0 +1,344 @@
+"""CSS selector structure items."""
+from __future__ import unicode_literals
+from . import util
+
+__all__ = (
+    'Selector',
+    'SelectorNull',
+    'SelectorTag',
+    'SelectorAttribute',
+    'SelectorContains',
+    'SelectorNth',
+    'SelectorLang',
+    'SelectorList',
+    'Namespaces',
+    'CustomSelectors'
+)
+
+
+SEL_EMPTY = 0x1
+SEL_ROOT = 0x2
+SEL_DEFAULT = 0x4
+SEL_INDETERMINATE = 0x8
+SEL_SCOPE = 0x10
+SEL_DIR_LTR = 0x20
+SEL_DIR_RTL = 0x40
+SEL_IN_RANGE = 0x80
+SEL_OUT_OF_RANGE = 0x100
+SEL_DEFINED = 0x200
+
+
+class Immutable(object):
+    """Immutable."""
+
+    __slots__ = ('_hash',)
+
+    def __init__(self, **kwargs):
+        """Initialize."""
+
+        temp = []
+        for k, v in kwargs.items():
+            temp.append(type(v))
+            temp.append(v)
+            super(Immutable, self).__setattr__(k, v)
+        super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
+
+    @classmethod
+    def __base__(cls):
+        """Get base class."""
+
+        return cls
+
+    def __eq__(self, other):
+        """Equal."""
+
+        return (
+            isinstance(other, self.__base__()) and
+            all([getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash'])
+        )
+
+    def __ne__(self, other):
+        """Not equal."""
+
+        return (
+            not isinstance(other, self.__base__()) or
+            any([getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash'])
+        )
+
+    def __hash__(self):
+        """Hash."""
+
+        return self._hash
+
+    def __setattr__(self, name, value):
+        """Prevent mutability."""
+
+        raise AttributeError("'{}' is immutable".format(self.__class__.__name__))
+
+    def __repr__(self):  # pragma: no cover
+        """Representation."""
+
+        return "{}({})".format(
+            self.__base__(), ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]])
+        )
+
+    __str__ = __repr__
+
+
+class ImmutableDict(util.Mapping):
+    """Hashable, immutable dictionary."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize."""
+
+        arg = args[0] if args else kwargs
+        is_dict = isinstance(arg, dict)
+        if (
+            is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
+            not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
+        ):
+            raise TypeError('All values must be hashable')
+
+        self._d = dict(*args, **kwargs)
+        self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
+
+    def __iter__(self):
+        """Iterator."""
+
+        return iter(self._d)
+
+    def __len__(self):
+        """Length."""
+
+        return len(self._d)
+
+    def __getitem__(self, key):
+        """Get item: `namespace['key']`."""
+
+        return self._d[key]
+
+    def __hash__(self):
+        """Hash."""
+
+        return self._hash
+
+    def __repr__(self):  # pragma: no cover
+        """Representation."""
+
+        return "{!r}".format(self._d)
+
+    __str__ = __repr__
+
+
+class Namespaces(ImmutableDict):
+    """Namespaces."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize."""
+
+        # If there are arguments, check the first index.
+        # `super` should fail if the user gave multiple arguments,
+        # so don't bother checking that.
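+        # e.g. Namespaces({'svg': 'http://www.w3.org/2000/svg'}) takes a dict or
+        # key/value pairs and validates the entries below before freezing them.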
+        arg = args[0] if args else kwargs
+        is_dict = isinstance(arg, dict)
+        if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
+            raise TypeError('Namespace keys and values must be Unicode strings')
+        elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
+            raise TypeError('Namespace keys and values must be Unicode strings')
+
+        super(Namespaces, self).__init__(*args, **kwargs)
+
+
+class CustomSelectors(ImmutableDict):
+    """Custom selectors."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize."""
+
+        # If there are arguments, check the first index.
+        # `super` should fail if the user gave multiple arguments,
+        # so don't bother checking that.
+        arg = args[0] if args else kwargs
+        is_dict = isinstance(arg, dict)
+        if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
+            raise TypeError('CustomSelectors keys and values must be Unicode strings')
+        elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
+            raise TypeError('CustomSelectors keys and values must be Unicode strings')
+
+        super(CustomSelectors, self).__init__(*args, **kwargs)
+
+
+class Selector(Immutable):
+    """Selector."""
+
+    __slots__ = (
+        'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors',
+        'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
+    )
+
+    def __init__(
+        self, tag, ids, classes, attributes, nth, selectors,
+        relation, rel_type, contains, lang, flags
+    ):
+        """Initialize."""
+
+        super(Selector, self).__init__(
+            tag=tag,
+            ids=ids,
+            classes=classes,
+            attributes=attributes,
+            nth=nth,
+            selectors=selectors,
+            relation=relation,
+            rel_type=rel_type,
+            contains=contains,
+            lang=lang,
+            flags=flags
+        )
+
+
+class SelectorNull(Immutable):
+    """Null Selector."""
+
+    def __init__(self):
+        """Initialize."""
+
+        super(SelectorNull, self).__init__()
+
+
+class SelectorTag(Immutable):
+    """Selector tag."""
+
+    __slots__ = ("name", "prefix", "_hash")
+
+    def __init__(self, name, prefix):
+        """Initialize."""
+
+        super(SelectorTag, self).__init__(
+            name=name,
+            prefix=prefix
+        )
+
+
+class SelectorAttribute(Immutable):
+    """Selector attribute rule."""
+
+    __slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash")
+
+    def __init__(self, attribute, prefix, pattern, xml_type_pattern):
+        """Initialize."""
+
+        super(SelectorAttribute, self).__init__(
+            attribute=attribute,
+            prefix=prefix,
+            pattern=pattern,
+            xml_type_pattern=xml_type_pattern
+        )
+
+
+class SelectorContains(Immutable):
+    """Selector contains rule."""
+
+    __slots__ = ("text", "_hash")
+
+    def __init__(self, text):
+        """Initialize."""
+
+        super(SelectorContains, self).__init__(
+            text=text
+        )
+
+
+class SelectorNth(Immutable):
+    """Selector nth type."""
+
+    __slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash")
+
+    def __init__(self, a, n, b, of_type, last, selectors):
+        """Initialize."""
+
+        super(SelectorNth, self).__init__(
+            a=a,
+            n=n,
+            b=b,
+            of_type=of_type,
+            last=last,
+            selectors=selectors
+        )
+
+
+class SelectorLang(Immutable):
+    """Selector language rules."""
+
+    __slots__ = ("languages", "_hash",)
+
+    def __init__(self, languages):
+        """Initialize."""
+
+        super(SelectorLang, self).__init__(
+            languages=tuple(languages)
+        )
+
+    def __iter__(self):
+        """Iterator."""
+
+        return iter(self.languages)
+
+    def __len__(self):  # pragma: no cover
+        """Length."""
+
+        return len(self.languages)
+
+    def __getitem__(self, index):  # pragma: no cover
+        """Get item."""
+
+        return self.languages[index]
+
+
+class SelectorList(Immutable):
+    """Selector list."""
+
+    __slots__ = ("selectors", "is_not", "is_html", "_hash")
+
+    def __init__(self, selectors=tuple(), is_not=False, is_html=False):
+        """Initialize."""
+
+        super(SelectorList, self).__init__(
+            selectors=tuple(selectors),
+            is_not=is_not,
+            is_html=is_html
+        )
+
+    def __iter__(self):
+        """Iterator."""
+
+        return iter(self.selectors)
+
+    def __len__(self):
+        """Length."""
+
+        return len(self.selectors)
+
+    def __getitem__(self, index):
+        """Get item."""
+
+        return self.selectors[index]
+
+
+def _pickle(p):
+    return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]])
+
+
+def pickle_register(obj):
+    """Allow object to be pickled."""
+
+    util.copyreg.pickle(obj, _pickle)
+
+
+pickle_register(Selector)
+pickle_register(SelectorNull)
+pickle_register(SelectorTag)
+pickle_register(SelectorAttribute)
+pickle_register(SelectorContains)
+pickle_register(SelectorNth)
+pickle_register(SelectorLang)
+pickle_register(SelectorList)
diff --git a/libs/soupsieve/util.py b/libs/soupsieve/util.py
new file mode 100644
index 000000000..6158367a5
--- /dev/null
+++ b/libs/soupsieve/util.py
@@ -0,0 +1,170 @@
+"""Utility."""
+from __future__ import unicode_literals
+from functools import wraps
+import warnings
+import sys
+import struct
+import os
+import re
+MODULE = os.path.dirname(__file__)
+
+PY3 = sys.version_info >= (3, 0)
+PY35 = sys.version_info >= (3, 5)
+PY37 = sys.version_info >= (3, 7)
+
+if PY3:
+    from functools import lru_cache  # noqa F401
+    import copyreg  # noqa F401
+    from collections.abc import Hashable, Mapping  # noqa F401
+
+    ustr = str
+    bstr = bytes
+    unichar = chr
+    string = str
+else:
+    from backports.functools_lru_cache import lru_cache  # noqa F401
+    import copy_reg as copyreg  # noqa F401
+    from collections import Hashable, Mapping  # noqa F401
+
+    ustr = unicode  # noqa: F821
+    bstr = str
+    unichar = unichr  # noqa: F821
+    string = basestring  # noqa: F821
+
+DEBUG = 0x00001
+
+RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
+
+LC_A = ord('a')
+LC_Z = ord('z')
+UC_A = ord('A')
+UC_Z = ord('Z')
+
+
+def lower(string):
+    """Lower."""
+
+    new_string = []
+    for c in string:
+        o = ord(c)
+        new_string.append(chr(o + 32) if UC_A <= o <= UC_Z else c)
+    return ''.join(new_string)
+
+
+def upper(string):  # pragma: no cover
+    """Upper."""
+
+    new_string = []
+    for c in string:
+        o = ord(c)
+        new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
+    return ''.join(new_string)
+
+
+def uchr(i):
+    """Allow getting Unicode character on narrow python builds."""
+
+    try:
+        return unichar(i)
+    except ValueError:  # pragma: no cover
+        return struct.pack('i', i).decode('utf-32')
+
+
+def uord(c):
+    """Get Unicode ordinal."""
+
+    if len(c) == 2:  # pragma: no cover
+        high, low = [ord(p) for p in c]
+        ordinal = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
+    else:
+        ordinal = ord(c)
+
+    return ordinal
+
+
+class SelectorSyntaxError(SyntaxError):
+    """Syntax error in a CSS selector."""
+
+    def __init__(self, msg, pattern=None, index=None):
+        """Initialize."""
+
+        self.line = None
+        self.col = None
+        self.context = None
+
+        if pattern is not None and index is not None:
+            # Format pattern to show line and column position
+            self.context, self.line, self.col = get_pattern_context(pattern, index)
+            msg = '{}\n line {}:\n{}'.format(msg, self.line, self.context)
+
+        super(SelectorSyntaxError, self).__init__(msg)
+
+
+def deprecated(message, stacklevel=2):  # pragma: no cover
+    """
+    Raise a `DeprecationWarning` when wrapped function/method is called.
+
+    Borrowed from https://stackoverflow.com/a/48632082/866026
+    """
+
+    def _decorator(func):
+        @wraps(func)
+        def _func(*args, **kwargs):
+            warnings.warn(
+                "'{}' is deprecated. {}".format(func.__name__, message),
+                category=DeprecationWarning,
+                stacklevel=stacklevel
+            )
+            return func(*args, **kwargs)
+        return _func
+    return _decorator
+
+
+def warn_deprecated(message, stacklevel=2):  # pragma: no cover
+    """Warn deprecated."""
+
+    warnings.warn(
+        message,
+        category=DeprecationWarning,
+        stacklevel=stacklevel
+    )
+
+
+def get_pattern_context(pattern, index):
+    """Get the pattern context."""
+
+    last = 0
+    current_line = 1
+    col = 1
+    text = []
+    line = 1
+
+    # Split pattern by newline and handle the text before the newline
+    for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
+        linetext = pattern[last:m.start(0)]
+        if not len(m.group(0)) and not len(text):
+            indent = ''
+            offset = -1
+            col = index - last + 1
+        elif last <= index < m.end(0):
+            indent = '--> '
+            offset = (-1 if index > m.start(0) else 0) + 3
+            col = index - last + 1
+        else:
+            indent = '    '
+            offset = None
+        if len(text):
+            # Regardless of whether we are presented with `\r\n`, `\r`, or `\n`,
+            # we will render the output with just `\n`. We will still log the column
+            # correctly though.
+            text.append('\n')
+        text.append('{}{}'.format(indent, linetext))
+        if offset is not None:
+            text.append('\n')
+            text.append(' ' * (col + offset) + '^')
+            line = current_line
+
+        current_line += 1
+        last = m.end(0)
+
+    return ''.join(text), line, col
diff --git a/libs/sqlite3worker.py b/libs/sqlite3worker.py
index 989994628..4c9dbe49d 100644
--- a/libs/sqlite3worker.py
+++ b/libs/sqlite3worker.py
@@ -90,11 +90,11 @@ class Sqlite3Worker(threading.Thread):
         """
         LOGGER.debug("run: Thread started")
         execute_count = 0
-        for token, query, values, only_one in iter(self.sql_queue.get, None):
+        for token, query, values, only_one, execute_many in iter(self.sql_queue.get, None):
             LOGGER.debug("sql_queue: %s", self.sql_queue.qsize())
             if token != self.exit_token:
                 LOGGER.debug("run: %s, %s", query, values)
-                self.run_query(token, query, values, only_one)
+                self.run_query(token, query, values, only_one, execute_many)
                 execute_count += 1
                 # Let the executes build up a little before committing to disk
                 # to speed things up.
@@ -112,7 +112,7 @@ class Sqlite3Worker(threading.Thread):
         self.thread_running = False
         return
 
-    def run_query(self, token, query, values, only_one):
+    def run_query(self, token, query, values, only_one=False, execute_many=False):
         """Run a query.
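+
+        With `execute_many` set, `values` should be an iterable of parameter
+        tuples that is handed to `executemany()` rather than a single tuple.
+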
Args: @@ -136,9 +136,14 @@ class Sqlite3Worker(threading.Thread): "Query returned error: %s: %s: %s", query, values, err) else: try: - self.sqlite3_cursor.execute(query, values) - if query.lower().strip().startswith(("insert", "update")): - self.results[token] = self.sqlite3_cursor.rowcount + if execute_many: + self.sqlite3_cursor.executemany(query, values) + if query.lower().strip().startswith(("insert", "update", "delete")): + self.results[token] = self.sqlite3_cursor.rowcount + else: + self.sqlite3_cursor.execute(query, values) + if query.lower().strip().startswith(("insert", "update", "delete")): + self.results[token] = self.sqlite3_cursor.rowcount except sqlite3.Error as err: self.results[token] = ( "Query returned error: %s: %s: %s" % (query, values, err)) @@ -148,7 +153,7 @@ class Sqlite3Worker(threading.Thread): def close(self): """Close down the thread and close the sqlite3 database file.""" self.exit_set = True - self.sql_queue.put((self.exit_token, "", "", ""), timeout=5) + self.sql_queue.put((self.exit_token, "", "", "", ""), timeout=5) # Sleep and check that the thread is done before returning. while self.thread_running: time.sleep(.01) # Don't kill the CPU waiting. @@ -181,7 +186,7 @@ class Sqlite3Worker(threading.Thread): if delay < 8: delay += delay - def execute(self, query, values=None, only_one=False): + def execute(self, query, values=None, only_one=False, execute_many=False): """Execute a query. Args: @@ -200,11 +205,11 @@ class Sqlite3Worker(threading.Thread): token = str(uuid.uuid4()) # If it's a select we queue it up with a token to mark the results # into the output queue so we know what results are ours. - if query.lower().strip().startswith(("select", "insert", "update")): - self.sql_queue.put((token, query, values, only_one), timeout=5) + if query.lower().strip().startswith(("select", "insert", "update", "delete")): + self.sql_queue.put((token, query, values, only_one, execute_many), timeout=5) return self.query_results(token) else: - self.sql_queue.put((token, query, values, only_one), timeout=5) + self.sql_queue.put((token, query, values, only_one, execute_many), timeout=5) def dict_factory(cursor, row): diff --git a/libs/subliminal/__init__.py b/libs/subliminal/__init__.py index 73b137e98..249cb40fe 100644 --- a/libs/subliminal/__init__.py +++ b/libs/subliminal/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import __title__ = 'subliminal' __version__ = '2.1.0.dev' __short_version__ = '.'.join(__version__.split('.')[:2]) diff --git a/libs/subliminal/cache.py b/libs/subliminal/cache.py index 244ba9531..3d8848e1d 100644 --- a/libs/subliminal/cache.py +++ b/libs/subliminal/cache.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import datetime from dogpile.cache import make_region diff --git a/libs/subliminal/cli.py b/libs/subliminal/cli.py index 2eb502fbc..6eaa07fb3 100644 --- a/libs/subliminal/cli.py +++ b/libs/subliminal/cli.py @@ -4,6 +4,7 @@ Subliminal uses `click <http://click.pocoo.org>`_ to provide a powerful :abbr:`C """ from __future__ import division +from __future__ import absolute_import from collections import defaultdict from datetime import timedelta import glob diff --git a/libs/subliminal/converters/addic7ed.py b/libs/subliminal/converters/addic7ed.py index f9cb83161..7c9f1bcc7 100644 --- a/libs/subliminal/converters/addic7ed.py +++ b/libs/subliminal/converters/addic7ed.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import 
LanguageReverseConverter, language_converters diff --git a/libs/subliminal/converters/legendastv.py b/libs/subliminal/converters/legendastv.py index c2e13bd39..991c7888e 100644 --- a/libs/subliminal/converters/legendastv.py +++ b/libs/subliminal/converters/legendastv.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter from ..exceptions import ConfigurationError diff --git a/libs/subliminal/converters/shooter.py b/libs/subliminal/converters/shooter.py index ac6431a62..c25a56bf4 100644 --- a/libs/subliminal/converters/shooter.py +++ b/libs/subliminal/converters/shooter.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter from ..exceptions import ConfigurationError diff --git a/libs/subliminal/converters/thesubdb.py b/libs/subliminal/converters/thesubdb.py index 58051afb8..ef04dd832 100644 --- a/libs/subliminal/converters/thesubdb.py +++ b/libs/subliminal/converters/thesubdb.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter from ..exceptions import ConfigurationError diff --git a/libs/subliminal/converters/tvsubtitles.py b/libs/subliminal/converters/tvsubtitles.py index 45b9fed1a..7fd8f97d9 100644 --- a/libs/subliminal/converters/tvsubtitles.py +++ b/libs/subliminal/converters/tvsubtitles.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter, language_converters diff --git a/libs/subliminal/core.py b/libs/subliminal/core.py index e61502525..cf26f333a 100644 --- a/libs/subliminal/core.py +++ b/libs/subliminal/core.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from collections import defaultdict import platform +from six.moves import range is_windows_special_path = False if platform.system() == "Windows": try: - __file__.decode("ascii") + __file__ except UnicodeDecodeError: is_windows_special_path = True diff --git a/libs/subliminal/extensions.py b/libs/subliminal/extensions.py index 495b68efc..78c66a230 100644 --- a/libs/subliminal/extensions.py +++ b/libs/subliminal/extensions.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from pkg_resources import EntryPoint from stevedore import ExtensionManager diff --git a/libs/subliminal/providers/__init__.py b/libs/subliminal/providers/__init__.py index 3ac78665f..882236ca3 100644 --- a/libs/subliminal/providers/__init__.py +++ b/libs/subliminal/providers/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging from bs4 import BeautifulSoup, FeatureNotFound diff --git a/libs/subliminal/providers/addic7ed.py b/libs/subliminal/providers/addic7ed.py index 2926081e0..d61570366 100644 --- a/libs/subliminal/providers/addic7ed.py +++ b/libs/subliminal/providers/addic7ed.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import re diff --git a/libs/subliminal/providers/legendastv.py b/libs/subliminal/providers/legendastv.py index 925bb9ead..80c433d83 100644 --- a/libs/subliminal/providers/legendastv.py +++ b/libs/subliminal/providers/legendastv.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import json import logging diff --git a/libs/subliminal/providers/napiprojekt.py b/libs/subliminal/providers/napiprojekt.py index 0a1a3aa37..75aba3957 
100644 --- a/libs/subliminal/providers/napiprojekt.py +++ b/libs/subliminal/providers/napiprojekt.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging from babelfish import Language @@ -7,6 +8,7 @@ from requests import Session from . import Provider from .. import __short_version__ from ..subtitle import Subtitle +from six.moves import range logger = logging.getLogger(__name__) diff --git a/libs/subliminal/providers/opensubtitles.py b/libs/subliminal/providers/opensubtitles.py index f7a87fcad..185c5cbfc 100644 --- a/libs/subliminal/providers/opensubtitles.py +++ b/libs/subliminal/providers/opensubtitles.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import base64 import logging import os diff --git a/libs/subliminal/providers/podnapisi.py b/libs/subliminal/providers/podnapisi.py index 5dcf25583..ddcf797d1 100644 --- a/libs/subliminal/providers/podnapisi.py +++ b/libs/subliminal/providers/podnapisi.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import re @@ -138,7 +139,7 @@ class PodnapisiProvider(Provider): if subtitle_xml.find('release').text: for release in subtitle_xml.find('release').text.split(): release = re.sub(r'\.+$', '', release) # remove trailing dots - release = ''.join(filter(lambda x: ord(x) < 128, release)) # remove non-ascii characters + release = ''.join([x for x in release if ord(x) < 128]) # remove non-ascii characters releases.append(release) title = subtitle_xml.find('title').text season = int(subtitle_xml.find('tvSeason').text) diff --git a/libs/subliminal/providers/shooter.py b/libs/subliminal/providers/shooter.py index 62f11bd93..6cfb7489f 100644 --- a/libs/subliminal/providers/shooter.py +++ b/libs/subliminal/providers/shooter.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import json import logging import os diff --git a/libs/subliminal/providers/subscenter.py b/libs/subliminal/providers/subscenter.py index f9bf3c8cb..0ea896d76 100644 --- a/libs/subliminal/providers/subscenter.py +++ b/libs/subliminal/providers/subscenter.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import bisect from collections import defaultdict import io @@ -121,7 +122,7 @@ class SubsCenterProvider(Provider): self.session.close() - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value) def _search_url_titles(self, title): """Search the URL titles by kind for the given `title`. 
@@ -209,7 +210,7 @@ class SubsCenterProvider(Provider): logger.debug('Found subtitle %r', subtitle) subtitles[subtitle_id] = subtitle - return subtitles.values() + return list(subtitles.values()) def list_subtitles(self, video, languages): season = episode = None diff --git a/libs/subliminal/providers/thesubdb.py b/libs/subliminal/providers/thesubdb.py index 87c1e7b79..1c9632dfb 100644 --- a/libs/subliminal/providers/thesubdb.py +++ b/libs/subliminal/providers/thesubdb.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging from babelfish import Language, language_converters diff --git a/libs/subliminal/providers/tvsubtitles.py b/libs/subliminal/providers/tvsubtitles.py index 56b692c6e..277dbdaa6 100644 --- a/libs/subliminal/providers/tvsubtitles.py +++ b/libs/subliminal/providers/tvsubtitles.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import re diff --git a/libs/subliminal/refiners/metadata.py b/libs/subliminal/refiners/metadata.py index a84087423..91c077481 100644 --- a/libs/subliminal/refiners/metadata.py +++ b/libs/subliminal/refiners/metadata.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import os diff --git a/libs/subliminal/refiners/omdb.py b/libs/subliminal/refiners/omdb.py index e2514ae9d..65c651dbf 100644 --- a/libs/subliminal/refiners/omdb.py +++ b/libs/subliminal/refiners/omdb.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import operator diff --git a/libs/subliminal/refiners/tvdb.py b/libs/subliminal/refiners/tvdb.py index 575840455..2f2f4eed0 100644 --- a/libs/subliminal/refiners/tvdb.py +++ b/libs/subliminal/refiners/tvdb.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from datetime import datetime, timedelta from functools import wraps import logging diff --git a/libs/subliminal/score.py b/libs/subliminal/score.py index 61dd22c25..fee95db78 100644 --- a/libs/subliminal/score.py +++ b/libs/subliminal/score.py @@ -28,6 +28,7 @@ Available matches: """ from __future__ import division, print_function +from __future__ import absolute_import import logging from .video import Episode, Movie diff --git a/libs/subliminal/subtitle.py b/libs/subliminal/subtitle.py index 5c2c789b2..e225dec78 100644 --- a/libs/subliminal/subtitle.py +++ b/libs/subliminal/subtitle.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import codecs import logging import os diff --git a/libs/subliminal/utils.py b/libs/subliminal/utils.py index ac426d456..a6c93afea 100644 --- a/libs/subliminal/utils.py +++ b/libs/subliminal/utils.py @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from datetime import datetime import hashlib import os import re import struct +from six.moves import range def hash_opensubtitles(video_path): diff --git a/libs/subliminal/video.py b/libs/subliminal/video.py index 0db6c65c4..c673b204d 100644 --- a/libs/subliminal/video.py +++ b/libs/subliminal/video.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import division +from __future__ import absolute_import from datetime import datetime, timedelta import logging import os diff --git a/libs/subliminal_patch/__init__.py b/libs/subliminal_patch/__init__.py index e2d872246..522280c61 100644 --- a/libs/subliminal_patch/__init__.py +++ b/libs/subliminal_patch/__init__.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import 
absolute_import import subliminal # patch subliminal's subtitle and provider base @@ -12,8 +13,8 @@ from .core import scan_video, search_external_subtitles, list_all_subtitles, sav download_best_subtitles from .score import compute_score from .video import Video -import extensions -import http +from . import extensions +from . import http # patch subliminal's core functions subliminal.scan_video = subliminal.core.scan_video = scan_video diff --git a/libs/subliminal_patch/converters/assrt.py b/libs/subliminal_patch/converters/assrt.py index 06dd91338..729e0d3a5 100644 --- a/libs/subliminal_patch/converters/assrt.py +++ b/libs/subliminal_patch/converters/assrt.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter from subliminal.exceptions import ConfigurationError diff --git a/libs/subliminal_patch/converters/hosszupuska.py b/libs/subliminal_patch/converters/hosszupuska.py index abbe394be..c0be2c3c4 100644 --- a/libs/subliminal_patch/converters/hosszupuska.py +++ b/libs/subliminal_patch/converters/hosszupuska.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter, language_converters diff --git a/libs/subliminal_patch/converters/subscene.py b/libs/subliminal_patch/converters/subscene.py index 66b28de2d..1d1727c2e 100644 --- a/libs/subliminal_patch/converters/subscene.py +++ b/libs/subliminal_patch/converters/subscene.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import from babelfish import LanguageReverseConverter from subliminal.exceptions import ConfigurationError from subzero.language import Language diff --git a/libs/subliminal_patch/converters/supersubtitles.py b/libs/subliminal_patch/converters/supersubtitles.py index cc5697b42..4cb66889f 100644 --- a/libs/subliminal_patch/converters/supersubtitles.py +++ b/libs/subliminal_patch/converters/supersubtitles.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from babelfish import LanguageReverseConverter, language_converters diff --git a/libs/subliminal_patch/converters/titlovi.py b/libs/subliminal_patch/converters/titlovi.py index 761cf79a6..30baece8a 100644 --- a/libs/subliminal_patch/converters/titlovi.py +++ b/libs/subliminal_patch/converters/titlovi.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import logging from babelfish import LanguageReverseConverter diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 0eff91856..fcd7ce3be 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import codecs import json import re @@ -10,7 +11,7 @@ import time import operator import itertools -from httplib import ResponseNotReady +from six.moves.http_client import ResponseNotReady import rarfile import requests @@ -21,7 +22,7 @@ from babelfish import LanguageReverseError from guessit.jsonutils import GuessitEncoder from subliminal import ProviderError, refiner_manager -from extensions import provider_registry +from .extensions import provider_registry from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded from subliminal.score import compute_score as default_compute_score from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb @@ -32,6 +33,7 @@ from subliminal_patch.exceptions import TooManyRequests, APIThrottled from subzero.language 
import Language, ENDSWITH_LANGUAGECODE_RE from scandir import scandir, scandir_generic as _scandir_generic +import six logger = logging.getLogger(__name__) @@ -653,10 +655,10 @@ def search_external_subtitles(path, languages=None, only_one=False, match_strict for folder_or_subfolder in [video_path] + CUSTOM_PATHS: # folder_or_subfolder may be a relative path or an absolute one try: - abspath = unicode(os.path.abspath( + abspath = six.text_type(os.path.abspath( os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder, video_filename]))) - except Exception, e: + except Exception as e: logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e) continue logger.debug("external subs: scanning path %s", abspath) @@ -864,7 +866,7 @@ def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=Non if os.path.exists(subtitle_path): os.remove(subtitle_path) - with open(subtitle_path, 'w') as f: + with open(subtitle_path, 'wb') as f: f.write(content) subtitle.storage_path = subtitle_path else: diff --git a/libs/subliminal_patch/exceptions.py b/libs/subliminal_patch/exceptions.py index 9b166a29a..82f33ade8 100644 --- a/libs/subliminal_patch/exceptions.py +++ b/libs/subliminal_patch/exceptions.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from subliminal import ProviderError diff --git a/libs/subliminal_patch/extensions.py b/libs/subliminal_patch/extensions.py index e155439a6..eeb2ee4cb 100644 --- a/libs/subliminal_patch/extensions.py +++ b/libs/subliminal_patch/extensions.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from collections import OrderedDict import subliminal @@ -43,7 +44,7 @@ class ProviderRegistry(object): self.providers[name] = cls def names(self): - return self.providers.keys() + return list(self.providers.keys()) provider_registry = ProviderRegistry() diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index db313578e..204924b04 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import json from collections import OrderedDict @@ -8,19 +9,21 @@ import os import socket import logging import requests -import xmlrpclib +import six.moves.xmlrpc_client import dns.resolver import ipaddress import re +from six import PY3 from requests import exceptions from urllib3.util import connection from retry.api import retry_call -from exceptions import APIThrottled +from .exceptions import APIThrottled from dogpile.cache.api import NO_VALUE from subliminal.cache import region from subliminal_patch.pitcher import pitchers from cloudscraper import CloudScraper +import six try: import brotli @@ -28,14 +31,14 @@ except: pass try: - from urlparse import urlparse + from six.moves.urllib.parse import urlparse except ImportError: from urllib.parse import urlparse from subzero.lib.io import get_viable_encoding logger = logging.getLogger(__name__) -pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where())) +pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where())) try: default_ssl_context = ssl.create_default_context(cafile=pem_file) except AttributeError: @@ -99,14 +102,18 @@ class CFSession(CloudScraper): # Solve Challenge resp = self.sendChallengeResponse(resp, **kwargs) - except ValueError, e: - if e.message == "Captcha": + except 
ValueError as e: + if PY3: + error = str(e) + else: + error = e.message + if error == "Captcha": parsed_url = urlparse(url) domain = parsed_url.netloc # solve the captcha - site_key = re.search(r'data-sitekey="(.+?)"', resp.content).group(1) - challenge_s = re.search(r'type="hidden" name="s" value="(.+?)"', resp.content).group(1) - challenge_ray = re.search(r'data-ray="(.+?)"', resp.content).group(1) + site_key = re.search(r'data-sitekey="(.+?)"', resp.text).group(1) + challenge_s = re.search(r'type="hidden" name="s" value="(.+?)"', resp.text).group(1) + challenge_ray = re.search(r'data-ray="(.+?)"', resp.text).group(1) if not all([site_key, challenge_s, challenge_ray]): raise Exception("cf: Captcha site-key not found!") @@ -150,7 +157,7 @@ class CFSession(CloudScraper): if cf_data is not NO_VALUE: cf_cookies, hdrs = cf_data logger.debug("Trying to use old cf data for %s: %s", domain, cf_data) - for cookie, value in cf_cookies.iteritems(): + for cookie, value in six.iteritems(cf_cookies): self.cookies.set(cookie, value, domain=domain) self.headers = hdrs @@ -181,10 +188,10 @@ class CFSession(CloudScraper): "Unable to find Cloudflare cookies. Does the site actually have " "Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?") - return (OrderedDict(filter(lambda x: x[1], [ + return (OrderedDict([x for x in [ ("__cfduid", self.cookies.get("__cfduid", "", domain=cookie_domain)), ("cf_clearance", self.cookies.get("cf_clearance", "", domain=cookie_domain)) - ])), + ] if x[1]]), self.headers ) @@ -231,7 +238,7 @@ class RetryingCFSession(RetryingSession, CFSession): pass -class SubZeroRequestsTransport(xmlrpclib.SafeTransport): +class SubZeroRequestsTransport(six.moves.xmlrpc_client.SafeTransport): """ Drop in Transport for xmlrpclib that uses Requests instead of httplib @@ -254,7 +261,7 @@ class SubZeroRequestsTransport(xmlrpclib.SafeTransport): "https": proxy } - xmlrpclib.SafeTransport.__init__(self, *args, **kwargs) + six.moves.xmlrpc_client.SafeTransport.__init__(self, *args, **kwargs) def request(self, host, handler, request_body, verbose=0): """ @@ -317,7 +324,7 @@ def patch_create_connection(): host, port = address try: - ipaddress.ip_address(unicode(host)) + ipaddress.ip_address(six.text_type(host)) except (ipaddress.AddressValueError, ValueError): __custom_resolver_ips = os.environ.get("dns_resolvers", None) diff --git a/libs/subliminal_patch/language.py b/libs/subliminal_patch/language.py index 83f70e155..b001bf5d1 100644 --- a/libs/subliminal_patch/language.py +++ b/libs/subliminal_patch/language.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from subliminal.converters.addic7ed import Addic7edConverter from babelfish.converters.opensubtitles import OpenSubtitlesConverter diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py index b2cef63b3..be1302943 100644 --- a/libs/subliminal_patch/pitcher.py +++ b/libs/subliminal_patch/pitcher.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os import time import logging @@ -9,6 +10,8 @@ from dogpile.cache.api import NO_VALUE from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\ Proxy from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT +import six +from six.moves import range logger = logging.getLogger(__name__) @@ -167,7 +170,7 @@ class AntiCaptchaPitcher(AntiCaptchaProxyLessPitcher): self.user_agent = kwargs.pop("user_agent") cookies = kwargs.pop("cookies", {}) if 
isinstance(cookies, dict): - self.cookies = ";".join(["%s=%s" % (k, v) for k, v in cookies.iteritems()]) + self.cookies = ";".join(["%s=%s" % (k, v) for k, v in six.iteritems(cookies)]) super(AntiCaptchaPitcher, self).__init__(*args, **kwargs) diff --git a/libs/subliminal_patch/providers/__init__.py b/libs/subliminal_patch/providers/__init__.py index a3520fce5..ced4694f3 100644 --- a/libs/subliminal_patch/providers/__init__.py +++ b/libs/subliminal_patch/providers/__init__.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import importlib import os import subliminal @@ -10,6 +11,7 @@ from subliminal_patch.http import RetryingSession from subliminal_patch.subtitle import Subtitle, guess_matches from subzero.lib.io import get_viable_encoding +import six class Provider(_Provider): @@ -20,7 +22,7 @@ class Provider(_Provider): # register providers # fixme: this is bad -for name in os.listdir(os.path.dirname(unicode(__file__, get_viable_encoding()))): +for name in os.listdir(os.path.dirname(__file__)): if name in ("__init__.py", "mixins.py", "utils.py") or not name.endswith(".py"): continue diff --git a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index f24a35634..880a4729c 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -112,13 +112,13 @@ class Addic7edProvider(_Addic7edProvider): while tries <= 3: tries += 1 r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url}) - if "g-recaptcha" in r.content or "grecaptcha" in r.content: + if "g-recaptcha" in r.text or "grecaptcha" in r.text: logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only ' 'happen once every so often') for g, s in (("g-recaptcha-response", r'g-recaptcha.+?data-sitekey=\"(.+?)\"'), ("recaptcha_response", r'grecaptcha.execute\(\'(.+?)\',')): - site_key = re.search(s, r.content).group(1) + site_key = re.search(s, r.text).group(1) if site_key: break if not site_key: @@ -142,10 +142,10 @@ class Addic7edProvider(_Addic7edProvider): r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, headers={"Referer": self.server_url + "login.php"}) - if "relax, slow down" in r.content: + if "relax, slow down" in r.text: raise TooManyRequests(self.username) - if "Wrong password" in r.content or "doesn't exist" in r.content: + if "Wrong password" in r.text or "doesn't exist" in r.text: raise AuthenticationError(self.username) if r.status_code != 302: @@ -240,8 +240,8 @@ class Addic7edProvider(_Addic7edProvider): if show_cells: soup = ParserBeautifulSoup(b''.join(show_cells).decode('utf-8', 'ignore'), ['lxml', 'html.parser']) else: - # If RegEx fails, fall back to original r.content and use 'html.parser' - soup = ParserBeautifulSoup(r.content, ['html.parser']) + # If RegEx fails, fall back to original r.text and use 'html.parser' + soup = ParserBeautifulSoup(r.text, ['html.parser']) # populate the show ids show_ids = {} @@ -301,7 +301,7 @@ class Addic7edProvider(_Addic7edProvider): r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers) r.raise_for_status() - if r.content and "Sorry, your search" not in r.content: + if r.text and "Sorry, your search" not in r.text: break time.sleep(4) @@ -309,7 +309,7 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + soup 
= ParserBeautifulSoup(r.text, ['lxml', 'html.parser']) suggestion = None @@ -351,13 +351,13 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - if not r.content: + if not r.text: # Provider wrongfully returns a status of 304 Not Modified with empty content # raise_for_status won't raise exception for that status code logger.error('No data returned from provider') return [] - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser']) # loop over subtitle rows subtitles = [] @@ -400,7 +400,7 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - if not r.content: + if not r.text: # Provider wrongfully returns a status of 304 Not Modified with empty content # raise_for_status won't raise exception for that status code logger.error('Unable to download subtitle. No data returned from provider') diff --git a/libs/subliminal_patch/providers/argenteam.py b/libs/subliminal_patch/providers/argenteam.py index 5fe3d28cc..c3707925f 100644 --- a/libs/subliminal_patch/providers/argenteam.py +++ b/libs/subliminal_patch/providers/argenteam.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import logging import os import io diff --git a/libs/subliminal_patch/providers/assrt.py b/libs/subliminal_patch/providers/assrt.py index 67300dd0b..4831da792 100644 --- a/libs/subliminal_patch/providers/assrt.py +++ b/libs/subliminal_patch/providers/assrt.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import json import logging import os @@ -20,7 +21,7 @@ logger = logging.getLogger(__name__) language_converters.register('assrt = subliminal_patch.converters.assrt:AssrtConverter') server_url = 'https://api.assrt.net/v1' -supported_languages = language_converters['assrt'].to_assrt.keys() +supported_languages = list(language_converters['assrt'].to_assrt.keys()) class AssrtSubtitle(Subtitle): @@ -139,7 +140,7 @@ class AssrtProvider(Provider): logger.debug('No subtitle found') # parse the subtitles - pattern = re.compile(ur'lang(?P<code>\w+)') + pattern = re.compile(r'lang(?P<code>\w+)') subtitles = [] for sub in result['sub']['subs']: if 'lang' not in sub: diff --git a/libs/subliminal_patch/providers/betaseries.py b/libs/subliminal_patch/providers/betaseries.py index 4452e614b..b44de5329 100644 --- a/libs/subliminal_patch/providers/betaseries.py +++ b/libs/subliminal_patch/providers/betaseries.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import json import logging import os diff --git a/libs/subliminal_patch/providers/greeksubtitles.py b/libs/subliminal_patch/providers/greeksubtitles.py index 979f4e191..2169885e3 100644 --- a/libs/subliminal_patch/providers/greeksubtitles.py +++ b/libs/subliminal_patch/providers/greeksubtitles.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import os diff --git a/libs/subliminal_patch/providers/hosszupuska.py b/libs/subliminal_patch/providers/hosszupuska.py index 186fb06d7..62cf77bc8 100644 --- a/libs/subliminal_patch/providers/hosszupuska.py +++ b/libs/subliminal_patch/providers/hosszupuska.py @@ -1,5 +1,6 @@ # coding: utf-8 +from __future__ import absolute_import import io import six import logging @@ -115,7 +116,7 @@ class HosszupuskaSubtitle(Subtitle): if video.format and self.version and video.format.lower() in self.version.lower(): matches.add('format') # other properties - matches
|= guess_matches(video, guessit(self.release_info.encode("utf-8"))) + matches |= guess_matches(video, guessit(self.release_info)) return matches diff --git a/libs/subliminal_patch/providers/legendastv.py b/libs/subliminal_patch/providers/legendastv.py index 0661b5f4d..285e0c89e 100644 --- a/libs/subliminal_patch/providers/legendastv.py +++ b/libs/subliminal_patch/providers/legendastv.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import logging import rarfile import os @@ -197,7 +198,7 @@ class LegendasTVProvider(_LegendasTVProvider): expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds() # attempt to get the releases from the cache - cache_key = releases_key.format(archive_id=a.id, archive_name=a.name) + cache_key = str(a.id + "|" + a.name) releases = region.get(cache_key, expiration_time=expiration_time) # the releases are not in cache or cache is expired diff --git a/libs/subliminal_patch/providers/mixins.py b/libs/subliminal_patch/providers/mixins.py index 255caa447..98299e02c 100644 --- a/libs/subliminal_patch/providers/mixins.py +++ b/libs/subliminal_patch/providers/mixins.py @@ -1,12 +1,13 @@ # coding=utf-8 +from __future__ import absolute_import import re import time import logging import traceback import types import os -from httplib import ResponseNotReady +from six.moves.http_client import ResponseNotReady from guessit import guessit from subliminal import ProviderError @@ -107,7 +108,7 @@ class ProviderSubtitleArchiveMixin(object): if "format" in subtitle.matches: format_matches = False - if isinstance(subtitle.releases, types.ListType): + if isinstance(subtitle.releases, list): releases = ",".join(subtitle.releases).lower() else: releases = subtitle.releases.lower() @@ -117,7 +118,7 @@ class ProviderSubtitleArchiveMixin(object): else: formats = guess["format"] - if not isinstance(formats, types.ListType): + if not isinstance(formats, list): formats = [formats] for f in formats: @@ -157,13 +158,5 @@ class ProviderSubtitleArchiveMixin(object): elif subs_fallback: matching_sub = subs_fallback[0] - try: - matching_sub_unicode = matching_sub.decode("utf-8") - except UnicodeDecodeError: - try: - matching_sub_unicode = matching_sub.decode("cp437") - except UnicodeDecodeError: - matching_sub_unicode = matching_sub.decode("utf-8", errors='replace') - - logger.info(u"Using %s from the archive", matching_sub_unicode) + logger.info(u"Using %s from the archive", matching_sub) return fix_line_ending(archive.read(matching_sub)) diff --git a/libs/subliminal_patch/providers/napiprojekt.py b/libs/subliminal_patch/providers/napiprojekt.py index 0b80f821d..119892b0a 100644 --- a/libs/subliminal_patch/providers/napiprojekt.py +++ b/libs/subliminal_patch/providers/napiprojekt.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import logging from subliminal.providers.napiprojekt import NapiProjektProvider as _NapiProjektProvider, \ diff --git a/libs/subliminal_patch/providers/napisy24.py b/libs/subliminal_patch/providers/napisy24.py index 553e948f2..acbc2bba9 100644 --- a/libs/subliminal_patch/providers/napisy24.py +++ b/libs/subliminal_patch/providers/napisy24.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import logging import os from io import BytesIO diff --git a/libs/subliminal_patch/providers/nekur.py b/libs/subliminal_patch/providers/nekur.py index 06fb1401f..d9892afb5 100644 --- a/libs/subliminal_patch/providers/nekur.py +++ b/libs/subliminal_patch/providers/nekur.py @@ -1,4 +1,5 @@ # -*- 
coding: utf-8 -*- +from __future__ import absolute_import import io import logging from random import randint diff --git a/libs/subliminal_patch/providers/opensubtitles.py b/libs/subliminal_patch/providers/opensubtitles.py index a314087c2..34007ed4e 100644 --- a/libs/subliminal_patch/providers/opensubtitles.py +++ b/libs/subliminal_patch/providers/opensubtitles.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import base64 import logging import os @@ -13,7 +14,7 @@ from subliminal.exceptions import ConfigurationError, ServiceUnavailable from subliminal.providers.opensubtitles import OpenSubtitlesProvider as _OpenSubtitlesProvider,\ OpenSubtitlesSubtitle as _OpenSubtitlesSubtitle, Episode, Movie, ServerProxy, Unauthorized, NoSession, \ DownloadLimitReached, InvalidImdbid, UnknownUserAgent, DisabledUserAgent, OpenSubtitlesError -from mixins import ProviderRetryMixin +from .mixins import ProviderRetryMixin from subliminal.subtitle import fix_line_ending from subliminal_patch.http import SubZeroRequestsTransport from subliminal_patch.utils import sanitize @@ -153,7 +154,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): self.token = response['token'] logger.debug('Logged in with token %r', self.token[:10]+"X"*(len(self.token)-10)) - region.set("os_token", self.token) + region.set("os_token", bytearray(self.token, encoding='utf-8')) def use_token_or_login(self, func): if not self.token: @@ -174,7 +175,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): logger.info('Logging in') - token = region.get("os_token") + token = str(region.get("os_token")) if token is not NO_VALUE: try: logger.debug('Trying previous token: %r', token[:10]+"X"*(len(token)-10)) diff --git a/libs/subliminal_patch/providers/podnapisi.py b/libs/subliminal_patch/providers/podnapisi.py index 16b7c2d7e..b388f8560 100644 --- a/libs/subliminal_patch/providers/podnapisi.py +++ b/libs/subliminal_patch/providers/podnapisi.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging import re import io diff --git a/libs/subliminal_patch/providers/shooter.py b/libs/subliminal_patch/providers/shooter.py index af3870d99..8b81856df 100644 --- a/libs/subliminal_patch/providers/shooter.py +++ b/libs/subliminal_patch/providers/shooter.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import from subliminal.providers.shooter import ShooterProvider as _ShooterProvider, ShooterSubtitle as _ShooterSubtitle diff --git a/libs/subliminal_patch/providers/subdivx.py b/libs/subliminal_patch/providers/subdivx.py index 8fd2b58cf..ab7db33b6 100644 --- a/libs/subliminal_patch/providers/subdivx.py +++ b/libs/subliminal_patch/providers/subdivx.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import os @@ -15,6 +16,7 @@ from subliminal.providers import ParserBeautifulSoup, Provider from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending,guess_matches from subliminal.video import Episode, Movie from subliminal_patch.exceptions import ParseResponseError +from six.moves import range logger = logging.getLogger(__name__) diff --git a/libs/subliminal_patch/providers/subs4free.py b/libs/subliminal_patch/providers/subs4free.py index 153b9f469..e25e1a7e2 100644 --- a/libs/subliminal_patch/providers/subs4free.py +++ b/libs/subliminal_patch/providers/subs4free.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging 
import os diff --git a/libs/subliminal_patch/providers/subs4series.py b/libs/subliminal_patch/providers/subs4series.py index d58d8aa23..96b756914 100644 --- a/libs/subliminal_patch/providers/subs4series.py +++ b/libs/subliminal_patch/providers/subs4series.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import os diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 7f6e3911b..a377813f7 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import io import logging import os @@ -11,8 +12,8 @@ import requests import inflect import re import json -import HTMLParser -import urlparse +import six.moves.html_parser +import six.moves.urllib.parse from zipfile import ZipFile from babelfish import language_converters @@ -29,6 +30,7 @@ from subliminal_patch.subtitle import Subtitle, guess_matches from subliminal_patch.converters.subscene import language_ids, supported_languages from subscene_api.subscene import search, Subtitle as APISubtitle, SITE_DOMAIN from subzero.language import Language +import six p = inflect.engine() @@ -150,16 +152,16 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): def login(self): r = self.session.get("https://subscene.com/account/login") - if "Server Error" in r.content: + if "Server Error" in r.text: logger.error("Login unavailable; Maintenance?") raise ServiceUnavailable("Login unavailable; Maintenance?") - match = re.search(r"<script id='modelJson' type='application/json'>\s*(.+)\s*</script>", r.content) + match = re.search(r"<script id='modelJson' type='application/json'>\s*(.+)\s*</script>", r.text) if match: - h = HTMLParser.HTMLParser() + h = six.moves.html_parser.HTMLParser() data = json.loads(h.unescape(match.group(1))) - login_url = urlparse.urljoin(data["siteUrl"], data["loginUrl"]) + login_url = six.moves.urllib.parse.urljoin(data["siteUrl"], data["loginUrl"]) time.sleep(1.0) r = self.session.post(login_url, @@ -176,7 +178,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): r"scope.+?value=\"(?P<scope>.+?)\".+?" r"state.+?value=\"(?P<state>.+?)\".+?" 
r"session_state.+?value=\"(?P<session_state>.+?)\"", - r.content, re.MULTILINE | re.DOTALL) + r.text, re.MULTILINE | re.DOTALL) if pep_content: r = self.session.post(SITE_DOMAIN, pep_content.groupdict()) @@ -187,7 +189,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): else: cj = self.session.cookies.copy() store_cks = ("scene", "idsrv", "idsrv.xsrf", "idsvr.clients", "idsvr.session", "idsvr.username") - for cn in self.session.cookies.iterkeys(): + for cn in six.iterkeys(self.session.cookies): if cn not in store_cks: del cj[cn] @@ -266,7 +268,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): for s in film.subtitles: try: subtitle = SubsceneSubtitle.from_api(s) - except NotImplementedError, e: + except NotImplementedError as e: logger.info(e) continue subtitle.asked_for_release_group = video.release_group diff --git a/libs/subliminal_patch/providers/subscenter.py b/libs/subliminal_patch/providers/subscenter.py index 5626c9935..4bcf1c4f2 100644 --- a/libs/subliminal_patch/providers/subscenter.py +++ b/libs/subliminal_patch/providers/subscenter.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import from subliminal.providers.subscenter import SubsCenterProvider as _SubsCenterProvider, \ SubsCenterSubtitle as _SubsCenterSubtitle from subzero.language import Language diff --git a/libs/subliminal_patch/providers/subssabbz.py b/libs/subliminal_patch/providers/subssabbz.py index ddcd47a7b..709029169 100644 --- a/libs/subliminal_patch/providers/subssabbz.py +++ b/libs/subliminal_patch/providers/subssabbz.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import re import io diff --git a/libs/subliminal_patch/providers/subsunacs.py b/libs/subliminal_patch/providers/subsunacs.py index d616901eb..5af116d99 100644 --- a/libs/subliminal_patch/providers/subsunacs.py +++ b/libs/subliminal_patch/providers/subsunacs.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import re import io diff --git a/libs/subliminal_patch/providers/subtitriid.py b/libs/subliminal_patch/providers/subtitriid.py index d21ec7324..516cf02df 100644 --- a/libs/subliminal_patch/providers/subtitriid.py +++ b/libs/subliminal_patch/providers/subtitriid.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging from random import randint diff --git a/libs/subliminal_patch/providers/subtitulamostv.py b/libs/subliminal_patch/providers/subtitulamostv.py index 4ad74a67b..fa3d4afa9 100644 --- a/libs/subliminal_patch/providers/subtitulamostv.py +++ b/libs/subliminal_patch/providers/subtitulamostv.py @@ -120,7 +120,7 @@ class SubtitulamosTVProvider(Provider): if video.alternative_series is None: if video.series.lower() == s['name'].lower(): matches.add('series') - elif s['name'].lower() in [video.series.lower()]+map(lambda name: name.lower(), video.alternative_series): + elif s['name'].lower() in [video.series.lower()]+list(map(lambda name: name.lower(), video.alternative_series)): matches.add('series') if video.season == e['season']: matches.add('season') @@ -153,7 +153,7 @@ class SubtitulamosTVProvider(Provider): r.raise_for_status() r.encoding = "ISO-8859-1" - subtitle_content = r.text + subtitle_content = r.content if subtitle_content: subtitle.content = fix_line_ending(subtitle_content) diff --git a/libs/subliminal_patch/providers/subz.py b/libs/subliminal_patch/providers/subz.py index 1519c7f9e..b08676d6a 100644 --- 
a/libs/subliminal_patch/providers/subz.py +++ b/libs/subliminal_patch/providers/subz.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import json import logging diff --git a/libs/subliminal_patch/providers/supersubtitles.py b/libs/subliminal_patch/providers/supersubtitles.py index 97c841103..a537ac7ab 100644 --- a/libs/subliminal_patch/providers/supersubtitles.py +++ b/libs/subliminal_patch/providers/supersubtitles.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import io import six import os @@ -83,7 +84,7 @@ class SuperSubtitlesSubtitle(Subtitle): return str(self.subtitle_id) def get_matches(self, video): - matches = guess_matches(video, guessit(self.release_info.encode("utf-8"))) + matches = guess_matches(video, guessit(self.release_info)) # episode if isinstance(video, Episode): diff --git a/libs/subliminal_patch/providers/titlovi.py b/libs/subliminal_patch/providers/titlovi.py index 9be0a92f6..fda393343 100644 --- a/libs/subliminal_patch/providers/titlovi.py +++ b/libs/subliminal_patch/providers/titlovi.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import io import logging import re @@ -28,6 +29,7 @@ from subliminal.subtitle import fix_line_ending from subzero.language import Language from dogpile.cache.api import NO_VALUE from subliminal.cache import region +from six.moves import map # parsing regex definitions title_re = re.compile(r'(?P<title>(?:.+(?= [Aa][Kk][Aa] ))|.+)(?:(?:.+)(?P<altitle>(?<= [Aa][Kk][Aa] ).+))?') @@ -201,7 +203,7 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): # handle possible duplicate use of Serbian Latin if "sr" in lang_strings and "sr-Latn" in lang_strings: logger.info('Duplicate entries <Language [sr]> and <Language [sr-Latn]> found, filtering languages') - used_languages = filter(lambda l: l != Language.fromietf('sr-Latn'), used_languages) + used_languages = [l for l in used_languages if l != Language.fromietf('sr-Latn')] logger.info('Filtered language list %r', used_languages) # convert list of languages into search string diff --git a/libs/subliminal_patch/providers/titulky.py b/libs/subliminal_patch/providers/titulky.py index 294549bd6..87b88e40f 100644 --- a/libs/subliminal_patch/providers/titulky.py +++ b/libs/subliminal_patch/providers/titulky.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import os @@ -184,7 +185,7 @@ class TitulkyProvider(Provider): 'Password': self.password} r = self.session.post(self.server_url, data, allow_redirects=False, timeout=10) - if 'BadLogin' in r.content: + if 'BadLogin' in r.text: raise AuthenticationError(self.username) logger.debug('Logged in') @@ -258,9 +259,9 @@ class TitulkyProvider(Provider): langs = 'CZ' name = '%s (%s)' % (version, langs) - if ('CZ' in langs): + if b'CZ' in langs: language = Language('ces') - elif ('SK' in langs): + elif b'SK' in langs: language = Language('slk') # read the item # subtitle = self.subtitle_class(language, page_link, year, version, page_link.replace("detail", "dld")) diff --git a/libs/subliminal_patch/providers/tvsubtitles.py b/libs/subliminal_patch/providers/tvsubtitles.py index 7f359772d..9e0ec4216 100644 --- a/libs/subliminal_patch/providers/tvsubtitles.py +++ b/libs/subliminal_patch/providers/tvsubtitles.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging diff --git a/libs/subliminal_patch/providers/xsubs.py b/libs/subliminal_patch/providers/xsubs.py index 
c584aacf9..9f8854b92 100644 --- a/libs/subliminal_patch/providers/xsubs.py +++ b/libs/subliminal_patch/providers/xsubs.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import re diff --git a/libs/subliminal_patch/providers/zimuku.py b/libs/subliminal_patch/providers/zimuku.py index 5090816f8..0dad58cde 100644 --- a/libs/subliminal_patch/providers/zimuku.py +++ b/libs/subliminal_patch/providers/zimuku.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import io import logging import os diff --git a/libs/subliminal_patch/refiners/common.py b/libs/subliminal_patch/refiners/common.py index eda3c967c..b8625b8eb 100644 --- a/libs/subliminal_patch/refiners/common.py +++ b/libs/subliminal_patch/refiners/common.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging import os diff --git a/libs/subliminal_patch/refiners/drone.py b/libs/subliminal_patch/refiners/drone.py index ae96e26c9..baa183d1e 100644 --- a/libs/subliminal_patch/refiners/drone.py +++ b/libs/subliminal_patch/refiners/drone.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging import types import os @@ -10,6 +11,7 @@ from requests.compat import urljoin, quote from subliminal import Episode, Movie, region from subliminal_patch.core import remove_crap_from_fn from subliminal_patch.http import CertifiSession +import six logger = logging.getLogger(__name__) @@ -52,11 +54,11 @@ class DroneAPIClient(object): :return: """ out = {} - for key, value in params.iteritems(): - if not isinstance(value, types.StringTypes): + for key, value in six.iteritems(params): + if not isinstance(value, (str,)): value = str(value) - elif isinstance(value, unicode): + elif isinstance(value, six.text_type): value = value.encode("utf-8") key = key.split('_')[0] + ''.join(x.capitalize() for x in key.split('_')[1:]) diff --git a/libs/subliminal_patch/refiners/file_info_file.py b/libs/subliminal_patch/refiners/file_info_file.py index 2670cec54..5af40d54e 100644 --- a/libs/subliminal_patch/refiners/file_info_file.py +++ b/libs/subliminal_patch/refiners/file_info_file.py @@ -1,10 +1,11 @@ # coding=utf-8 +from __future__ import absolute_import import sys import os import logging import codecs -from common import update_video +from .common import update_video logger = logging.getLogger(__name__) diff --git a/libs/subliminal_patch/refiners/filebot.py b/libs/subliminal_patch/refiners/filebot.py index 332026d9d..e71f2f569 100644 --- a/libs/subliminal_patch/refiners/filebot.py +++ b/libs/subliminal_patch/refiners/filebot.py @@ -1,8 +1,9 @@ # coding=utf-8 +from __future__ import absolute_import import logging from libfilebot import get_filebot_attrs -from common import update_video +from .common import update_video logger = logging.getLogger(__name__) diff --git a/libs/subliminal_patch/refiners/metadata.py b/libs/subliminal_patch/refiners/metadata.py index 74c9f278c..40a9bec3e 100644 --- a/libs/subliminal_patch/refiners/metadata.py +++ b/libs/subliminal_patch/refiners/metadata.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging import os diff --git a/libs/subliminal_patch/refiners/omdb.py b/libs/subliminal_patch/refiners/omdb.py index f8cfba105..fe209f323 100644 --- a/libs/subliminal_patch/refiners/omdb.py +++ b/libs/subliminal_patch/refiners/omdb.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import os import subliminal import base64 diff --git 
a/libs/subliminal_patch/refiners/symlinks.py b/libs/subliminal_patch/refiners/symlinks.py index 9fd246995..e7b9e5120 100644 --- a/libs/subliminal_patch/refiners/symlinks.py +++ b/libs/subliminal_patch/refiners/symlinks.py @@ -1,7 +1,8 @@ # coding=utf-8 +from __future__ import absolute_import import os -from common import update_video +from .common import update_video def refine(video, **kwargs): diff --git a/libs/subliminal_patch/refiners/tvdb.py b/libs/subliminal_patch/refiners/tvdb.py index 808c8ef90..dbbe5b5e2 100644 --- a/libs/subliminal_patch/refiners/tvdb.py +++ b/libs/subliminal_patch/refiners/tvdb.py @@ -1,11 +1,12 @@ # coding=utf-8 +from __future__ import absolute_import import datetime from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \ get_series_episode, region, tvdb_client -from util import fix_session_bases +from .util import fix_session_bases TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds() diff --git a/libs/subliminal_patch/refiners/util.py b/libs/subliminal_patch/refiners/util.py index f83701684..a681c88d4 100644 --- a/libs/subliminal_patch/refiners/util.py +++ b/libs/subliminal_patch/refiners/util.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import types from subliminal_patch.http import TimeoutSession diff --git a/libs/subliminal_patch/score.py b/libs/subliminal_patch/score.py index 918f8f668..c7b687e8c 100644 --- a/libs/subliminal_patch/score.py +++ b/libs/subliminal_patch/score.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging from subliminal.video import Episode, Movie diff --git a/libs/subliminal_patch/subtitle.py b/libs/subliminal_patch/subtitle.py index 057be546a..1b3ce002a 100644 --- a/libs/subliminal_patch/subtitle.py +++ b/libs/subliminal_patch/subtitle.py @@ -1,6 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import import logging import traceback @@ -103,7 +104,7 @@ class Subtitle(Subtitle_): self.set_encoding("utf-8") # normalize line endings - self.content = self.content.replace("\r\n", "\n").replace('\r', '\n') + self.content = self.content.replace(b"\r\n", b"\n").replace(b'\r', b'\n') def guess_encoding(self): """Guess encoding using the language, falling back on chardet. 
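# A minimal sketch, not part of the patch, of the bytes-vs-text rule that the
# subtitle.py hunks here and the earlier save_subtitles() change ('w' -> 'wb')
# both follow: under Python 3 subtitle payloads stay bytes end to end, so the
# replace patterns and the file mode must be byte-oriented as well.
content = b"line one\r\nline two\rline three\n"
# Python 2 accepted str patterns here because str was a byte string;
# Python 3's bytes.replace() requires bytes arguments, hence the b"" literals.
normalized = content.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
assert normalized == b"line one\nline two\nline three\n"
# Writing likewise needs binary mode; text mode ('w') would require str and
# re-encode the payload. "example.srt" is a placeholder name.
with open("example.srt", "wb") as f:
    f.write(normalized)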
@@ -272,9 +273,9 @@ class Subtitle(Subtitle_): def prepare_text(text, style): body = [] for fragment, sty in parse_tags(text, style, sub.styles): - fragment = fragment.replace(ur"\h", u" ") - fragment = fragment.replace(ur"\n", u"\n") - fragment = fragment.replace(ur"\N", u"\n") + fragment = fragment.replace(r"\h", u" ") + fragment = fragment.replace(r"\n", u"\n") + fragment = fragment.replace(r"\N", u"\n") if format == "srt": if sty.italic: fragment = u"<i>%s</i>" % fragment @@ -362,7 +363,7 @@ def guess_matches(video, guess, partial=False): # series if video.series and 'title' in guess: titles = guess["title"] - if not isinstance(titles, types.ListType): + if not isinstance(titles, list): titles = [titles] for title in titles: @@ -401,7 +402,7 @@ def guess_matches(video, guess, partial=False): # release_group if 'release_group' in guess: release_groups = guess["release_group"] - if not isinstance(release_groups, types.ListType): + if not isinstance(release_groups, list): release_groups = [release_groups] if video.release_group: @@ -418,7 +419,7 @@ def guess_matches(video, guess, partial=False): # format if 'format' in guess: formats = guess["format"] - if not isinstance(formats, types.ListType): + if not isinstance(formats, list): formats = [formats] if video.format: diff --git a/libs/subliminal_patch/utils.py b/libs/subliminal_patch/utils.py index 410b7a2be..e7b84a378 100644 --- a/libs/subliminal_patch/utils.py +++ b/libs/subliminal_patch/utils.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import re diff --git a/libs/subliminal_patch/video.py b/libs/subliminal_patch/video.py index a53b09ead..5487bf147 100644 --- a/libs/subliminal_patch/video.py +++ b/libs/subliminal_patch/video.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os from subliminal.video import Video as Video_ diff --git a/libs/subscene_api/subscene.py b/libs/subscene_api/subscene.py index 827110369..086d31d98 100644 --- a/libs/subscene_api/subscene.py +++ b/libs/subscene_api/subscene.py @@ -25,7 +25,6 @@ this script that does the job by parsing the website"s pages. 
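# A sketch, not part of the patch, of the version-conditional import pattern
# used throughout this merge (the hunk below corrects "urllib2.request",
# which never existed, to Python 3's urllib.request). Spelled out by hand:
import sys

if sys.version_info[0] == 2:
    from urllib2 import Request, urlopen          # Python 2 stdlib layout
else:
    from urllib.request import Request, urlopen   # Python 3 stdlib layout

# six.moves hides the same branch behind one stable name, which is why the
# merge rewrites the urlparse, HTMLParser and xmlrpclib imports elsewhere as:
from six.moves.urllib.parse import urlparse
from six.moves.html_parser import HTMLParser
import six.moves.xmlrpc_client as xmlrpc_client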
# imports import re - import enum import sys import requests @@ -38,7 +37,7 @@ if is_PY2: from urllib2 import Request, urlopen else: from contextlib import suppress - from urllib2.request import Request, urlopen + from urllib.request import Request, urlopen from dogpile.cache.api import NO_VALUE from subliminal.cache import region @@ -56,7 +55,7 @@ DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWeb"\ "Kit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36" -ENDPOINT_RE = re.compile(ur'(?uis)<form.+?action="/subtitles/(.+)">.*?<input type="text"') +ENDPOINT_RE = re.compile(r'(?uis)<form.+?action="/subtitles/(.+)">.*?<input type="text"') class NewEndpoint(Exception): diff --git a/libs/subzero/analytics.py b/libs/subzero/analytics.py index 297bcf2c5..bda9d6252 100644 --- a/libs/subzero/analytics.py +++ b/libs/subzero/analytics.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import struct import binascii diff --git a/libs/subzero/cache_backends/file.py b/libs/subzero/cache_backends/file.py index 1ad404deb..d72d7b84a 100644 --- a/libs/subzero/cache_backends/file.py +++ b/libs/subzero/cache_backends/file.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from dogpile.cache.api import CacheBackend, NO_VALUE from fcache.cache import FileCache diff --git a/libs/subzero/history_storage.py b/libs/subzero/history_storage.py index 38d859d55..e4e382d4f 100644 --- a/libs/subzero/history_storage.py +++ b/libs/subzero/history_storage.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import datetime import logging import traceback @@ -7,7 +8,8 @@ import types from subzero.language import Language -from constants import mode_map +from .constants import mode_map +import six logger = logging.getLogger(__name__) @@ -54,10 +56,10 @@ class SubtitleHistoryItem(object): return mode_map.get(self.mode, "Unknown") def __repr__(self): - return unicode(self) + return six.text_type(self) def __unicode__(self): - return u"%s (Score: %s)" % (unicode(self.item_title), self.score) + return u"%s (Score: %s)" % (six.text_type(self.item_title), self.score) def __str__(self): return str(self.rating_key) @@ -108,7 +110,7 @@ class SubtitleHistory(object): items = [] logger.error("Failed to load history storage: %s" % traceback.format_exc()) - if not isinstance(items, types.ListType): + if not isinstance(items, list): items = [] else: items = items[:] diff --git a/libs/subzero/intent.py b/libs/subzero/intent.py index 174e68fee..338e57134 100644 --- a/libs/subzero/intent.py +++ b/libs/subzero/intent.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import datetime import threading diff --git a/libs/subzero/language.py b/libs/subzero/language.py index a13bab160..d212a02f4 100644 --- a/libs/subzero/language.py +++ b/libs/subzero/language.py @@ -1,9 +1,11 @@ # coding=utf-8 +from __future__ import absolute_import import types import re from babelfish.exceptions import LanguageError from babelfish import Language as Language_, basestr +from six.moves import zip repl_map = { "dk": "da", @@ -55,7 +57,7 @@ def wrap_forced(f): args = args[1:] s = args.pop(0) forced = None - if isinstance(s, types.StringTypes): + if isinstance(s, (str,)): base, forced = s.split(":") if ":" in s else (s, False) else: base = s @@ -81,6 +83,9 @@ class Language(Language_): def __setstate__(self, state): self.alpha3, self.country, self.script, self.forced = state + def __hash__(self): + return hash(str(self)) + def __eq__(self, 
other): if isinstance(other, basestr): return str(self) == other @@ -109,7 +114,7 @@ class Language(Language_): state = instance.__getstate__() attrs = ("country", "script", "forced") language = state[0] - kwa = dict(zip(attrs, state[1:])) + kwa = dict(list(zip(attrs, state[1:]))) kwa.update(replkw) return cls(language, **kwa) diff --git a/libs/subzero/lib/__init__.py b/libs/subzero/lib/__init__.py index 2c1fe6987..41167dbec 100644 --- a/libs/subzero/lib/__init__.py +++ b/libs/subzero/lib/__init__.py @@ -1,2 +1,3 @@ -import dict, geezip, httpfake, io, json, rar, which
\ No newline at end of file +from __future__ import absolute_import +from . import dict, geezip, httpfake, io, json, rar, which
\ No newline at end of file diff --git a/libs/subzero/lib/dict.py b/libs/subzero/lib/dict.py index 929a9a642..e89071bcb 100644 --- a/libs/subzero/lib/dict.py +++ b/libs/subzero/lib/dict.py @@ -1,3 +1,4 @@ +import six # coding=utf-8 @@ -19,11 +20,10 @@ class DictProxy(object): return getattr(super(DictProxy, self), name) def __setattr__(self, name, value): - if not self.__dict__.has_key( - '_DictProxy__initialized'): # this test allows attributes to be set in the __init__ method + if '_DictProxy__initialized' not in self.__dict__: # this test allows attributes to be set in the __init__ method return object.__setattr__(self, name, value) - elif self.__dict__.has_key(name): # any normal attributes are handled normally + elif name in self.__dict__: # any normal attributes are handled normally object.__setattr__(self, name, value) else: @@ -56,7 +56,7 @@ class DictProxy(object): return str(self.Dict[self.store]) def __len__(self): - return len(self.Dict[self.store].keys()) + return len(list(self.Dict[self.store].keys())) def __delitem__(self, key): del self.Dict[self.store][key] @@ -81,16 +81,16 @@ class DictProxy(object): return self.Dict[self.store].update(*args, **kwargs) def keys(self): - return self.Dict[self.store].keys() + return list(self.Dict[self.store].keys()) def values(self): - return self.Dict[self.store].values() + return list(self.Dict[self.store].values()) def items(self): - return self.Dict[self.store].items() + return list(self.Dict[self.store].items()) def __unicode__(self): - return unicode(repr(self.Dict[self.store])) + return six.text_type(repr(self.Dict[self.store])) def setup_defaults(self): raise NotImplementedError @@ -104,7 +104,7 @@ class Dicked(object): def __init__(self, **entries): self._entries = entries or None - for key, value in entries.iteritems(): + for key, value in six.iteritems(entries): self.__dict__[key] = (Dicked(**value) if isinstance(value, dict) else value) def has(self, key): @@ -117,7 +117,7 @@ class Dicked(object): return str(self) def __unicode__(self): - return unicode(self.__digged__) + return six.text_type(self.__digged__) def __str__(self): return str(self.__digged__) @@ -149,7 +149,7 @@ class Dicked(object): @property def __digged__(self): - return {key: value for key, value in self.__dict__.iteritems() if key != "_entries"} + return {key: value for key, value in six.iteritems(self.__dict__) if key != "_entries"} def __len__(self): return len(self.__digged__) diff --git a/libs/subzero/lib/geezip.py b/libs/subzero/lib/geezip.py index 89657834f..5ffd09a4c 100644 --- a/libs/subzero/lib/geezip.py +++ b/libs/subzero/lib/geezip.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import gzip from zlib import Z_FINISH @@ -28,7 +29,7 @@ class GeezipFile(gzip.GzipFile): fileobj.write(self.compress.flush(Z_FINISH)) gzip.write32u(fileobj, self.crc) # self.size may exceed 2GB, or even 4GB - gzip.write32u(fileobj, self.size & 0xffffffffL) + gzip.write32u(fileobj, self.size & 0xffffffff) fileobj.flush() finally: myfileobj = self.myfileobj diff --git a/libs/subzero/lib/httpfake.py b/libs/subzero/lib/httpfake.py index 93b591a55..f7a8eb4db 100644 --- a/libs/subzero/lib/httpfake.py +++ b/libs/subzero/lib/httpfake.py @@ -1,3 +1,4 @@ +import six # coding=utf-8 @@ -37,7 +38,7 @@ class PlexPyNativeResponseProxy(object): return str(self.data) def __unicode__(self): - return unicode(self.data) + return six.text_type(self.data) def __repr__(self): return repr(self.data) diff --git a/libs/subzero/lib/io.py b/libs/subzero/lib/io.py index 
07c442972..bc28a2ffe 100644 --- a/libs/subzero/lib/io.py +++ b/libs/subzero/lib/io.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import os import sys diff --git a/libs/subzero/lib/json.py b/libs/subzero/lib/json.py index d3f3184a2..4fe6829f5 100644 --- a/libs/subzero/lib/json.py +++ b/libs/subzero/lib/json.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from gzip import GzipFile from json_tricks import TricksEncoder diff --git a/libs/subzero/lib/rar.py b/libs/subzero/lib/rar.py index 8e9cdafc8..e9a08bb3b 100644 --- a/libs/subzero/lib/rar.py +++ b/libs/subzero/lib/rar.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging import rarfile diff --git a/libs/subzero/lib/which.py b/libs/subzero/lib/which.py index e65a7d900..14eb0c2e3 100644 --- a/libs/subzero/lib/which.py +++ b/libs/subzero/lib/which.py @@ -4,6 +4,7 @@ # Public domain code by anatoly techtonik <[email protected]> # AKA Linux `which` and Windows `where` +from __future__ import absolute_import import os import sys diff --git a/libs/subzero/modification/__init__.py b/libs/subzero/modification/__init__.py index d6008b17b..6bc059a5c 100644 --- a/libs/subzero/modification/__init__.py +++ b/libs/subzero/modification/__init__.py @@ -1,5 +1,6 @@ # coding=utf-8 -from registry import registry -from mods import hearing_impaired, ocr_fixes, fps, offset, common, color -from main import SubtitleModifications, SubMod +from __future__ import absolute_import +from .registry import registry +from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color +from .main import SubtitleModifications, SubMod diff --git a/libs/subzero/modification/dictionaries/__init__.py b/libs/subzero/modification/dictionaries/__init__.py index 201b7409a..0fce979ff 100644 --- a/libs/subzero/modification/dictionaries/__init__.py +++ b/libs/subzero/modification/dictionaries/__init__.py @@ -1,3 +1,4 @@ # coding=utf-8 -from data import data
\ No newline at end of file +from __future__ import absolute_import +from .data import data
\ No newline at end of file diff --git a/libs/subzero/modification/dictionaries/data.py b/libs/subzero/modification/dictionaries/data.py index c56a0c809..61b35fef4 100644 --- a/libs/subzero/modification/dictionaries/data.py +++ b/libs/subzero/modification/dictionaries/data.py @@ -1,5 +1,7 @@ +from __future__ import absolute_import import re from collections import OrderedDict +import six data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'EndLines': {'data': OrderedDict(), @@ -192,7 +194,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'WholeWords': {'data': OrderedDict([(u'l\xe2rt', u'l\xe4rt'), (u'hederv\xe5rda', u'hederv\xe4rda'), (u'storm\xe2stare', u'storm\xe4stare'), (u'Avf\xe2rd', u'Avf\xe4rd'), (u't\xe2lten', u't\xe4lten'), (u'\xe2rjag', u'\xe4r jag'), (u'\xe4rjag', u'\xe4r jag'), (u'j\xe2mlikar', u'j\xe4mlikar'), (u'Riskako\ufb02', u'Riskakor'), (u'Karamellen/', u'Karamellen'), (u'Lngen\xfcng', u'Ingenting'), (u'\xe4rju', u'\xe4r ju'), (u'S\xe1', u'S\xe5'), (u'n\xe4rjag', u'n\xe4r jag'), (u'alltjag', u'allt jag'), (u'g\xf6rjag', u'g\xf6r jag'), (u'trorjag', u'tror jag'), (u'varju', u'var ju'), (u'g\xf6rju', u'g\xf6r ju'), (u'kanju', u'kan ju'), (u'blirjag', u'blir jag'), (u's\xe4gerjag', u's\xe4ger jag'), (u'beh\xe5llerjag', u'beh\xe5ller jag'), (u'pr\xf8blem', u'problem'), (u'r\xe4ddadeju', u'r\xe4ddade ju'), (u'hon\xf8m', u'honom'), (u'Ln', u'In'), (u'sv\xe5r\ufb02\xf6rtad', u'sv\xe5rfl\xf6rtad'), (u'\xf8ch', u'och'), (u'\ufb02\xf6rtar', u'fl\xf6rtar'), (u'k\xe4nnerjag', u'k\xe4nner jag'), (u'\ufb02ickan', u'flickan'), (u'sn\xf8', u'sn\xf6'), (u'gerju', u'ger ju'), (u'k\xf8ntakter', u'kontakter'), (u'\xf8lycka', u'olycka'), (u'n\xf8lla', u'nolla'), (u'sinnenajublar', u'sinnena jublar'), (u'ijobbet', u'i jobbet'), (u'F\xe5rjag', u'F\xe5r jag'), (u'Ar', u'\xc4r'), (u'liggerju', u'ligger ju'), (u'um', u'om'), (u'lbland', u'Ibland'), (u'skjuterjag', u'skjuter jag'), (u'Vadd\xe5', u'Vad d\xe5'), (u'pratarj\xe4mt', u'pratar j\xe4mt'), (u'harju', u'har ju'), (u'sitterjag', u'sitter jag'), (u'h\xe4\ufb02a', u'h\xe4rja'), (u's\ufb01\xe4l', u'stj\xe4l'), (u'F\xd6U', u'F\xf6lj'), (u'varf\xf6rjag', u'varf\xf6r jag'), (u's\ufb01\xe4rna', u'stj\xe4rna'), (u'b\xf6\ufb02ar', u'b\xf6rjar'), (u'b\xf6\ufb02an', u'b\xf6rjan'), (u'st\xe4ri', u'st\xe5r'), (u'p\xe4', u'p\xe5'), (u'harjag', u'har jag'), (u'attjag', u'att jag'), (u'Verkarjag', u'Verkar jag'), (u'K\xe4nnerjag', u'K\xe4nner jag'), (u'd\xe4rjag', u'd\xe4r jag'), (u'tu\ufb01', u'tuff'), (u'lurarjag', u'lurar jag'), (u'varj\xe4ttebra', u'var j\xe4ttebra'), (u'allvan', u'allvar'), (u'deth\xe4r', u'det h\xe4r'), (u'va\ufb02e', u'varje'), (u'F\xf6Uer', u'F\xf6ljer'), (u'personalm\xf6tetl', u'personalm\xf6tet!'), (u'harjust', u'har just'), (u'\xe4rj\xe4tteduktig', u'\xe4r j\xe4tteduktig'), (u'd\xe4rja', u'd\xe4r ja'), (u'lngen\xfcng', u'lngenting'), (u'iluften', u'i luften'), (u'\xf6sen', u'\xf6ser'), (u'tv\xe2', u'tv\xe5'), (u'Uejerna', u'Tjejerna'), (u'h\xe5n*', u'h\xe5rt'), (u'\xc4rjag', u'\xc4r jag'), (u'keL', u'Okej'), (u'F\xf6rjag', u'F\xf6r jag'), (u'varj\xe4ttekul', u'var j\xe4ttekul'), (u'k\xe4mpan', u'k\xe4mpar'), (u'mycketjobb', u'mycket jobb'), (u'Uus', u'ljus'), (u'serjag', u'ser jag'), (u'vetjag', u'vet jag'), (u'f\xe5rjag', u'f\xe5r jag'), (u'hurjag', u'hur jag'), (u'f\xf6rs\xf6kerjag', u'f\xf6rs\xf6ker jag'), (u't\xe1nagel', u't\xe5nagel'), (u'va\xfce', u'varje'), (u'Uudet', u'ljudet'), (u'amhopa', u'allihopa'), (u'V\xe4\xfc', u'V\xe4lj'), (u'g\xe4ri', u'g\xe5r'), (u'r\xf6d\xfcus', 
u'r\xf6dljus'), (u'Uuset', u'ljuset'), (u'Rid\xe0n', u'Rid\xe5n'), (u'vi\xfca', u'vilja'), (u'g\xe5ri', u'g\xe5r i'), (u'Hurd\xe5', u'Hur d\xe5'), (u'inter\\/juar', u'intervjuar'), (u'menarjag', u'menar jag'), (u'spyrjag', u'spyr jag'), (u'bri\xfcera', u'briljera'), (u'N\xe4rjag', u'N\xe4r jag'), (u'ner\\/\xf6s', u'nerv\xf6s'), (u'ilivets', u'i livets'), (u'n\xe4got', u'n\xe5got'), (u'p\xe0', u'p\xe5'), (u'Lnnan', u'Innan'), (u'Uf', u'Ut'), (u'lnnan', u'Innan'), (u'D\xe0ren', u'D\xe5ren'), (u'F\xe0rjag', u'F\xe5r jag'), (u'Vad\xe4rdetd\xe4L', u'Vad \xe4r det d\xe4r'), (u'sm\xe0tjuv', u'sm\xe5tjuv'), (u't\xe0gr\xe5nare', u't\xe5gr\xe5nare'), (u'dit\xe0t', u'dit\xe5t'), (u's\xe4', u's\xe5'), (u'v\xe0rdsl\xf6sa', u'v\xe5rdsl\xf6sa'), (u'n\xe0n', u'n\xe5n'), (u'kommerjag', u'kommer jag'), (u'\xe4rj\xe4ttebra', u'\xe4r j\xe4ttebra'), (u'\xe4rj\xe4vligt', u'\xe4r j\xe4vligt'), (u'\xe0kerjag', u'\xe5ker jag'), (u'ellerjapaner', u'eller japaner'), (u'attjaga', u'att jaga'), (u'eften', u'efter'), (u'h\xe4stan', u'h\xe4star'), (u'Lntensivare', u'Intensivare'), (u'fr\xe0garjag', u'fr\xe5gar jag'), (u'pen/ers', u'pervers'), (u'r\xe0barkade', u'r\xe5barkade'), (u'styrkon', u'styrkor'), (u'Dif\xe5f', u'Dit\xe5t'), (u'h\xe4nden', u'h\xe4nder'), (u'f\xf6\ufb01a', u'f\xf6lja'), (u'Idioten/', u'Idioter!'), (u'Varf\xf6rjagade', u'Varf\xf6r jagade'), (u'd\xe4rf\xf6rjag', u'd\xe4rf\xf6r jag'), (u'forjag', u'for jag'), (u'Iivsgladje', u'livsgl\xe4dje'), (u'narjag', u'n\xe4r jag'), (u'sajag', u'sa jag'), (u'genastja', u'genast ja'), (u'rockument\xe0ren', u'rockument\xe4ren'), (u'turne', u'turn\xe9'), (u'fickjag', u'fick jag'), (u'sager', u's\xe4ger'), (u'Ijush\xe5rig', u'ljush\xe5rig'), (u'tradg\xe5rdsolycka', u'tr\xe4dg\xe5rdsolycka'), (u'kvavdes', u'kv\xe4vdes'), (u'd\xe0rja', u'd\xe4r ja'), (u'hedersgaster', u'hedersg\xe4ster'), (u'Nar', u'N\xe4r'), (u'smaki\xf6sa', u'smakl\xf6sa'), (u'lan', u'Ian'), (u'Lan', u'Ian'), (u'eri', u'er i'), (u'universitetsamne', u'universitets\xe4mne'), (u'garna', u'g\xe4rna'), (u'ar', u'\xe4r'), (u'baltdjur', u'b\xe4ltdjur'), (u'varjag', u'var jag'), (u'\xe0r', u'\xe4r'), (u'f\xf6rf\xf6rst\xe0rkare', u'f\xf6rf\xf6rst\xe4rkare'), (u'arjattespeciell', u'\xe4r j\xe4ttespeciell'), (u'h\xe0rg\xe5r', u'h\xe4r g\xe5r'), (u'Ia', u'la'), (u'Iimousinen', u'limousinen'), (u'krickettra', u'krickettr\xe4'), (u'h\xe5rdrockv\xe0rlden', u'h\xe5rdrockv\xe4rlden'), (u'tr\xe0bit', u'tr\xe4bit'), (u'Mellanvastern', u'Mellanv\xe4stern'), (u'arju', u'\xe4r ju'), (u'turnen', u'turn\xe9n'), (u'kanns', u'k\xe4nns'), (u'battre', u'b\xe4ttre'), (u'v\xe0rldsturne', u'v\xe4rldsturne'), (u'dar', u'd\xe4r'), (u'sj\xe0lvant\xe0nder', u'sj\xe4lvant\xe4nder'), (u'jattelange', u'j\xe4ttel\xe4nge'), (u'berattade', u'ber\xe4ttade'), (u'S\xe4', u'S\xe5'), (u'vandpunkten', u'v\xe4ndpunkten'), (u'N\xe0rjag', u'N\xe4r jag'), (u'lasa', u'l\xe4sa'), (u'skitl\xe0skigt', u'skitl\xe4skigt'), (u'sambandsv\xe0g', u'sambandsv\xe4g'), (u'valdigt', u'v\xe4ldigt'), (u'Stamga\ufb01el', u'St\xe4mgaffel'), (u'\xe0rjag', u'\xe4r jag'), (u'tajming', u'tajmning'), (u'utg\xe4ng', u'utg\xe5ng'), (u'H\xe0r\xe5t', u'H\xe4r\xe5t'), (u'h\xe0r\xe5t', u'h\xe4r\xe5t'), (u'anvander', u'anv\xe4nder'), (u'harjobbat', u'har jobbat'), (u'imageide', u'imageid\xe9'), (u'kla\ufb01en', u'klaffen'), (u'sjalv', u'sj\xe4lv'), (u'dvarg', u'dv\xe4rg'), (u'detjag', u'det jag'), (u'dvargarna', u'dv\xe4rgarna'), (u'fantasiv\xe0rld', u'fantasiv\xe4rld'), (u'\ufb01olliga', u'Fjolliga'), (u'mandoiinstr\xe0ngar', u'mandollnstr\xe4ngar'), (u'mittjobb', u'mitt 
jobb'), (u'Skajag', u'Ska jag'), (u'landari', u'landar i'), (u'gang', u'g\xe4ng'), (u'Detjag', u'Det jag'), (u'Narmre', u'N\xe4rmre'), (u'I\xe5tjavelni', u'l\xe5tj\xe4veln'), (u'H\xe5llerjag', u'H\xe5ller jag'), (u'visionarer', u'vision\xe4rer'), (u'T\xfclvad', u'Till vad'), (u'milit\xe0rbas', u'milit\xe4rbas'), (u'jattegiada', u'j\xe4tteglada'), (u'Fastjag', u'Fast jag'), (u's\xe5jag', u's\xe5 jag'), (u'rockvarlden', u'rockv\xe4rlden'), (u'saknarjag', u'saknar jag'), (u'allafall', u'alla fall'), (u'\ufb01anta', u'fjanta'), (u'Kr\xe0ma', u'Kr\xe4ma'), (u'stammer', u'st\xe4mmer'), (u'budb\xe0rare', u'budb\xe4rare'), (u'Iivsfiiosofi', u'livsfiiosofi'), (u'f\xf6rj\xe4mnan', u'f\xf6r j\xe4mnan'), (u'gillarjag', u'gillar jag'), (u'Iarvat', u'larvat'), (u'klararjag', u'klarar jag'), (u"hatta\ufb01'\xe0r", u'hattaff\xe4r'), (u'D\xe0', u'D\xe5'), (u'upp\ufb01nna', u'uppfinna'), (u'R\xe0ttf\xe5glar', u'R\xe5ttf\xe5glar'), (u'Sv\xe4\xfcboda', u'Sv\xe4ljboda'), (u'P\xe5b\xf6\ufb02ar', u'P\xe5b\xf6rjar'), (u'slutarju', u'slutar ju'), (u'ni\ufb01skebu\xfcken', u'i fiskebutiken'), (u'h\xe4rj\xe4keln', u'h\xe4r j\xe4keln'), (u'H\xdfppa', u'Hoppa'), (u'f\xf6rst\xf6rds', u'f\xf6rst\xf6rdes'), (u'varj\xe4ttegoda', u'var j\xe4ttegoda'), (u'Kor\\/', u'Korv'), (u'br\xfcl\xe9el', u'br\xfcl\xe9e!'), (u'Hei', u'Hej'), (u'\xe4lskarjordgubbsglass', u'\xe4lskar jordgubbsglass'), (u'Sn\xf6bom', u'Sn\xf6boll'), (u'Sn\xf6boH', u'Sn\xf6boll'), (u'Sn\xf6bol', u'Sn\xf6boll'), (u'sn\xf6boH', u'sn\xf6boll'), (u'L\xe4ggerp\xe5', u'L\xe4gger p\xe5'), (u'lnge\ufb02', u'lnget!'), (u'S\xe4gerj\xe4ttesmarta', u'S\xe4ger j\xe4ttesmarta'), (u'dopplen/\xe4derradar', u'dopplerv\xe4derradar'), (u's\xe4kertj\xe4ttefin', u's\xe4kert j\xe4ttefin'), (u'\xe4rj\xe4ttefin', u'\xe4r j\xe4ttefin'), (u'verkarju', u'verkar ju'), (u'blirju', u'blir ju'), (u'kor\\/', u'korv'), (u'naturkatastro\ufb01', u'naturkatastrof!'), (u'stickerjag', u'stickerj ag'), (u'j\xe4ttebu\ufb01\xe9', u'j\xe4ttebuff\xe9'), (u'be\ufb01nner', u'befinner'), (u'Sp\ufb02ng', u'Spring'), (u'trec\ufb01e', u'tredje'), (u'ryckerjag', u'rycker jag'), (u'skullejag', u'skulle jag'), (u'vetju', u'vet ju'), (u'a\ufb02jag', u'att jag'), (u'\ufb02nns', u'finns'), (u'\xe4rl\xe5ng', u'\xe4r l\xe5ng'), (u'k\xe5ra', u'k\xe4ra'), (u'\xe4r\ufb01na', u'\xe4r \ufb01na'), (u'\xe4ri', u'\xe4r i'), (u'h\xf6rden', u'h\xf6r den'), (u'\xe4ttj\xe4g', u'att j\xe4g'), (u'g\xe4r', u'g\xe5r'), (u'f\xf6ri', u'f\xf6r i'), (u'Hurvisste', u'Hur visste'), (u'\ufb01ck', u'fick'), (u'\ufb01nns', u'finns'), (u'\ufb01n', u'fin'), (u'Fa', u'Bra.'), (u'bori', u'bor i'), (u'fiendeplanl', u'fiendeplan!'), (u'if\xf6rnamn', u'i f\xf6rnamn'), (u'detju', u'det ju'), (u'N\xfcd', u'Niki'), (u'hatarjag', u'hatar jag'), (u'Klararjag', u'Klarar jag'), (u'deta\ufb01er', u'detaljer'), (u'v\xe4/', u'v\xe4l'), (u'smakarju', u'smakar ju'), (u'Teache\ufb02', u'Teacher!'), (u'imorse', u'i morse'), (u'drickerjag', u'dricker jag'), (u'st\xe5ri', u'st\xe5r i'), (u'Harjag', u'Har jag'), (u'Talarjag', u'Talar jag'), (u'undrarjag', u'undrar jag'), (u'\xe5lderjag', u'\xe5lder jag'), (u'va\ufb01e', u'varje'), (u'f\xf6rfalskningl', u'f\xf6rfalskning!'), (u'Vi\ufb01iiiam', u'William'), (u'V\\\ufb01lliams', u'Williams'), (u'attjobba', u'att jobba'), (u'intei', u'inte i'), (u'n\xe4rV\\\ufb01lliam', u'n\xe4r William'), (u'V\\\ufb01lliam', u'William'), (u'E\ufb01ersom', u'Eftersom'), (u'Vl\ufb01lliam', u'William'), (u'I\xe4ngejag', u'l\xe4nge jag'), (u"'\ufb01digare", u'Tidigare'), (u'b\xf6rjadei', u'b\xf6rjade i'), (u'merjust', u'mer just'), 
(u'e\ufb01er\xe5t', u'efter\xe5t'), (u'gjordejag', u'gjorde jag'), (u'hadeju', u'hade ju'), (u'g\xe5rvi', u'g\xe5r vi'), (u'k\xf6perjag', u'k\xf6per jag'), (u'M\xe5stejag', u'M\xe5ste jag'), (u'k\xe4nnerju', u'k\xe4nner ju'), (u'\ufb02n', u'fin'), (u'treviig', u'trevlig'), (u'Grattisl', u'Grattis!'), (u'kande', u'k\xe4nde'), (u"'llden", u'Tiden'), (u'sakjag', u'sak jag'), (u'klartjag', u'klart jag'), (u'h\xe4\ufb01igt', u'h\xe4ftigt'), (u'I\xe4mnarjag', u'l\xe4mnar jag'), (u'gickju', u'gick ju'), (u'skajag', u'ska jag'), (u'G\xf6rjag', u'G\xf6r jag'), (u'm\xe5stejag', u'm\xe5ste jag'), (u'gra\\/iditet', u'graviditet'), (u'hittadqdin', u'hittade din'), (u'\xe4rjobbigt', u'\xe4r jobbigt'), (u'Overdrivet', u'\xd6verdrivet'), (u'hOgtidlig', u'h\xf6gtidlig'), (u'Overtyga', u'\xd6vertyga'), (u'SKILSMASSA', u'SKILSM\xc4SSA'), (u'brukarju', u'brukar ju'), (u'lsabel', u'Isabel'), (u'kundejag', u'kunde jag'), (u'\xe4rl\xe4get', u'\xe4r l\xe4get'), (u'blirinte', u'blir inte'), (u'ijakt', u'i jakt'), (u'avjordens', u'av jordens'), (u'90000O', u'900000'), (u'9O0', u'900'), (u'\xe4rp\xe5', u'\xe4r p\xe5'), (u'\xe4rproteserna', u'\xe4r proteserna'), (u'\xe4rytterst', u'\xe4r ytterst'), (u'beborjorden', u'bebor jorden'), (u'filmjag', u'film jag'), (u'fokuserarp\xe5', u'fokuserar p\xe5'), (u'folkjag', u'folk jag'), (u'f\xf6rest\xe4lldejag', u'f\xf6rest\xe4llde jag'), (u'f\xf6rpubliken', u'f\xf6r publiken'), (u'gilladejag', u'gillade jag'), (u'h\xe5llerp\xe5', u'h\xe5ller p\xe5'), (u'harp\xe5', u'har p\xe5'), (u'harplaner', u'har planer'), (u'harprylar', u'har prylar'), (u'kommerpubliken', u'kommer publiken'), (u'kostymerp\xe5', u'kostymer p\xe5'), (u'litarp\xe5', u'litar p\xe5'), (u'lngen', u'Ingen'), (u'lnom', u'Inom'), (u'lnte', u'Inte'), (u'ochjag', u'och jag'), (u'Ochjag', u'Och jag'), (u'ochjorden', u'och jorden'), (u'omjag', u'om jag'), (u'Omjag', u'Om jag'), (u'passarperfekt', u'passar perfekt'), (u's\xe4ttetjag', u's\xe4ttet jag'), (u'silverp\xe5', u'silver p\xe5'), (u'skruvarjag', u'skruvar jag'), (u'somjag', u'som jag'), (u'Somjag', u'Som jag'), (u'talarp\xe5', u'talar p\xe5'), (u't\xe4nktejag', u't\xe4nkte jag'), (u'tapparjag', u'tappar jag'), (u'tittarp\xe5', u'tittar p\xe5'), (u'visstejag', u'visste jag'), (u'medjetpacks', u'med jetpacks'), (u's\xe4tterp\xe5', u's\xe4tter p\xe5'), (u'st\xe5rp\xe5', u'st\xe5r p\xe5'), (u'tillh\xf6rp\xe5', u'tillh\xf6r p\xe5')]), 'pattern': 
u"(?um)(\\b|^)(?:l\\\xe2rt|hederv\\\xe5rda|storm\\\xe2stare|Avf\\\xe2rd|t\\\xe2lten|\\\xe2rjag|\\\xe4rjag|j\\\xe2mlikar|Riskako\\\ufb02|Karamellen\\/|Lngen\\\xfcng|\\\xe4rju|S\\\xe1|n\\\xe4rjag|alltjag|g\\\xf6rjag|trorjag|varju|g\\\xf6rju|kanju|blirjag|s\\\xe4gerjag|beh\\\xe5llerjag|pr\\\xf8blem|r\\\xe4ddadeju|hon\\\xf8m|Ln|sv\\\xe5r\\\ufb02\\\xf6rtad|\\\xf8ch|\\\ufb02\\\xf6rtar|k\\\xe4nnerjag|\\\ufb02ickan|sn\\\xf8|gerju|k\\\xf8ntakter|\\\xf8lycka|n\\\xf8lla|sinnenajublar|ijobbet|F\\\xe5rjag|Ar|liggerju|um|lbland|skjuterjag|Vadd\\\xe5|pratarj\\\xe4mt|harju|sitterjag|h\\\xe4\\\ufb02a|s\\\ufb01\\\xe4l|F\\\xd6U|varf\\\xf6rjag|s\\\ufb01\\\xe4rna|b\\\xf6\\\ufb02ar|b\\\xf6\\\ufb02an|st\\\xe4ri|p\\\xe4|harjag|attjag|Verkarjag|K\\\xe4nnerjag|d\\\xe4rjag|tu\\\ufb01|lurarjag|varj\\\xe4ttebra|allvan|deth\\\xe4r|va\\\ufb02e|F\\\xf6Uer|personalm\\\xf6tetl|harjust|\\\xe4rj\\\xe4tteduktig|d\\\xe4rja|lngen\\\xfcng|iluften|\\\xf6sen|tv\\\xe2|Uejerna|h\\\xe5n\\*|\\\xc4rjag|keL|F\\\xf6rjag|varj\\\xe4ttekul|k\\\xe4mpan|mycketjobb|Uus|serjag|vetjag|f\\\xe5rjag|hurjag|f\\\xf6rs\\\xf6kerjag|t\\\xe1nagel|va\\\xfce|Uudet|amhopa|V\\\xe4\\\xfc|g\\\xe4ri|r\\\xf6d\\\xfcus|Uuset|Rid\\\xe0n|vi\\\xfca|g\\\xe5ri|Hurd\\\xe5|inter\\\\\\/juar|menarjag|spyrjag|bri\\\xfcera|N\\\xe4rjag|ner\\\\\\/\\\xf6s|ilivets|n\\\xe4got|p\\\xe0|Lnnan|Uf|lnnan|D\\\xe0ren|F\\\xe0rjag|Vad\\\xe4rdetd\\\xe4L|sm\\\xe0tjuv|t\\\xe0gr\\\xe5nare|dit\\\xe0t|s\\\xe4|v\\\xe0rdsl\\\xf6sa|n\\\xe0n|kommerjag|\\\xe4rj\\\xe4ttebra|\\\xe4rj\\\xe4vligt|\\\xe0kerjag|ellerjapaner|attjaga|eften|h\\\xe4stan|Lntensivare|fr\\\xe0garjag|pen\\/ers|r\\\xe0barkade|styrkon|Dif\\\xe5f|h\\\xe4nden|f\\\xf6\\\ufb01a|Idioten\\/|Varf\\\xf6rjagade|d\\\xe4rf\\\xf6rjag|forjag|Iivsgladje|narjag|sajag|genastja|rockument\\\xe0ren|turne|fickjag|sager|Ijush\\\xe5rig|tradg\\\xe5rdsolycka|kvavdes|d\\\xe0rja|hedersgaster|Nar|smaki\\\xf6sa|lan|Lan|eri|universitetsamne|garna|ar|baltdjur|varjag|\\\xe0r|f\\\xf6rf\\\xf6rst\\\xe0rkare|arjattespeciell|h\\\xe0rg\\\xe5r|Ia|Iimousinen|krickettra|h\\\xe5rdrockv\\\xe0rlden|tr\\\xe0bit|Mellanvastern|arju|turnen|kanns|battre|v\\\xe0rldsturne|dar|sj\\\xe0lvant\\\xe0nder|jattelange|berattade|S\\\xe4|vandpunkten|N\\\xe0rjag|lasa|skitl\\\xe0skigt|sambandsv\\\xe0g|valdigt|Stamga\\\ufb01el|\\\xe0rjag|tajming|utg\\\xe4ng|H\\\xe0r\\\xe5t|h\\\xe0r\\\xe5t|anvander|harjobbat|imageide|kla\\\ufb01en|sjalv|dvarg|detjag|dvargarna|fantasiv\\\xe0rld|\\\ufb01olliga|mandoiinstr\\\xe0ngar|mittjobb|Skajag|landari|gang|Detjag|Narmre|I\\\xe5tjavelni|H\\\xe5llerjag|visionarer|T\\\xfclvad|milit\\\xe0rbas|jattegiada|Fastjag|s\\\xe5jag|rockvarlden|saknarjag|allafall|\\\ufb01anta|Kr\\\xe0ma|stammer|budb\\\xe0rare|Iivsfiiosofi|f\\\xf6rj\\\xe4mnan|gillarjag|Iarvat|klararjag|hatta\\\ufb01\\'\\\xe0r|D\\\xe0|upp\\\ufb01nna|R\\\xe0ttf\\\xe5glar|Sv\\\xe4\\\xfcboda|P\\\xe5b\\\xf6\\\ufb02ar|slutarju|ni\\\ufb01skebu\\\xfcken|h\\\xe4rj\\\xe4keln|H\\\xdfppa|f\\\xf6rst\\\xf6rds|varj\\\xe4ttegoda|Kor\\\\\\/|br\\\xfcl\\\xe9el|Hei|\\\xe4lskarjordgubbsglass|Sn\\\xf6bom|Sn\\\xf6boH|Sn\\\xf6bol|sn\\\xf6boH|L\\\xe4ggerp\\\xe5|lnge\\\ufb02|S\\\xe4gerj\\\xe4ttesmarta|dopplen\\/\\\xe4derradar|s\\\xe4kertj\\\xe4ttefin|\\\xe4rj\\\xe4ttefin|verkarju|blirju|kor\\\\\\/|naturkatastro\\\ufb01|stickerjag|j\\\xe4ttebu\\\ufb01\\\xe9|be\\\ufb01nner|Sp\\\ufb02ng|trec\\\ufb01e|ryckerjag|skullejag|vetju|a\\\ufb02jag|\\\ufb02nns|\\\xe4rl\\\xe5ng|k\\\xe5ra|\\\xe4r\\\ufb01na|\\\xe4ri|h\\\xf6rden|\\\xe4ttj\\\xe4g|g\\\xe4r|f\\\xf6ri|Hurvisste|\\\ufb01ck|\\\ufb01nns|\\\ufb01n|Fa|bori|fiendeplanl|if\\\xf6rnamn|detju|N\\\xf
cd|hatarjag|Klararjag|deta\\\ufb01er|v\\\xe4\\/|smakarju|Teache\\\ufb02|imorse|drickerjag|st\\\xe5ri|Harjag|Talarjag|undrarjag|\\\xe5lderjag|va\\\ufb01e|f\\\xf6rfalskningl|Vi\\\ufb01iiiam|V\\\\\\\ufb01lliams|attjobba|intei|n\\\xe4rV\\\\\\\ufb01lliam|V\\\\\\\ufb01lliam|E\\\ufb01ersom|Vl\\\ufb01lliam|I\\\xe4ngejag|\\'\\\ufb01digare|b\\\xf6rjadei|merjust|e\\\ufb01er\\\xe5t|gjordejag|hadeju|g\\\xe5rvi|k\\\xf6perjag|M\\\xe5stejag|k\\\xe4nnerju|\\\ufb02n|treviig|Grattisl|kande|\\'llden|sakjag|klartjag|h\\\xe4\\\ufb01igt|I\\\xe4mnarjag|gickju|skajag|G\\\xf6rjag|m\\\xe5stejag|gra\\\\\\/iditet|hittadqdin|\\\xe4rjobbigt|Overdrivet|hOgtidlig|Overtyga|SKILSMASSA|brukarju|lsabel|kundejag|\\\xe4rl\\\xe4get|blirinte|ijakt|avjordens|90000O|9O0|\\\xe4rp\\\xe5|\\\xe4rproteserna|\\\xe4rytterst|beborjorden|filmjag|fokuserarp\\\xe5|folkjag|f\\\xf6rest\\\xe4lldejag|f\\\xf6rpubliken|gilladejag|h\\\xe5llerp\\\xe5|harp\\\xe5|harplaner|harprylar|kommerpubliken|kostymerp\\\xe5|litarp\\\xe5|lngen|lnom|lnte|ochjag|Ochjag|ochjorden|omjag|Omjag|passarperfekt|s\\\xe4ttetjag|silverp\\\xe5|skruvarjag|somjag|Somjag|talarp\\\xe5|t\\\xe4nktejag|tapparjag|tittarp\\\xe5|visstejag|medjetpacks|s\\\xe4tterp\\\xe5|st\\\xe5rp\\\xe5|tillh\\\xf6rp\\\xe5)(\\b|$)"}}} -for lang, grps in data.iteritems(): - for grp in grps.iterkeys(): +for lang, grps in six.iteritems(data): + for grp in six.iterkeys(grps): if data[lang][grp]["pattern"]: data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"]) diff --git a/libs/subzero/modification/dictionaries/make_data.py b/libs/subzero/modification/dictionaries/make_data.py index 7a0482dad..726208d3d 100644 --- a/libs/subzero/modification/dictionaries/make_data.py +++ b/libs/subzero/modification/dictionaries/make_data.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import re import os import pprint diff --git a/libs/subzero/modification/dictionaries/test_data.py b/libs/subzero/modification/dictionaries/test_data.py index 538c99fc0..55dcca643 100644 --- a/libs/subzero/modification/dictionaries/test_data.py +++ b/libs/subzero/modification/dictionaries/test_data.py @@ -1,10 +1,13 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import print_function from subzero.language import Language -from data import data +from .data import data +import six #for lang, data in data.iteritems(): # print Language.fromietf(lang).alpha2 -for find, rep in data["dan"].iteritems(): - print find, rep
\ No newline at end of file
+for find, rep in six.iteritems(data["dan"]):
+    print(find, rep)
\ No newline at end of file diff --git a/libs/subzero/modification/main.py b/libs/subzero/modification/main.py index 7d35c2e27..e67042755 100644 --- a/libs/subzero/modification/main.py +++ b/libs/subzero/modification/main.py @@ -1,19 +1,21 @@ # coding=utf-8 +from __future__ import absolute_import import traceback import re import pysubs2 import logging import time -from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError -from registry import registry +from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError +from .registry import registry from subzero.language import Language +import six logger = logging.getLogger(__name__) -lowercase_re = re.compile(ur'(?sux)[a-zà-ž]') +lowercase_re = re.compile(r'(?sux)[a-zà-ž]') class SubtitleModifications(object): @@ -128,7 +130,7 @@ class SubtitleModifications(object): used_mods.append(orig_identifier) # finalize merged mods into final and used mods - for identifier, args in mods_merged.iteritems(): + for identifier, args in six.iteritems(mods_merged): pos_preserve_index = used_mods.index("%s_ORIG_POSITION" % identifier) # clear empty mods after merging @@ -143,7 +145,7 @@ class SubtitleModifications(object): continue # clear empty args - final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems())) + final_mod_args = dict([k_v for k_v in six.iteritems(args) if bool(k_v[1])]) _data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args) if _data == mods_merged_log[identifier]["final_identifier"]: @@ -159,11 +161,11 @@ class SubtitleModifications(object): final_mods[identifier] = args if self.debug: - for identifier, data in mods_merged_log.iteritems(): + for identifier, data in six.iteritems(mods_merged_log): logger.debug("Merged %s to %s", data["identifiers"], data["final_identifier"]) # separate all mods into line and non-line mods - for identifier, args in final_mods.iteritems(): + for identifier, args in six.iteritems(final_mods): mod_cls = registry.mods[identifier] if mod_cls.modifies_whole_file: non_line_mods.append((identifier, args)) @@ -180,7 +182,7 @@ class SubtitleModifications(object): entries_used = 0 for entry in self.f: entry_used = False - for sub in entry.text.strip().split("\N"): + for sub in entry.text.strip().split(r"\N"): # skip HI bracket entries, those might actually be lowercase sub = sub.strip() for processor in registry.mods["remove_HI"].processors[:4]: @@ -272,7 +274,7 @@ class SubtitleModifications(object): continue skip_entry = False - for line in t.split(ur"\N"): + for line in t.split(r"\N"): # don't bother the mods with surrounding tags old_line = line line = line.strip() @@ -377,7 +379,7 @@ class SubtitleModifications(object): logger.debug(u"%d: %r -> ''", index, entry.text) continue - new_text = ur"\N".join(lines) + new_text = r"\N".join(lines) # cheap man's approach to avoid open tags add_start_tags = [] diff --git a/libs/subzero/modification/mods/__init__.py b/libs/subzero/modification/mods/__init__.py index aaf4c37e4..e38e899c9 100644 --- a/libs/subzero/modification/mods/__init__.py +++ b/libs/subzero/modification/mods/__init__.py @@ -1,8 +1,10 @@ # coding=utf-8 +from __future__ import absolute_import import re import logging from subzero.modification.processors.re_processor import ReProcessor, NReProcessor +import six logger = logging.getLogger(__name__) @@ -83,7 +85,7 @@ class SubtitleModification(object): @classmethod def get_signature(cls, **kwargs): - string_args = ",".join(["%s=%s" % (key, value) for key, value in kwargs.iteritems()]) + string_args = ",".join(["%s=%s" % (key, value) for 
key, value in six.iteritems(kwargs)]) return "%s(%s)" % (cls.identifier, string_args) @classmethod @@ -95,7 +97,7 @@ class SubtitleTextModification(SubtitleModification): pass -TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*" +TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*" EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag") empty_line_post_processors = [ diff --git a/libs/subzero/modification/mods/color.py b/libs/subzero/modification/mods/color.py index 9e883b101..6e845298b 100644 --- a/libs/subzero/modification/mods/color.py +++ b/libs/subzero/modification/mods/color.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging from collections import OrderedDict diff --git a/libs/subzero/modification/mods/common.py b/libs/subzero/modification/mods/common.py index 14c360937..f4b22180c 100644 --- a/libs/subzero/modification/mods/common.py +++ b/libs/subzero/modification/mods/common.py @@ -1,5 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import unicode_literals import re from subzero.language import Language @@ -22,10 +24,10 @@ class CommonFixes(SubtitleTextModification): processors = [ # normalize hyphens - NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), + NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), # -- = em dash - NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"), + NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"), # line = _/-/\s NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"), @@ -37,23 +39,23 @@ class CommonFixes(SubtitleTextModification): NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), # fix music symbols - NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'), + NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'), lambda x: u"♪ " if x.group(1) else u" ♪", name="CM_music_symbols"), # '' = " - NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), + NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), # double quotes instead of single quotes inside words - NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"), + NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"), # normalize quotes - NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), + NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), lambda match: '"' + (" " if match.group(2).endswith(" ") else ""), name="CM_normalize_quotes"), # normalize single quotes - NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), + NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), # remove leading ... NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"), @@ -89,8 +91,8 @@ class CommonFixes(SubtitleTextModification): # space before ending doublequote? 
# replace uppercase I with lowercase L in words - NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'), - lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))), + NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'), + lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))), name="CM_uppercase_i_in_word"), # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be @@ -101,11 +103,11 @@ class CommonFixes(SubtitleTextModification): name="CM_spaces_in_numbers"), # uppercase after dot - NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'), - lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"), + NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'), + lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"), # remove double interpunction - NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'), + NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'), lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""), name="CM_double_interpunct"), @@ -149,14 +151,14 @@ class ReverseRTL(SubtitleModification): processors = [ # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2 - #NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2", + #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2", # name="CM_RTL_reverse") - NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2", + NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2", name="CM_RTL_reverse") ] -split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)") +split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)") class FixUppercase(SubtitleModification): diff --git a/libs/subzero/modification/mods/fps.py b/libs/subzero/modification/mods/fps.py index b209eddff..856af9e15 100644 --- a/libs/subzero/modification/mods/fps.py +++ b/libs/subzero/modification/mods/fps.py @@ -1,5 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import import logging from subzero.modification.mods import SubtitleModification diff --git a/libs/subzero/modification/mods/hearing_impaired.py b/libs/subzero/modification/mods/hearing_impaired.py index cb72d898c..b4de7c737 100644 --- a/libs/subzero/modification/mods/hearing_impaired.py +++ b/libs/subzero/modification/mods/hearing_impaired.py @@ -1,4 +1,6 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import unicode_literals import re from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG @@ -26,71 +28,71 @@ class HearingImpaired(SubtitleTextModification): processors = [ # full bracket entry, single or multiline; starting with brackets and ending with brackets - FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}), + FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}), "", name="HI_brackets_full"), # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence, # possibly with a dash in front; ignore anything ending with a quote - NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])' - ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "", + 
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])' + r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "", name="HI_before_colon_caps"), # any text before colon (at least 3 chars); at start or after a sentence, # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if # a space is inside the text; ignore anything ending with a quote - NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])' - ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'), + NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])' + r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'), lambda match: match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0) else "" if not match.group(1).startswith(" ") else " ", name="HI_before_colon_noncaps"), # brackets (only remove if at least 3 chars in brackets) - NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % + NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "", name="HI_brackets"), - #NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}), + #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}), # "", name="HI_bracket_open_start"), - #NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "", + #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "", # name="HI_bracket_open_end"), # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any) # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"), # starting text before colon (at least 3 chars) - #NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "", + #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "", # name="HI_before_colon"), # text in brackets at start, after optional dash, before colon or at end of line # fixme: may be too aggressive - #NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "", + #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "", # name="HI_brackets_special"), # all caps line (at least 4 consecutive uppercase chars) - NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps", + NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps", supported=lambda p: not p.only_uppercase), # remove MAN: - NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"), + NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"), # dash in front # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"), # all caps at start before new sentence - NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1", + NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1", name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase), ] post_processors = empty_line_post_processors 
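# ---------------------------------------------------------------------------
# Illustration (not part of this commit): the HI processors above strip
# bracketed sound cues and uppercase speaker labels from subtitle lines.
# Python 3 rejects the old ur'...' literals, which is why every pattern in
# these hunks is rewritten as a plain r'...' raw string (str is already
# unicode on Python 3). A minimal sketch of the idea using bare re.sub
# instead of Bazarr's NReProcessor wrapper; the simplified patterns and the
# sample line are illustrative assumptions, not the real rules:
import re

hi_brackets = re.compile(r'[([][^()\[\]]{3,}[)\]]\s*')  # e.g. [DOOR SLAMS]
hi_speaker = re.compile(r'(?m)^[A-ZÀ-Ž&+ ]{3,}:\s*')    # e.g. MAN: / ANNOUNCER:

line = "MAN: [DOOR SLAMS] Who's there?"
line = hi_speaker.sub('', hi_brackets.sub('', line))
print(line)  # -> Who's there?
# ---------------------------------------------------------------------------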
last_processors = [ # remove music symbols - NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}), + NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}), "", name="HI_music_symbols_only"), # remove music entries - NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'), + NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'), "", name="HI_music"), ] diff --git a/libs/subzero/modification/mods/ocr_fixes.py b/libs/subzero/modification/mods/ocr_fixes.py index ec57ca006..31fc94cdc 100644 --- a/libs/subzero/modification/mods/ocr_fixes.py +++ b/libs/subzero/modification/mods/ocr_fixes.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import logging import re @@ -38,11 +39,11 @@ class FixOCR(SubtitleTextModification): return [ # remove broken HI tag colons (ANNOUNCER'., ". instead of :) after at least 3 uppercase chars # don't modify stuff inside quotes - NReProcessor(re.compile(ur'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)' - ur'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'), + NReProcessor(re.compile(r'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)' + r'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])'), r"\1:\3", name="OCR_fix_HI_colons", supported=lambda p: not p.only_uppercase), # fix F'bla - NReProcessor(re.compile(ur'(?u)(\bF)(\')([A-zÀ-ž]*\b)'), r"\1\3", name="OCR_fix_F"), + NReProcessor(re.compile(r'(?u)(\bF)(\')([A-zÀ-ž]*\b)'), r"\1\3", name="OCR_fix_F"), WholeLineProcessor(self.data_dict["WholeLines"], name="OCR_replace_line"), MultipleWordReProcessor(self.data_dict["WholeWords"], name="OCR_replace_word"), MultipleWordReProcessor(self.data_dict["BeginLines"], name="OCR_replace_beginline"), diff --git a/libs/subzero/modification/mods/offset.py b/libs/subzero/modification/mods/offset.py index 2e342c0a1..6f645a1ac 100644 --- a/libs/subzero/modification/mods/offset.py +++ b/libs/subzero/modification/mods/offset.py @@ -1,9 +1,11 @@ # coding=utf-8 +from __future__ import absolute_import import logging from subzero.modification.mods import SubtitleModification from subzero.modification import registry +import six logger = logging.getLogger(__name__) @@ -20,9 +22,9 @@ class ShiftOffset(SubtitleModification): @classmethod def merge_args(cls, args1, args2): - new_args = dict((key, int(value)) for key, value in args1.iteritems()) + new_args = dict((key, int(value)) for key, value in six.iteritems(args1)) - for key, value in args2.iteritems(): + for key, value in six.iteritems(args2): if not int(value): continue @@ -31,7 +33,7 @@ class ShiftOffset(SubtitleModification): else: new_args[key] = int(value) - return dict(filter(lambda (k, v): bool(v), new_args.iteritems())) + return dict([k_v for k_v in six.iteritems(new_args) if bool(k_v[1])]) def modify(self, content, debug=False, parent=None, **kwargs): parent.f.shift(h=int(kwargs.get("h", 0)), m=int(kwargs.get("m", 0)), s=int(kwargs.get("s", 0)), diff --git a/libs/subzero/modification/processors/__init__.py b/libs/subzero/modification/processors/__init__.py index 7cfb0aa33..4bbf29267 100644 --- a/libs/subzero/modification/processors/__init__.py +++ b/libs/subzero/modification/processors/__init__.py @@ -1,3 +1,4 @@ +import six # coding=utf-8 @@ -29,7 +30,7 @@ class Processor(object): return repr(self) def __unicode__(self): - return unicode(repr(self)) + return six.text_type(repr(self)) class FuncProcessor(Processor): diff --git 
a/libs/subzero/modification/processors/re_processor.py b/libs/subzero/modification/processors/re_processor.py index 5ff76331e..ad07240d2 100644 --- a/libs/subzero/modification/processors/re_processor.py +++ b/libs/subzero/modification/processors/re_processor.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import re import logging diff --git a/libs/subzero/modification/processors/string_processor.py b/libs/subzero/modification/processors/string_processor.py index 270fd76f8..30a53de85 100644 --- a/libs/subzero/modification/processors/string_processor.py +++ b/libs/subzero/modification/processors/string_processor.py @@ -1,8 +1,10 @@ # coding=utf-8 +from __future__ import absolute_import import logging from subzero.modification.processors import Processor +import six logger = logging.getLogger(__name__) @@ -39,7 +41,7 @@ class MultipleLineProcessor(Processor): if not self.snr_dict["data"]: return content - for key, value in self.snr_dict["data"].iteritems(): + for key, value in six.iteritems(self.snr_dict["data"]): if debug and key in content: logger.debug(u"Replacing '%s' with '%s' in '%s'", key, value, content) @@ -54,7 +56,7 @@ class WholeLineProcessor(MultipleLineProcessor): return content content = content.strip() - for key, value in self.snr_dict["data"].iteritems(): + for key, value in six.iteritems(self.snr_dict["data"]): if content == key: if debug: logger.debug(u"Replacing '%s' with '%s'", key, value) diff --git a/libs/subzero/modification/registry.py b/libs/subzero/modification/registry.py index 7b240a69c..277abefa5 100644 --- a/libs/subzero/modification/registry.py +++ b/libs/subzero/modification/registry.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import from collections import OrderedDict diff --git a/libs/subzero/prefs.py b/libs/subzero/prefs.py index 252164fb9..5744d4eac 100644 --- a/libs/subzero/prefs.py +++ b/libs/subzero/prefs.py @@ -1,6 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import import traceback -from constants import PLUGIN_IDENTIFIER +from .constants import PLUGIN_IDENTIFIER def get_user_prefs(Prefs, Logger): diff --git a/libs/subzero/sandbox.py b/libs/subzero/sandbox.py index f409b1824..888b17a9b 100644 --- a/libs/subzero/sandbox.py +++ b/libs/subzero/sandbox.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import sys diff --git a/libs/subzero/score_range.py b/libs/subzero/score_range.py index ec80d665f..52d0680ad 100644 --- a/libs/subzero/score_range.py +++ b/libs/subzero/score_range.py @@ -1,5 +1,7 @@ # coding=utf-8 +from __future__ import absolute_import +from __future__ import print_function import sys from itertools import chain, combinations, permutations @@ -15,8 +17,8 @@ if __name__ == "__main__": leftover_keys = list(set(scores.keys()) - set(base_score_keys)) base_score = sum([val for key, val in scores.items() if key in base_score_keys]) leftover_scores = set([score for key, score in scores.items() if key in leftover_keys]) - print "base score:", base_score - print "leftover:", sorted(set(leftover_scores)) + print("base score:", base_score) + print("leftover:", sorted(set(leftover_scores))) # print sum_possible_scores(base_score, leftover_scores) # print list(permutations(leftover_scores)) - print ",\n".join(map(lambda x: '"%s"' % x, ["66"] + sorted(set(permute(leftover_scores))))) + print(",\n".join(['"%s"' % x for x in ["66"] + sorted(set(permute(leftover_scores)))])) diff --git a/libs/subzero/subtitle_storage.py b/libs/subzero/subtitle_storage.py index 
186a03a38..53a31b8f2 100644 --- a/libs/subzero/subtitle_storage.py +++ b/libs/subzero/subtitle_storage.py @@ -1,4 +1,5 @@ # coding=utf-8 +from __future__ import absolute_import import datetime import gzip import hashlib @@ -13,7 +14,8 @@ import sys from json_tricks.nonp import loads from subzero.lib.json import dumps from scandir import scandir, scandir_generic as _scandir_generic -from constants import mode_map +from .constants import mode_map +import six logger = logging.getLogger(__name__) @@ -67,7 +69,7 @@ class JSONStoredSubtitle(object): def deserialize(self, data): if data["content"]: # legacy: storage was unicode; content is always present in encoded form - if isinstance(data["content"], types.UnicodeType): + if isinstance(data["content"], str): data["content"] = data["content"].encode(data["encoding"]) self.initialize(**data) @@ -103,18 +105,18 @@ class JSONStoredVideoSubtitles(object): self.__dict__.update(data) if parts: - for part_id, part in parts.iteritems(): + for part_id, part in six.iteritems(parts): self.parts[part_id] = {} - for language, sub_data in part.iteritems(): + for language, sub_data in six.iteritems(part): self.parts[part_id][language] = {} - for sub_key, subtitle_data in sub_data.iteritems(): + for sub_key, subtitle_data in six.iteritems(sub_data): if sub_key == "current": if not isinstance(subtitle_data, tuple): subtitle_data = tuple(subtitle_data.split("__")) self.parts[part_id][language]["current"] = subtitle_data elif sub_key == "blacklist": - bl = dict((tuple([str(a) for a in k.split("__")]), v) for k, v in subtitle_data.iteritems()) + bl = dict((tuple([str(a) for a in k.split("__")]), v) for k, v in six.iteritems(subtitle_data)) self.parts[part_id][language]["blacklist"] = bl else: sub = JSONStoredSubtitle() @@ -127,21 +129,21 @@ class JSONStoredVideoSubtitles(object): def serialize(self): data = {"parts": {}} - for key, value in self.__dict__.iteritems(): + for key, value in six.iteritems(self.__dict__): if key != "parts": data[key] = value - for part_id, part in self.parts.iteritems(): + for part_id, part in six.iteritems(self.parts): data["parts"][part_id] = {} - for language, sub_data in part.iteritems(): + for language, sub_data in six.iteritems(part): data["parts"][part_id][language] = {} - for sub_key, stored_subtitle in sub_data.iteritems(): + for sub_key, stored_subtitle in six.iteritems(sub_data): if sub_key == "current": data["parts"][part_id][language]["current"] = "__".join(stored_subtitle) elif sub_key == "blacklist": data["parts"][part_id][language]["blacklist"] = dict(("__".join(k), v) for k, v in - stored_subtitle.iteritems()) + six.iteritems(stored_subtitle)) else: if stored_subtitle.content and not stored_subtitle.encoding: continue @@ -223,7 +225,7 @@ class JSONStoredVideoSubtitles(object): if not all_subs: return out - for key, subtitle in all_subs.iteritems(): + for key, subtitle in six.iteritems(all_subs): if key in ("current", "blacklist"): continue @@ -238,7 +240,7 @@ class JSONStoredVideoSubtitles(object): return 0 subs = part.get(str(lang)) - return len(filter(lambda key: key not in ("current", "blacklist"), subs.keys())) + return len([key for key in list(subs.keys()) if key not in ("current", "blacklist")]) def get_sub_key(self, provider_name, id): return provider_name, str(id) @@ -275,7 +277,7 @@ class JSONStoredVideoSubtitles(object): subs["blacklist"] = current_bl def __repr__(self): - return unicode(self) + return six.text_type(self) def __unicode__(self): return u"%s (%s)" % (self.title, self.video_id) @@ -399,7 +401,7 @@ 
class StoredSubtitlesManager(object): pass # remove known info about non-existing languages - for part_id, part in stored_subs.parts.iteritems(): + for part_id, part in six.iteritems(stored_subs.parts): missing_languages = set(part).difference(wanted_languages) if missing_languages: logger.debug("Languages removed: %s:%s:%s, removing data", video_id, part_id, missing_languages) diff --git a/libs/subzero/util.py b/libs/subzero/util.py index 25ab0cd60..61f6e9299 100644 --- a/libs/subzero/util.py +++ b/libs/subzero/util.py @@ -1,7 +1,9 @@ # coding=utf-8 +from __future__ import absolute_import import os from subzero.lib.io import get_viable_encoding +import six def get_root_path(): - return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(unicode(__file__, get_viable_encoding()))), "..")) + return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(six.text_type(__file__, get_viable_encoding()))), "..")) diff --git a/libs/subzero/video.py b/libs/subzero/video.py index 160e1afec..0a06240f6 100644 --- a/libs/subzero/video.py +++ b/libs/subzero/video.py @@ -1,11 +1,13 @@ # coding=utf-8 +from __future__ import absolute_import import logging import os from babelfish.exceptions import LanguageError from subzero.language import Language, language_from_stream from subliminal_patch import scan_video, refine, search_external_subtitles +import six logger = logging.getLogger(__name__) @@ -100,7 +102,7 @@ def refine_video(video, no_refining=False, refiner_settings=None): # our own metadata refiner :) if "stream" in video_info: - for key, value in video_info["stream"].iteritems(): + for key, value in six.iteritems(video_info["stream"]): if hasattr(video, key) and not getattr(video, key): logger.info(u"Adding stream %s info: %s", key, value) setattr(video, key, value) diff --git a/libs/urllib3/__init__.py b/libs/urllib3/__init__.py index 61915465f..c4c0dde54 100644 --- a/libs/urllib3/__init__.py +++ b/libs/urllib3/__init__.py @@ -1,7 +1,6 @@ """ urllib3 - Thread-safe connection pooling and re-using. """ - from __future__ import absolute_import import warnings @@ -27,7 +26,7 @@ from logging import NullHandler __author__ = 'Andrey Petrov ([email protected])' __license__ = 'MIT' -__version__ = '1.24.2' +__version__ = '1.25.3' __all__ = ( 'HTTPConnectionPool', diff --git a/libs/urllib3/connection.py b/libs/urllib3/connection.py index 02b36654b..57c58fedb 100644 --- a/libs/urllib3/connection.py +++ b/libs/urllib3/connection.py @@ -19,10 +19,11 @@ except (ImportError, AttributeError): # Platform-specific: No SSL. pass -try: # Python 3: - # Not a no-op, we're adding this to the namespace so it can be imported. +try: + # Python 3: not a no-op, we're adding this to the namespace so it can be imported. ConnectionError = ConnectionError -except NameError: # Python 2: +except NameError: + # Python 2 class ConnectionError(Exception): pass @@ -101,7 +102,7 @@ class HTTPConnection(_HTTPConnection, object): is_verified = False def __init__(self, *args, **kw): - if six.PY3: # Python 3 + if six.PY3: kw.pop('strict', None) # Pre-set source_address. @@ -158,7 +159,7 @@ class HTTPConnection(_HTTPConnection, object): conn = connection.create_connection( (self._dns_host, self.port), self.timeout, **extra_kw) - except SocketTimeout as e: + except SocketTimeout: raise ConnectTimeoutError( self, "Connection to %s timed out. 
(connect timeout=%s)" % (self.host, self.timeout)) @@ -171,7 +172,8 @@ class HTTPConnection(_HTTPConnection, object): def _prepare_conn(self, conn): self.sock = conn - if self._tunnel_host: + # Google App Engine's httplib does not define _tunnel_host + if getattr(self, '_tunnel_host', None): # TODO: Fix tunnel so it doesn't depend on self.sock state. self._tunnel() # Mark this connection as not reusable @@ -226,7 +228,8 @@ class HTTPSConnection(HTTPConnection): ssl_version = None def __init__(self, host, port=None, key_file=None, cert_file=None, - strict=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + key_password=None, strict=None, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, ssl_context=None, server_hostname=None, **kw): HTTPConnection.__init__(self, host, port, strict=strict, @@ -234,6 +237,7 @@ class HTTPSConnection(HTTPConnection): self.key_file = key_file self.cert_file = cert_file + self.key_password = key_password self.ssl_context = ssl_context self.server_hostname = server_hostname @@ -245,16 +249,28 @@ class HTTPSConnection(HTTPConnection): conn = self._new_conn() self._prepare_conn(conn) + # Wrap socket using verification with the root certs in + # trusted_root_certs + default_ssl_context = False if self.ssl_context is None: + default_ssl_context = True self.ssl_context = create_urllib3_context( - ssl_version=resolve_ssl_version(None), - cert_reqs=resolve_cert_reqs(None), + ssl_version=resolve_ssl_version(self.ssl_version), + cert_reqs=resolve_cert_reqs(self.cert_reqs), ) + # Try to load OS default certs if none are given. + # Works well on Windows (requires Python3.4+) + context = self.ssl_context + if (not self.ca_certs and not self.ca_cert_dir and default_ssl_context + and hasattr(context, 'load_default_certs')): + context.load_default_certs() + self.sock = ssl_wrap_socket( sock=conn, keyfile=self.key_file, certfile=self.cert_file, + key_password=self.key_password, ssl_context=self.ssl_context, server_hostname=self.server_hostname ) @@ -272,25 +288,24 @@ class VerifiedHTTPSConnection(HTTPSConnection): assert_fingerprint = None def set_cert(self, key_file=None, cert_file=None, - cert_reqs=None, ca_certs=None, + cert_reqs=None, key_password=None, ca_certs=None, assert_hostname=None, assert_fingerprint=None, ca_cert_dir=None): """ This method should only be called once, before the connection is used. """ - # If cert_reqs is not provided, we can try to guess. If the user gave - # us a cert database, we assume they want to use it: otherwise, if - # they gave us an SSL Context object we should use whatever is set for - # it. + # If cert_reqs is not provided we'll assume CERT_REQUIRED unless we also + # have an SSLContext object in which case we'll use its verify_mode. if cert_reqs is None: - if ca_certs or ca_cert_dir: - cert_reqs = 'CERT_REQUIRED' - elif self.ssl_context is not None: + if self.ssl_context is not None: cert_reqs = self.ssl_context.verify_mode + else: + cert_reqs = resolve_cert_reqs(None) self.key_file = key_file self.cert_file = cert_file self.cert_reqs = cert_reqs + self.key_password = key_password self.assert_hostname = assert_hostname self.assert_fingerprint = assert_fingerprint self.ca_certs = ca_certs and os.path.expanduser(ca_certs) @@ -301,7 +316,8 @@ class VerifiedHTTPSConnection(HTTPSConnection): conn = self._new_conn() hostname = self.host - if self._tunnel_host: + # Google App Engine's httplib does not define _tunnel_host + if getattr(self, '_tunnel_host', None): self.sock = conn # Calls self._set_hostport(), so self.host is # self._tunnel_host below. 
@@ -326,7 +342,9 @@ class VerifiedHTTPSConnection(HTTPSConnection):
 
         # Wrap socket using verification with the root certs in
         # trusted_root_certs
+        default_ssl_context = False
         if self.ssl_context is None:
+            default_ssl_context = True
             self.ssl_context = create_urllib3_context(
                 ssl_version=resolve_ssl_version(self.ssl_version),
                 cert_reqs=resolve_cert_reqs(self.cert_reqs),
@@ -334,10 +352,18 @@ class VerifiedHTTPSConnection(HTTPSConnection):
 
         context = self.ssl_context
         context.verify_mode = resolve_cert_reqs(self.cert_reqs)
+
+        # Try to load OS default certs if none are given.
+        # Works well on Windows (requires Python3.4+)
+        if (not self.ca_certs and not self.ca_cert_dir and default_ssl_context
+                and hasattr(context, 'load_default_certs')):
+            context.load_default_certs()
+
         self.sock = ssl_wrap_socket(
             sock=conn,
             keyfile=self.key_file,
             certfile=self.cert_file,
+            key_password=self.key_password,
             ca_certs=self.ca_certs,
             ca_cert_dir=self.ca_cert_dir,
             server_hostname=server_hostname,
diff --git a/libs/urllib3/connectionpool.py b/libs/urllib3/connectionpool.py
index f7a8f193d..157568a39 100644
--- a/libs/urllib3/connectionpool.py
+++ b/libs/urllib3/connectionpool.py
@@ -26,6 +26,7 @@ from .exceptions import (
 from .packages.ssl_match_hostname import CertificateError
 from .packages import six
 from .packages.six.moves import queue
+from .packages.rfc3986.normalizers import normalize_host
 from .connection import (
     port_by_scheme,
     DummyConnection,
@@ -65,7 +66,7 @@ class ConnectionPool(object):
         if not host:
             raise LocationValueError("No host specified.")
 
-        self.host = _ipv6_host(host, self.scheme)
+        self.host = _normalize_host(host, scheme=self.scheme)
         self._proxy_host = host.lower()
         self.port = port
 
@@ -373,9 +374,11 @@ class HTTPConnectionPool(ConnectionPool, RequestMethods):
 
         # Receive the response from the server
         try:
-            try:  # Python 2.7, use buffering of HTTP responses
+            try:
+                # Python 2.7, use buffering of HTTP responses
                 httplib_response = conn.getresponse(buffering=True)
-            except TypeError:  # Python 3
+            except TypeError:
+                # Python 3
                 try:
                     httplib_response = conn.getresponse()
                 except Exception as e:
@@ -432,8 +435,8 @@ class HTTPConnectionPool(ConnectionPool, RequestMethods):
 
         # TODO: Add optional support for socket.gethostbyname checking.
         scheme, host, port = get_host(url)
-
-        host = _ipv6_host(host, self.scheme)
+        if host is not None:
+            host = _normalize_host(host, scheme=scheme)
 
         # Use explicit default port for comparison when none is given
         if self.port and not port:
@@ -672,7 +675,7 @@ class HTTPConnectionPool(ConnectionPool, RequestMethods):
                 # released back to the pool once the entire response is read
                 response.read()
             except (TimeoutError, HTTPException, SocketError, ProtocolError,
-                    BaseSSLError, SSLError) as e:
+                    BaseSSLError, SSLError):
                 pass
 
         # Handle redirect?
@@ -746,8 +749,8 @@ class HTTPSConnectionPool(HTTPConnectionPool):
     If ``assert_hostname`` is False, no verification is done.
 
     The ``key_file``, ``cert_file``, ``cert_reqs``, ``ca_certs``,
-    ``ca_cert_dir``, and ``ssl_version`` are only used if :mod:`ssl` is
-    available and are fed into :meth:`urllib3.util.ssl_wrap_socket` to upgrade
+    ``ca_cert_dir``, ``ssl_version``, ``key_password`` are only used if :mod:`ssl`
+    is available and are fed into :meth:`urllib3.util.ssl_wrap_socket` to upgrade
     the connection socket into an SSL socket.
""" @@ -759,7 +762,7 @@ class HTTPSConnectionPool(HTTPConnectionPool): block=False, headers=None, retries=None, _proxy=None, _proxy_headers=None, key_file=None, cert_file=None, cert_reqs=None, - ca_certs=None, ssl_version=None, + key_password=None, ca_certs=None, ssl_version=None, assert_hostname=None, assert_fingerprint=None, ca_cert_dir=None, **conn_kw): @@ -767,12 +770,10 @@ class HTTPSConnectionPool(HTTPConnectionPool): block, headers, retries, _proxy, _proxy_headers, **conn_kw) - if ca_certs and cert_reqs is None: - cert_reqs = 'CERT_REQUIRED' - self.key_file = key_file self.cert_file = cert_file self.cert_reqs = cert_reqs + self.key_password = key_password self.ca_certs = ca_certs self.ca_cert_dir = ca_cert_dir self.ssl_version = ssl_version @@ -787,6 +788,7 @@ class HTTPSConnectionPool(HTTPConnectionPool): if isinstance(conn, VerifiedHTTPSConnection): conn.set_cert(key_file=self.key_file, + key_password=self.key_password, cert_file=self.cert_file, cert_reqs=self.cert_reqs, ca_certs=self.ca_certs, @@ -824,7 +826,9 @@ class HTTPSConnectionPool(HTTPConnectionPool): conn = self.ConnectionCls(host=actual_host, port=actual_port, timeout=self.timeout.connect_timeout, - strict=self.strict, **self.conn_kw) + strict=self.strict, cert_file=self.cert_file, + key_file=self.key_file, key_password=self.key_password, + **self.conn_kw) return self._prepare_conn(conn) @@ -875,9 +879,9 @@ def connection_from_url(url, **kw): return HTTPConnectionPool(host, port=port, **kw) -def _ipv6_host(host, scheme): +def _normalize_host(host, scheme): """ - Process IPv6 address literals + Normalize hosts for comparisons and use with sockets. """ # httplib doesn't like it when we include brackets in IPv6 addresses @@ -886,11 +890,8 @@ def _ipv6_host(host, scheme): # Instead, we need to make sure we never pass ``None`` as the port. # However, for backward compatibility reasons we can't actually # *assert* that. See http://bugs.python.org/issue28539 - # - # Also if an IPv6 address literal has a zone identifier, the - # percent sign might be URIencoded, convert it back into ASCII if host.startswith('[') and host.endswith(']'): - host = host.replace('%25', '%').strip('[]') + host = host.strip('[]') if scheme in NORMALIZABLE_SCHEMES: - host = host.lower() + host = normalize_host(host) return host diff --git a/libs/urllib3/contrib/_securetransport/bindings.py b/libs/urllib3/contrib/_securetransport/bindings.py index bcf41c02b..be3421535 100644 --- a/libs/urllib3/contrib/_securetransport/bindings.py +++ b/libs/urllib3/contrib/_securetransport/bindings.py @@ -516,6 +516,8 @@ class SecurityConst(object): kTLSProtocol1 = 4 kTLSProtocol11 = 7 kTLSProtocol12 = 8 + kTLSProtocol13 = 10 + kTLSProtocolMaxSupported = 999 kSSLClientSide = 1 kSSLStreamType = 0 @@ -558,30 +560,27 @@ class SecurityConst(object): errSecInvalidTrustSettings = -25262 # Cipher suites. We only pick the ones our default cipher string allows. 
+ # Source: https://developer.apple.com/documentation/security/1550981-ssl_cipher_suite_values TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 = 0xC02C TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 = 0xC030 TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 = 0xC02B TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 = 0xC02F - TLS_DHE_DSS_WITH_AES_256_GCM_SHA384 = 0x00A3 + TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256 = 0xCCA9 + TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256 = 0xCCA8 TLS_DHE_RSA_WITH_AES_256_GCM_SHA384 = 0x009F - TLS_DHE_DSS_WITH_AES_128_GCM_SHA256 = 0x00A2 TLS_DHE_RSA_WITH_AES_128_GCM_SHA256 = 0x009E TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384 = 0xC024 TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384 = 0xC028 TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA = 0xC00A TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA = 0xC014 TLS_DHE_RSA_WITH_AES_256_CBC_SHA256 = 0x006B - TLS_DHE_DSS_WITH_AES_256_CBC_SHA256 = 0x006A TLS_DHE_RSA_WITH_AES_256_CBC_SHA = 0x0039 - TLS_DHE_DSS_WITH_AES_256_CBC_SHA = 0x0038 TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 = 0xC023 TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 = 0xC027 TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA = 0xC009 TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA = 0xC013 TLS_DHE_RSA_WITH_AES_128_CBC_SHA256 = 0x0067 - TLS_DHE_DSS_WITH_AES_128_CBC_SHA256 = 0x0040 TLS_DHE_RSA_WITH_AES_128_CBC_SHA = 0x0033 - TLS_DHE_DSS_WITH_AES_128_CBC_SHA = 0x0032 TLS_RSA_WITH_AES_256_GCM_SHA384 = 0x009D TLS_RSA_WITH_AES_128_GCM_SHA256 = 0x009C TLS_RSA_WITH_AES_256_CBC_SHA256 = 0x003D @@ -590,4 +589,5 @@ class SecurityConst(object): TLS_RSA_WITH_AES_128_CBC_SHA = 0x002F TLS_AES_128_GCM_SHA256 = 0x1301 TLS_AES_256_GCM_SHA384 = 0x1302 - TLS_CHACHA20_POLY1305_SHA256 = 0x1303 + TLS_AES_128_CCM_8_SHA256 = 0x1305 + TLS_AES_128_CCM_SHA256 = 0x1304 diff --git a/libs/urllib3/contrib/pyopenssl.py b/libs/urllib3/contrib/pyopenssl.py index 5ab7803ac..821c174fd 100644 --- a/libs/urllib3/contrib/pyopenssl.py +++ b/libs/urllib3/contrib/pyopenssl.py @@ -70,6 +70,7 @@ import sys from .. import util + __all__ = ['inject_into_urllib3', 'extract_from_urllib3'] # SNI always works. @@ -77,20 +78,19 @@ HAS_SNI = True # Map from urllib3 to PyOpenSSL compatible parameter-values. _openssl_versions = { - ssl.PROTOCOL_SSLv23: OpenSSL.SSL.SSLv23_METHOD, + util.PROTOCOL_TLS: OpenSSL.SSL.SSLv23_METHOD, ssl.PROTOCOL_TLSv1: OpenSSL.SSL.TLSv1_METHOD, } +if hasattr(ssl, 'PROTOCOL_SSLv3') and hasattr(OpenSSL.SSL, 'SSLv3_METHOD'): + _openssl_versions[ssl.PROTOCOL_SSLv3] = OpenSSL.SSL.SSLv3_METHOD + if hasattr(ssl, 'PROTOCOL_TLSv1_1') and hasattr(OpenSSL.SSL, 'TLSv1_1_METHOD'): _openssl_versions[ssl.PROTOCOL_TLSv1_1] = OpenSSL.SSL.TLSv1_1_METHOD if hasattr(ssl, 'PROTOCOL_TLSv1_2') and hasattr(OpenSSL.SSL, 'TLSv1_2_METHOD'): _openssl_versions[ssl.PROTOCOL_TLSv1_2] = OpenSSL.SSL.TLSv1_2_METHOD -try: - _openssl_versions.update({ssl.PROTOCOL_SSLv3: OpenSSL.SSL.SSLv3_METHOD}) -except AttributeError: - pass _stdlib_to_openssl_verify = { ssl.CERT_NONE: OpenSSL.SSL.VERIFY_NONE, @@ -117,6 +117,7 @@ def inject_into_urllib3(): _validate_dependencies_met() + util.SSLContext = PyOpenSSLContext util.ssl_.SSLContext = PyOpenSSLContext util.HAS_SNI = HAS_SNI util.ssl_.HAS_SNI = HAS_SNI @@ -127,6 +128,7 @@ def inject_into_urllib3(): def extract_from_urllib3(): 'Undo monkey-patching by :func:`inject_into_urllib3`.' + util.SSLContext = orig_util_SSLContext util.ssl_.SSLContext = orig_util_SSLContext util.HAS_SNI = orig_util_HAS_SNI util.ssl_.HAS_SNI = orig_util_HAS_SNI @@ -184,6 +186,7 @@ def _dnsname_to_stdlib(name): except idna.core.IDNAError: return None + # Don't send IPv6 addresses through the IDNA encoder. 
if ':' in name: return name @@ -279,7 +282,7 @@ class WrappedSocket(object): return b'' else: raise SocketError(str(e)) - except OpenSSL.SSL.ZeroReturnError as e: + except OpenSSL.SSL.ZeroReturnError: if self.connection.get_shutdown() == OpenSSL.SSL.RECEIVED_SHUTDOWN: return b'' else: @@ -289,6 +292,10 @@ class WrappedSocket(object): raise timeout('The read operation timed out') else: return self.recv(*args, **kwargs) + + # TLS 1.3 post-handshake authentication + except OpenSSL.SSL.Error as e: + raise ssl.SSLError("read error: %r" % e) else: return data @@ -300,7 +307,7 @@ class WrappedSocket(object): return 0 else: raise SocketError(str(e)) - except OpenSSL.SSL.ZeroReturnError as e: + except OpenSSL.SSL.ZeroReturnError: if self.connection.get_shutdown() == OpenSSL.SSL.RECEIVED_SHUTDOWN: return 0 else: @@ -311,6 +318,10 @@ class WrappedSocket(object): else: return self.recv_into(*args, **kwargs) + # TLS 1.3 post-handshake authentication + except OpenSSL.SSL.Error as e: + raise ssl.SSLError("read error: %r" % e) + def settimeout(self, timeout): return self.socket.settimeout(timeout) @@ -363,6 +374,9 @@ class WrappedSocket(object): 'subjectAltName': get_subj_alt_name(x509) } + def version(self): + return self.connection.get_protocol_version_name() + def _reuse(self): self._makefile_refs += 1 @@ -435,7 +449,9 @@ class PyOpenSSLContext(object): def load_cert_chain(self, certfile, keyfile=None, password=None): self._ctx.use_certificate_chain_file(certfile) if password is not None: - self._ctx.set_passwd_cb(lambda max_length, prompt_twice, userdata: password) + if not isinstance(password, six.binary_type): + password = password.encode('utf-8') + self._ctx.set_passwd_cb(lambda *_: password) self._ctx.use_privatekey_file(keyfile or certfile) def wrap_socket(self, sock, server_side=False, diff --git a/libs/urllib3/contrib/securetransport.py b/libs/urllib3/contrib/securetransport.py index 77cb59ed7..4dc484841 100644 --- a/libs/urllib3/contrib/securetransport.py +++ b/libs/urllib3/contrib/securetransport.py @@ -23,6 +23,31 @@ To use this module, simply import and inject it:: urllib3.contrib.securetransport.inject_into_urllib3() Happy TLSing! + +This code is a bastardised version of the code found in Will Bond's oscrypto +library. An enormous debt is owed to him for blazing this trail for us. For +that reason, this code should be considered to be covered both by urllib3's +license and by oscrypto's: + + Copyright (c) 2015-2016 Will Bond <[email protected]> + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. 
""" from __future__ import absolute_import @@ -86,35 +111,32 @@ SSL_WRITE_BLOCKSIZE = 16384 # individual cipher suites. We need to do this because this is how # SecureTransport wants them. CIPHER_SUITES = [ - SecurityConst.TLS_AES_256_GCM_SHA384, - SecurityConst.TLS_CHACHA20_POLY1305_SHA256, - SecurityConst.TLS_AES_128_GCM_SHA256, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, - SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, SecurityConst.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, - SecurityConst.TLS_DHE_DSS_WITH_AES_256_GCM_SHA384, + SecurityConst.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256, + SecurityConst.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256, SecurityConst.TLS_DHE_RSA_WITH_AES_256_GCM_SHA384, - SecurityConst.TLS_DHE_DSS_WITH_AES_128_GCM_SHA256, SecurityConst.TLS_DHE_RSA_WITH_AES_128_GCM_SHA256, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384, - SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, - SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, - SecurityConst.TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, - SecurityConst.TLS_DHE_DSS_WITH_AES_256_CBC_SHA256, - SecurityConst.TLS_DHE_RSA_WITH_AES_256_CBC_SHA, - SecurityConst.TLS_DHE_DSS_WITH_AES_256_CBC_SHA, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, - SecurityConst.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, SecurityConst.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, + SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, + SecurityConst.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, + SecurityConst.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, SecurityConst.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + SecurityConst.TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, + SecurityConst.TLS_DHE_RSA_WITH_AES_256_CBC_SHA, SecurityConst.TLS_DHE_RSA_WITH_AES_128_CBC_SHA256, - SecurityConst.TLS_DHE_DSS_WITH_AES_128_CBC_SHA256, SecurityConst.TLS_DHE_RSA_WITH_AES_128_CBC_SHA, - SecurityConst.TLS_DHE_DSS_WITH_AES_128_CBC_SHA, + SecurityConst.TLS_AES_256_GCM_SHA384, + SecurityConst.TLS_AES_128_GCM_SHA256, SecurityConst.TLS_RSA_WITH_AES_256_GCM_SHA384, SecurityConst.TLS_RSA_WITH_AES_128_GCM_SHA256, + SecurityConst.TLS_AES_128_CCM_8_SHA256, + SecurityConst.TLS_AES_128_CCM_SHA256, SecurityConst.TLS_RSA_WITH_AES_256_CBC_SHA256, SecurityConst.TLS_RSA_WITH_AES_128_CBC_SHA256, SecurityConst.TLS_RSA_WITH_AES_256_CBC_SHA, @@ -122,9 +144,10 @@ CIPHER_SUITES = [ ] # Basically this is simple: for PROTOCOL_SSLv23 we turn it into a low of -# TLSv1 and a high of TLSv1.2. For everything else, we pin to that version. +# TLSv1 and a high of TLSv1.3. For everything else, we pin to that version. +# TLSv1 to 1.2 are supported on macOS 10.8+ and TLSv1.3 is macOS 10.13+ _protocol_to_min_max = { - ssl.PROTOCOL_SSLv23: (SecurityConst.kTLSProtocol1, SecurityConst.kTLSProtocol12), + util.PROTOCOL_TLS: (SecurityConst.kTLSProtocol1, SecurityConst.kTLSProtocolMaxSupported), } if hasattr(ssl, "PROTOCOL_SSLv2"): @@ -147,14 +170,13 @@ if hasattr(ssl, "PROTOCOL_TLSv1_2"): _protocol_to_min_max[ssl.PROTOCOL_TLSv1_2] = ( SecurityConst.kTLSProtocol12, SecurityConst.kTLSProtocol12 ) -if hasattr(ssl, "PROTOCOL_TLS"): - _protocol_to_min_max[ssl.PROTOCOL_TLS] = _protocol_to_min_max[ssl.PROTOCOL_SSLv23] def inject_into_urllib3(): """ Monkey-patch urllib3 with SecureTransport-backed SSL-support. 
""" + util.SSLContext = SecureTransportContext util.ssl_.SSLContext = SecureTransportContext util.HAS_SNI = HAS_SNI util.ssl_.HAS_SNI = HAS_SNI @@ -166,6 +188,7 @@ def extract_from_urllib3(): """ Undo monkey-patching by :func:`inject_into_urllib3`. """ + util.SSLContext = orig_util_SSLContext util.ssl_.SSLContext = orig_util_SSLContext util.HAS_SNI = orig_util_HAS_SNI util.ssl_.HAS_SNI = orig_util_HAS_SNI @@ -458,7 +481,14 @@ class WrappedSocket(object): # Set the minimum and maximum TLS versions. result = Security.SSLSetProtocolVersionMin(self.context, min_version) _assert_no_error(result) + + # TLS 1.3 isn't necessarily enabled by the OS + # so we have to detect when we error out and try + # setting TLS 1.3 if it's allowed. kTLSProtocolMaxSupported + # was added in macOS 10.13 along with kTLSProtocol13. result = Security.SSLSetProtocolVersionMax(self.context, max_version) + if result != 0 and max_version == SecurityConst.kTLSProtocolMaxSupported: + result = Security.SSLSetProtocolVersionMax(self.context, SecurityConst.kTLSProtocol12) _assert_no_error(result) # If there's a trust DB, we need to use it. We do that by telling @@ -667,6 +697,25 @@ class WrappedSocket(object): return der_bytes + def version(self): + protocol = Security.SSLProtocol() + result = Security.SSLGetNegotiatedProtocolVersion(self.context, ctypes.byref(protocol)) + _assert_no_error(result) + if protocol.value == SecurityConst.kTLSProtocol13: + return 'TLSv1.3' + elif protocol.value == SecurityConst.kTLSProtocol12: + return 'TLSv1.2' + elif protocol.value == SecurityConst.kTLSProtocol11: + return 'TLSv1.1' + elif protocol.value == SecurityConst.kTLSProtocol1: + return 'TLSv1' + elif protocol.value == SecurityConst.kSSLProtocol3: + return 'SSLv3' + elif protocol.value == SecurityConst.kSSLProtocol2: + return 'SSLv2' + else: + raise ssl.SSLError('Unknown TLS version: %r' % protocol) + def _reuse(self): self._makefile_refs += 1 diff --git a/libs/urllib3/contrib/socks.py b/libs/urllib3/contrib/socks.py index 811e312ec..636d261fb 100644 --- a/libs/urllib3/contrib/socks.py +++ b/libs/urllib3/contrib/socks.py @@ -1,25 +1,38 @@ # -*- coding: utf-8 -*- """ This module contains provisional support for SOCKS proxies from within -urllib3. This module supports SOCKS4 (specifically the SOCKS4A variant) and +urllib3. This module supports SOCKS4, SOCKS4A (an extension of SOCKS4), and SOCKS5. To enable its functionality, either install PySocks or install this module with the ``socks`` extra. The SOCKS implementation supports the full range of urllib3 features. It also supports the following SOCKS features: -- SOCKS4 -- SOCKS4a -- SOCKS5 +- SOCKS4A (``proxy_url='socks4a://...``) +- SOCKS4 (``proxy_url='socks4://...``) +- SOCKS5 with remote DNS (``proxy_url='socks5h://...``) +- SOCKS5 with local DNS (``proxy_url='socks5://...``) - Usernames and passwords for the SOCKS proxy -Known Limitations: + .. note:: + It is recommended to use ``socks5h://`` or ``socks4a://`` schemes in + your ``proxy_url`` to ensure that DNS resolution is done from the remote + server instead of client-side when connecting to a domain name. + +SOCKS4 supports IPv4 and domain names with the SOCKS4A extension. SOCKS5 +supports IPv4, IPv6, and domain names. 
+
+When connecting to a SOCKS4 proxy the ``username`` portion of the ``proxy_url``
+will be sent as the ``userid`` section of the SOCKS request::
+
+    proxy_url="socks4a://<userid>@proxy-host"
+
+When connecting to a SOCKS5 proxy the ``username`` and ``password`` portion
+of the ``proxy_url`` will be sent as the username/password to authenticate
+with the proxy::
+
+    proxy_url="socks5h://<username>:<password>@proxy-host"
 
-- Currently PySocks does not support contacting remote websites via literal
-  IPv6 addresses. Any such connection attempt will fail. You must use a domain
-  name.
-- Currently PySocks does not support IPv6 connections to the SOCKS proxy. Any
-  such connection attempt will fail.
 """
 from __future__ import absolute_import
@@ -88,7 +101,7 @@ class SOCKSConnection(HTTPConnection):
                 **extra_kw
             )
 
-        except SocketTimeout as e:
+        except SocketTimeout:
             raise ConnectTimeoutError(
                 self, "Connection to %s timed out. (connect timeout=%s)" %
                 (self.host, self.timeout))
diff --git a/libs/urllib3/fields.py b/libs/urllib3/fields.py
index 37fe64a3e..6a9a5a7f5 100644
--- a/libs/urllib3/fields.py
+++ b/libs/urllib3/fields.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 import email.utils
 import mimetypes
+import re
 
 from .packages import six
 
@@ -19,57 +20,147 @@ def guess_content_type(filename, default='application/octet-stream'):
     return default
 
 
-def format_header_param(name, value):
+def format_header_param_rfc2231(name, value):
     """
-    Helper function to format and quote a single header parameter.
+    Helper function to format and quote a single header parameter using the
+    strategy defined in RFC 2231.
 
     Particularly useful for header parameters which might contain
-    non-ASCII values, like file names. This follows RFC 2231, as
-    suggested by RFC 2388 Section 4.4.
+    non-ASCII values, like file names. This follows RFC 2388 Section 4.4.
 
     :param name:
         The name of the parameter, a string expected to be ASCII only.
     :param value:
-        The value of the parameter, provided as a unicode string.
+        The value of the parameter, provided as ``bytes`` or ``str``.
+    :ret:
+        An RFC-2231-formatted unicode string.
     """
+    if isinstance(value, six.binary_type):
+        value = value.decode("utf-8")
+
     if not any(ch in value for ch in '"\\\r\n'):
-        result = '%s="%s"' % (name, value)
+        result = u'%s="%s"' % (name, value)
         try:
             result.encode('ascii')
         except (UnicodeEncodeError, UnicodeDecodeError):
             pass
         else:
             return result
-    if not six.PY3 and isinstance(value, six.text_type):  # Python 2:
+
+    if not six.PY3:  # Python 2:
         value = value.encode('utf-8')
+
+    # encode_rfc2231 accepts an encoded string and returns an ascii-encoded
+    # string in Python 2 but accepts and returns unicode strings in Python 3
     value = email.utils.encode_rfc2231(value, 'utf-8')
     value = '%s*=%s' % (name, value)
+
+    if not six.PY3:  # Python 2:
+        value = value.decode('utf-8')
+
     return value
 
 
+_HTML5_REPLACEMENTS = {
+    u"\u0022": u"%22",
+    # Replace "\" with "\\".
+    u"\u005C": u"\u005C\u005C",
+}
+
+# All control characters from 0x00 to 0x1F *except* 0x1B.
+_HTML5_REPLACEMENTS.update({
+    six.unichr(cc): u"%{:02X}".format(cc)
+    for cc
+    in range(0x00, 0x1F+1)
+    if cc not in (0x1B,)
+})
+
+
+def _replace_multiple(value, needles_and_replacements):
+
+    def replacer(match):
+        return needles_and_replacements[match.group(0)]
+
+    pattern = re.compile(
+        r"|".join([
+            re.escape(needle) for needle in needles_and_replacements.keys()
+        ])
+    )
+
+    result = pattern.sub(replacer, value)
+
+    return result
+
+
+def format_header_param_html5(name, value):
+    """
+    Helper function to format and quote a single header parameter using the
+    HTML5 strategy.
+
+    Particularly useful for header parameters which might contain
+    non-ASCII values, like file names. This follows the `HTML5 Working Draft
+    Section 4.10.22.7`_ and matches the behavior of curl and modern browsers.
+
+    .. _HTML5 Working Draft Section 4.10.22.7:
+        https://w3c.github.io/html/sec-forms.html#multipart-form-data
+
+    :param name:
+        The name of the parameter, a string expected to be ASCII only.
+    :param value:
+        The value of the parameter, provided as ``bytes`` or ``str``.
+    :ret:
+        A unicode string, stripped of troublesome characters.
+    """
+    if isinstance(value, six.binary_type):
+        value = value.decode("utf-8")
+
+    value = _replace_multiple(value, _HTML5_REPLACEMENTS)
+
+    return u'%s="%s"' % (name, value)
+
+
+# For backwards-compatibility.
+format_header_param = format_header_param_html5
+
+
 class RequestField(object):
     """
     A data container for request body parameters.
 
     :param name:
-        The name of this request field.
+        The name of this request field. Must be unicode.
     :param data:
         The data/value body.
     :param filename:
-        An optional filename of the request field.
+        An optional filename of the request field. Must be unicode.
     :param headers:
         An optional dict-like object of headers to initially use for the field.
+    :param header_formatter:
+        An optional callable that is used to encode and format the headers. By
+        default, this is :func:`format_header_param_html5`.
     """
-    def __init__(self, name, data, filename=None, headers=None):
+    def __init__(
+            self,
+            name,
+            data,
+            filename=None,
+            headers=None,
+            header_formatter=format_header_param_html5):
         self._name = name
         self._filename = filename
         self.data = data
         self.headers = {}
         if headers:
             self.headers = dict(headers)
+        self.header_formatter = header_formatter
 
     @classmethod
-    def from_tuples(cls, fieldname, value):
+    def from_tuples(
+            cls,
+            fieldname,
+            value,
+            header_formatter=format_header_param_html5):
         """
         A :class:`~urllib3.fields.RequestField` factory from old-style tuple parameters.
@@ -97,21 +188,24 @@ class RequestField(object):
             content_type = None
             data = value
 
-        request_param = cls(fieldname, data, filename=filename)
+        request_param = cls(
+            fieldname, data, filename=filename, header_formatter=header_formatter)
         request_param.make_multipart(content_type=content_type)
 
         return request_param
 
     def _render_part(self, name, value):
         """
-        Overridable helper function to format a single header parameter.
+        Overridable helper function to format a single header parameter. By
+        default, this calls ``self.header_formatter``.
 
         :param name:
             The name of the parameter, a string expected to be ASCII only.
         :param value:
            The value of the parameter, provided as a unicode string.
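        An editorial illustration of the two formatter strategies defined
        above (not part of this patch; outputs follow from the replacement
        rules shown)::

            format_header_param_html5(u'filename', u'na"me.txt')
            # -> u'filename="na%22me.txt"'
            format_header_param_rfc2231(u'filename', u'd\xf6ner.txt')
            # -> u"filename*=utf-8''d%C3%B6ner.txt"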
""" - return format_header_param(name, value) + + return self.header_formatter(name, value) def _render_parts(self, header_parts): """ @@ -133,7 +227,7 @@ class RequestField(object): if value is not None: parts.append(self._render_part(name, value)) - return '; '.join(parts) + return u'; '.join(parts) def render_headers(self): """ @@ -144,15 +238,15 @@ class RequestField(object): sort_keys = ['Content-Disposition', 'Content-Type', 'Content-Location'] for sort_key in sort_keys: if self.headers.get(sort_key, False): - lines.append('%s: %s' % (sort_key, self.headers[sort_key])) + lines.append(u'%s: %s' % (sort_key, self.headers[sort_key])) for header_name, header_value in self.headers.items(): if header_name not in sort_keys: if header_value: - lines.append('%s: %s' % (header_name, header_value)) + lines.append(u'%s: %s' % (header_name, header_value)) - lines.append('\r\n') - return '\r\n'.join(lines) + lines.append(u'\r\n') + return u'\r\n'.join(lines) def make_multipart(self, content_disposition=None, content_type=None, content_location=None): @@ -168,10 +262,10 @@ class RequestField(object): The 'Content-Location' of the request body. """ - self.headers['Content-Disposition'] = content_disposition or 'form-data' - self.headers['Content-Disposition'] += '; '.join([ - '', self._render_parts( - (('name', self._name), ('filename', self._filename)) + self.headers['Content-Disposition'] = content_disposition or u'form-data' + self.headers['Content-Disposition'] += u'; '.join([ + u'', self._render_parts( + ((u'name', self._name), (u'filename', self._filename)) ) ]) self.headers['Content-Type'] = content_type diff --git a/libs/urllib3/packages/ordered_dict.py b/libs/urllib3/packages/ordered_dict.py deleted file mode 100644 index 4479363cc..000000000 --- a/libs/urllib3/packages/ordered_dict.py +++ /dev/null @@ -1,259 +0,0 @@ -# Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy. -# Passes Python2.7's test suite and incorporates all the latest updates. -# Copyright 2009 Raymond Hettinger, released under the MIT License. -# http://code.activestate.com/recipes/576693/ -try: - from thread import get_ident as _get_ident -except ImportError: - from dummy_thread import get_ident as _get_ident - -try: - from _abcoll import KeysView, ValuesView, ItemsView -except ImportError: - pass - - -class OrderedDict(dict): - 'Dictionary that remembers insertion order' - # An inherited dict maps keys to values. - # The inherited dict provides __getitem__, __len__, __contains__, and get. - # The remaining methods are order-aware. - # Big-O running times for all methods are the same as for regular dictionaries. - - # The internal self.__map dictionary maps keys to links in a doubly linked list. - # The circular doubly linked list starts and ends with a sentinel element. - # The sentinel element never gets deleted (this simplifies the algorithm). - # Each link is stored as a list of length three: [PREV, NEXT, KEY]. - - def __init__(self, *args, **kwds): - '''Initialize an ordered dictionary. Signature is the same as for - regular dictionaries, but keyword arguments are not recommended - because their insertion order is arbitrary. 
- - ''' - if len(args) > 1: - raise TypeError('expected at most 1 arguments, got %d' % len(args)) - try: - self.__root - except AttributeError: - self.__root = root = [] # sentinel node - root[:] = [root, root, None] - self.__map = {} - self.__update(*args, **kwds) - - def __setitem__(self, key, value, dict_setitem=dict.__setitem__): - 'od.__setitem__(i, y) <==> od[i]=y' - # Setting a new item creates a new link which goes at the end of the linked - # list, and the inherited dictionary is updated with the new key/value pair. - if key not in self: - root = self.__root - last = root[0] - last[1] = root[0] = self.__map[key] = [last, root, key] - dict_setitem(self, key, value) - - def __delitem__(self, key, dict_delitem=dict.__delitem__): - 'od.__delitem__(y) <==> del od[y]' - # Deleting an existing item uses self.__map to find the link which is - # then removed by updating the links in the predecessor and successor nodes. - dict_delitem(self, key) - link_prev, link_next, key = self.__map.pop(key) - link_prev[1] = link_next - link_next[0] = link_prev - - def __iter__(self): - 'od.__iter__() <==> iter(od)' - root = self.__root - curr = root[1] - while curr is not root: - yield curr[2] - curr = curr[1] - - def __reversed__(self): - 'od.__reversed__() <==> reversed(od)' - root = self.__root - curr = root[0] - while curr is not root: - yield curr[2] - curr = curr[0] - - def clear(self): - 'od.clear() -> None. Remove all items from od.' - try: - for node in self.__map.itervalues(): - del node[:] - root = self.__root - root[:] = [root, root, None] - self.__map.clear() - except AttributeError: - pass - dict.clear(self) - - def popitem(self, last=True): - '''od.popitem() -> (k, v), return and remove a (key, value) pair. - Pairs are returned in LIFO order if last is true or FIFO order if false. - - ''' - if not self: - raise KeyError('dictionary is empty') - root = self.__root - if last: - link = root[0] - link_prev = link[0] - link_prev[1] = root - root[0] = link_prev - else: - link = root[1] - link_next = link[1] - root[1] = link_next - link_next[0] = root - key = link[2] - del self.__map[key] - value = dict.pop(self, key) - return key, value - - # -- the following methods do not depend on the internal structure -- - - def keys(self): - 'od.keys() -> list of keys in od' - return list(self) - - def values(self): - 'od.values() -> list of values in od' - return [self[key] for key in self] - - def items(self): - 'od.items() -> list of (key, value) pairs in od' - return [(key, self[key]) for key in self] - - def iterkeys(self): - 'od.iterkeys() -> an iterator over the keys in od' - return iter(self) - - def itervalues(self): - 'od.itervalues -> an iterator over the values in od' - for k in self: - yield self[k] - - def iteritems(self): - 'od.iteritems -> an iterator over the (key, value) items in od' - for k in self: - yield (k, self[k]) - - def update(*args, **kwds): - '''od.update(E, **F) -> None. Update od from dict/iterable E and F. 
- - If E is a dict instance, does: for k in E: od[k] = E[k] - If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] - Or if E is an iterable of items, does: for k, v in E: od[k] = v - In either case, this is followed by: for k, v in F.items(): od[k] = v - - ''' - if len(args) > 2: - raise TypeError('update() takes at most 2 positional ' - 'arguments (%d given)' % (len(args),)) - elif not args: - raise TypeError('update() takes at least 1 argument (0 given)') - self = args[0] - # Make progressively weaker assumptions about "other" - other = () - if len(args) == 2: - other = args[1] - if isinstance(other, dict): - for key in other: - self[key] = other[key] - elif hasattr(other, 'keys'): - for key in other.keys(): - self[key] = other[key] - else: - for key, value in other: - self[key] = value - for key, value in kwds.items(): - self[key] = value - - __update = update # let subclasses override update without breaking __init__ - - __marker = object() - - def pop(self, key, default=__marker): - '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. - If key is not found, d is returned if given, otherwise KeyError is raised. - - ''' - if key in self: - result = self[key] - del self[key] - return result - if default is self.__marker: - raise KeyError(key) - return default - - def setdefault(self, key, default=None): - 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' - if key in self: - return self[key] - self[key] = default - return default - - def __repr__(self, _repr_running={}): - 'od.__repr__() <==> repr(od)' - call_key = id(self), _get_ident() - if call_key in _repr_running: - return '...' - _repr_running[call_key] = 1 - try: - if not self: - return '%s()' % (self.__class__.__name__,) - return '%s(%r)' % (self.__class__.__name__, self.items()) - finally: - del _repr_running[call_key] - - def __reduce__(self): - 'Return state information for pickling' - items = [[k, self[k]] for k in self] - inst_dict = vars(self).copy() - for k in vars(OrderedDict()): - inst_dict.pop(k, None) - if inst_dict: - return (self.__class__, (items,), inst_dict) - return self.__class__, (items,) - - def copy(self): - 'od.copy() -> a shallow copy of od' - return self.__class__(self) - - @classmethod - def fromkeys(cls, iterable, value=None): - '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S - and values equal to v (which defaults to None). - - ''' - d = cls() - for key in iterable: - d[key] = value - return d - - def __eq__(self, other): - '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive - while comparison to a regular mapping is order-insensitive. 
- - ''' - if isinstance(other, OrderedDict): - return len(self)==len(other) and self.items() == other.items() - return dict.__eq__(self, other) - - def __ne__(self, other): - return not self == other - - # -- the following methods are only used in Python 2.7 -- - - def viewkeys(self): - "od.viewkeys() -> a set-like object providing a view on od's keys" - return KeysView(self) - - def viewvalues(self): - "od.viewvalues() -> an object providing a view on od's values" - return ValuesView(self) - - def viewitems(self): - "od.viewitems() -> a set-like object providing a view on od's items" - return ItemsView(self) diff --git a/libs/urllib3/packages/rfc3986/__init__.py b/libs/urllib3/packages/rfc3986/__init__.py new file mode 100644 index 000000000..371c6dd51 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/__init__.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +An implementation of semantics and validations described in RFC 3986. + +See http://rfc3986.readthedocs.io/ for detailed documentation. + +:copyright: (c) 2014 Rackspace +:license: Apache v2.0, see LICENSE for details +""" + +from .api import iri_reference +from .api import IRIReference +from .api import is_valid_uri +from .api import normalize_uri +from .api import uri_reference +from .api import URIReference +from .api import urlparse +from .parseresult import ParseResult + +__title__ = 'rfc3986' +__author__ = 'Ian Stapleton Cordasco' +__author_email__ = '[email protected]' +__license__ = 'Apache v2.0' +__copyright__ = 'Copyright 2014 Rackspace' +__version__ = '1.3.2' + +__all__ = ( + 'ParseResult', + 'URIReference', + 'IRIReference', + 'is_valid_uri', + 'normalize_uri', + 'uri_reference', + 'iri_reference', + 'urlparse', + '__title__', + '__author__', + '__author_email__', + '__license__', + '__copyright__', + '__version__', +) diff --git a/libs/urllib3/packages/rfc3986/_mixin.py b/libs/urllib3/packages/rfc3986/_mixin.py new file mode 100644 index 000000000..543925cdb --- /dev/null +++ b/libs/urllib3/packages/rfc3986/_mixin.py @@ -0,0 +1,353 @@ +"""Module containing the implementation of the URIMixin class.""" +import warnings + +from . import exceptions as exc +from . import misc +from . import normalizers +from . import validators + + +class URIMixin(object): + """Mixin with all shared methods for URIs and IRIs.""" + + __hash__ = tuple.__hash__ + + def authority_info(self): + """Return a dictionary with the ``userinfo``, ``host``, and ``port``. + + If the authority is not valid, it will raise a + :class:`~rfc3986.exceptions.InvalidAuthority` Exception. + + :returns: + ``{'userinfo': 'username:password', 'host': 'www.example.com', + 'port': '80'}`` + :rtype: dict + :raises rfc3986.exceptions.InvalidAuthority: + If the authority is not ``None`` and can not be parsed. 
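        An editorial sketch of the behaviour (not part of this patch; the
        values follow from the subauthority pattern shown in misc.py)::

            import rfc3986
            uri = rfc3986.uri_reference('https://user:[email protected]:443/path')
            uri.authority_info()
            # -> {'userinfo': 'user:pass', 'host': 'example.com', 'port': '443'}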
+ """ + if not self.authority: + return {'userinfo': None, 'host': None, 'port': None} + + match = self._match_subauthority() + + if match is None: + # In this case, we have an authority that was parsed from the URI + # Reference, but it cannot be further parsed by our + # misc.SUBAUTHORITY_MATCHER. In this case it must not be a valid + # authority. + raise exc.InvalidAuthority(self.authority.encode(self.encoding)) + + # We had a match, now let's ensure that it is actually a valid host + # address if it is IPv4 + matches = match.groupdict() + host = matches.get('host') + + if (host and misc.IPv4_MATCHER.match(host) and not + validators.valid_ipv4_host_address(host)): + # If we have a host, it appears to be IPv4 and it does not have + # valid bytes, it is an InvalidAuthority. + raise exc.InvalidAuthority(self.authority.encode(self.encoding)) + + return matches + + def _match_subauthority(self): + return misc.SUBAUTHORITY_MATCHER.match(self.authority) + + @property + def host(self): + """If present, a string representing the host.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['host'] + + @property + def port(self): + """If present, the port extracted from the authority.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['port'] + + @property + def userinfo(self): + """If present, the userinfo extracted from the authority.""" + try: + authority = self.authority_info() + except exc.InvalidAuthority: + return None + return authority['userinfo'] + + def is_absolute(self): + """Determine if this URI Reference is an absolute URI. + + See http://tools.ietf.org/html/rfc3986#section-4.3 for explanation. + + :returns: ``True`` if it is an absolute URI, ``False`` otherwise. + :rtype: bool + """ + return bool(misc.ABSOLUTE_URI_MATCHER.match(self.unsplit())) + + def is_valid(self, **kwargs): + """Determine if the URI is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param bool require_scheme: Set to ``True`` if you wish to require the + presence of the scheme component. + :param bool require_authority: Set to ``True`` if you wish to require + the presence of the authority component. + :param bool require_path: Set to ``True`` if you wish to require the + presence of the path component. + :param bool require_query: Set to ``True`` if you wish to require the + presence of the query component. + :param bool require_fragment: Set to ``True`` if you wish to require + the presence of the fragment component. + :returns: ``True`` if the URI is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + validators = [ + (self.scheme_is_valid, kwargs.get('require_scheme', False)), + (self.authority_is_valid, kwargs.get('require_authority', False)), + (self.path_is_valid, kwargs.get('require_path', False)), + (self.query_is_valid, kwargs.get('require_query', False)), + (self.fragment_is_valid, kwargs.get('require_fragment', False)), + ] + return all(v(r) for v, r in validators) + + def authority_is_valid(self, require=False): + """Determine if the authority component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param bool require: + Set to ``True`` to require the presence of this component. + :returns: + ``True`` if the authority is valid. 
``False`` otherwise. + :rtype: + bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + try: + self.authority_info() + except exc.InvalidAuthority: + return False + + return validators.authority_is_valid( + self.authority, + host=self.host, + require=require, + ) + + def scheme_is_valid(self, require=False): + """Determine if the scheme component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the scheme is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.scheme_is_valid(self.scheme, require) + + def path_is_valid(self, require=False): + """Determine if the path component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the path is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.path_is_valid(self.path, require) + + def query_is_valid(self, require=False): + """Determine if the query component is valid. + + .. deprecated:: 1.1.0 + + Use the :class:`~rfc3986.validators.Validator` object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the query is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.query_is_valid(self.query, require) + + def fragment_is_valid(self, require=False): + """Determine if the fragment component is valid. + + .. deprecated:: 1.1.0 + + Use the Validator object instead. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the fragment is valid. ``False`` otherwise. + :rtype: bool + """ + warnings.warn("Please use rfc3986.validators.Validator instead. " + "This method will be eventually removed.", + DeprecationWarning) + return validators.fragment_is_valid(self.fragment, require) + + def normalized_equality(self, other_ref): + """Compare this URIReference to another URIReference. + + :param URIReference other_ref: (required), The reference with which + we're comparing. + :returns: ``True`` if the references are equal, ``False`` otherwise. + :rtype: bool + """ + return tuple(self.normalize()) == tuple(other_ref.normalize()) + + def resolve_with(self, base_uri, strict=False): + """Use an absolute URI Reference to resolve this relative reference. + + Assuming this is a relative reference that you would like to resolve, + use the provided base URI to resolve it. + + See http://tools.ietf.org/html/rfc3986#section-5 for more information. + + :param base_uri: Either a string or URIReference. It must be an + absolute URI or it will raise an exception. + :returns: A new URIReference which is the result of resolving this + reference using ``base_uri``. + :rtype: :class:`URIReference` + :raises rfc3986.exceptions.ResolutionError: + If the ``base_uri`` is not an absolute URI. 
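        An editorial sketch of RFC 3986 reference resolution (not part of
        this patch)::

            import rfc3986
            ref = rfc3986.uri_reference('sub/path?q=1')
            ref.resolve_with('http://example.com/a/b').unsplit()
            # -> 'http://example.com/a/sub/path?q=1'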
+ """ + if not isinstance(base_uri, URIMixin): + base_uri = type(self).from_string(base_uri) + + if not base_uri.is_absolute(): + raise exc.ResolutionError(base_uri) + + # This is optional per + # http://tools.ietf.org/html/rfc3986#section-5.2.1 + base_uri = base_uri.normalize() + + # The reference we're resolving + resolving = self + + if not strict and resolving.scheme == base_uri.scheme: + resolving = resolving.copy_with(scheme=None) + + # http://tools.ietf.org/html/rfc3986#page-32 + if resolving.scheme is not None: + target = resolving.copy_with( + path=normalizers.normalize_path(resolving.path) + ) + else: + if resolving.authority is not None: + target = resolving.copy_with( + scheme=base_uri.scheme, + path=normalizers.normalize_path(resolving.path) + ) + else: + if resolving.path is None: + if resolving.query is not None: + query = resolving.query + else: + query = base_uri.query + target = resolving.copy_with( + scheme=base_uri.scheme, + authority=base_uri.authority, + path=base_uri.path, + query=query + ) + else: + if resolving.path.startswith('/'): + path = normalizers.normalize_path(resolving.path) + else: + path = normalizers.normalize_path( + misc.merge_paths(base_uri, resolving.path) + ) + target = resolving.copy_with( + scheme=base_uri.scheme, + authority=base_uri.authority, + path=path, + query=resolving.query + ) + return target + + def unsplit(self): + """Create a URI string from the components. + + :returns: The URI Reference reconstituted as a string. + :rtype: str + """ + # See http://tools.ietf.org/html/rfc3986#section-5.3 + result_list = [] + if self.scheme: + result_list.extend([self.scheme, ':']) + if self.authority: + result_list.extend(['//', self.authority]) + if self.path: + result_list.append(self.path) + if self.query is not None: + result_list.extend(['?', self.query]) + if self.fragment is not None: + result_list.extend(['#', self.fragment]) + return ''.join(result_list) + + def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting, + path=misc.UseExisting, query=misc.UseExisting, + fragment=misc.UseExisting): + """Create a copy of this reference with the new components. + + :param str scheme: + (optional) The scheme to use for the new reference. + :param str authority: + (optional) The authority to use for the new reference. + :param str path: + (optional) The path to use for the new reference. + :param str query: + (optional) The query to use for the new reference. + :param str fragment: + (optional) The fragment to use for the new reference. + :returns: + New URIReference with provided components. + :rtype: + URIReference + """ + attributes = { + 'scheme': scheme, + 'authority': authority, + 'path': path, + 'query': query, + 'fragment': fragment, + } + for key, value in list(attributes.items()): + if value is misc.UseExisting: + del attributes[key] + uri = self._replace(**attributes) + uri.encoding = self.encoding + return uri diff --git a/libs/urllib3/packages/rfc3986/abnf_regexp.py b/libs/urllib3/packages/rfc3986/abnf_regexp.py new file mode 100644 index 000000000..24c9c3d00 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/abnf_regexp.py @@ -0,0 +1,267 @@ +# -*- coding: utf-8 -*- +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module for the regular expressions crafted from ABNF.""" + +import sys + +# https://tools.ietf.org/html/rfc3986#page-13 +GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@" +GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS) +# https://tools.ietf.org/html/rfc3986#page-13 +SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;=" +SUB_DELIMITERS_SET = set(SUB_DELIMITERS) +# Escape the '*' for use in regular expressions +SUB_DELIMITERS_RE = r"!$&'()\*+,;=" +RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET) +ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' +DIGIT = '0123456789' +# https://tools.ietf.org/html/rfc3986#section-2.3 +UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r'._!-' +UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS) +NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET) +# We need to escape the '-' in this case: +UNRESERVED_RE = r'A-Za-z0-9._~\-' + +# Percent encoded character values +PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}' +PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED + +# NOTE(sigmavirus24): We're going to use more strict regular expressions +# than appear in Appendix B for scheme. This will prevent over-eager +# consuming of items that aren't schemes. +SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*' +_AUTHORITY_RE = '[^/?#]*' +_PATH_RE = '[^?#]*' +_QUERY_RE = '[^#]*' +_FRAGMENT_RE = '.*' + +# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B +COMPONENT_PATTERN_DICT = { + 'scheme': SCHEME_RE, + 'authority': _AUTHORITY_RE, + 'path': _PATH_RE, + 'query': _QUERY_RE, + 'fragment': _FRAGMENT_RE, +} + +# See http://tools.ietf.org/html/rfc3986#appendix-B +# In this case, we name each of the important matches so we can use +# SRE_Match#groupdict to parse the values out if we so choose. This is also +# modified to ignore other matches that are not important to the parsing of +# the reference so we can also simply use SRE_Match#groups. +URL_PARSING_RE = ( + r'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?' + r'(?P<path>{path})(?:\?(?P<query>{query}))?' + r'(?:#(?P<fragment>{fragment}))?' 
+).format(**COMPONENT_PATTERN_DICT) + + +# ######################### +# Authority Matcher Section +# ######################### + +# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2 +# The pattern for a regular name, e.g., www.google.com, api.github.com +REGULAR_NAME_RE = REG_NAME = '((?:{0}|[{1}])*)'.format( + '%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE +) +# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1, +IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}' +# Hexadecimal characters used in each piece of an IPv6 address +HEXDIG_RE = '[0-9A-Fa-f]{1,4}' +# Least-significant 32 bits of an IPv6 address +LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE) +# Substitutions into the following patterns for IPv6 patterns defined +# http://tools.ietf.org/html/rfc3986#page-20 +_subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE} + +# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details +# about ABNF (Augmented Backus-Naur Form) use in the comments +variations = [ + # 6( h16 ":" ) ls32 + '(%(hex)s:){6}%(ls32)s' % _subs, + # "::" 5( h16 ":" ) ls32 + '::(%(hex)s:){5}%(ls32)s' % _subs, + # [ h16 ] "::" 4( h16 ":" ) ls32 + '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs, + # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs, + # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs, + # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs, + # [ *4( h16 ":" ) h16 ] "::" ls32 + '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs, + # [ *5( h16 ":" ) h16 ] "::" h16 + '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs, + # [ *6( h16 ":" ) h16 ] "::" + '((%(hex)s:){0,6}%(hex)s)?::' % _subs, +] + +IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format( + *variations +) + +IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % ( + UNRESERVED_RE + SUB_DELIMITERS_RE + ':' +) + +# RFC 6874 Zone ID ABNF +ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+' + +IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?' +IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?' + +IP_LITERAL_RE = r'\[({0}|{1})\]'.format( + IPv6_ADDRZ_RFC4007_RE, + IPv_FUTURE_RE, +) + +# Pattern for matching the host piece of the authority +HOST_RE = HOST_PATTERN = '({0}|{1}|{2})'.format( + REG_NAME, + IPv4_RE, + IP_LITERAL_RE, +) +USERINFO_RE = '^([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':]|%s)+' % ( + PCT_ENCODED +) +PORT_RE = '[0-9]{1,5}' + +# #################### +# Path Matcher Section +# #################### + +# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information +# about the path patterns defined below. +segments = { + 'segment': PCHAR + '*', + # Non-zero length segment + 'segment-nz': PCHAR + '+', + # Non-zero length segment without ":" + 'segment-nz-nc': PCHAR.replace(':', '') + '+' +} + +# Path types taken from Section 3.3 (linked above) +PATH_EMPTY = '^$' +PATH_ROOTLESS = '%(segment-nz)s(/%(segment)s)*' % segments +PATH_NOSCHEME = '%(segment-nz-nc)s(/%(segment)s)*' % segments +PATH_ABSOLUTE = '/(%s)?' 
% PATH_ROOTLESS +PATH_ABEMPTY = '(/%(segment)s)*' % segments +PATH_RE = '^(%s|%s|%s|%s|%s)$' % ( + PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY +) + +FRAGMENT_RE = QUERY_RE = ( + '^([/?:@' + UNRESERVED_RE + SUB_DELIMITERS_RE + ']|%s)*$' % PCT_ENCODED +) + +# ########################## +# Relative reference matcher +# ########################## + +# See http://tools.ietf.org/html/rfc3986#section-4.2 for details +RELATIVE_PART_RE = '(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + PATH_ABEMPTY, + PATH_ABSOLUTE, + PATH_NOSCHEME, + PATH_EMPTY, +) + +# See http://tools.ietf.org/html/rfc3986#section-3 for definition +HIER_PART_RE = '(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + PATH_ABEMPTY, + PATH_ABSOLUTE, + PATH_ROOTLESS, + PATH_EMPTY, +) + +# ############### +# IRIs / RFC 3987 +# ############### + +# Only wide-unicode gets the high-ranges of UCSCHAR +if sys.maxunicode > 0xFFFF: # pragma: no cover + IPRIVATE = u'\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD' + UCSCHAR_RE = ( + u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF' + u'\U00010000-\U0001FFFD\U00020000-\U0002FFFD' + u'\U00030000-\U0003FFFD\U00040000-\U0004FFFD' + u'\U00050000-\U0005FFFD\U00060000-\U0006FFFD' + u'\U00070000-\U0007FFFD\U00080000-\U0008FFFD' + u'\U00090000-\U0009FFFD\U000A0000-\U000AFFFD' + u'\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD' + u'\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD' + ) +else: # pragma: no cover + IPRIVATE = u'\uE000-\uF8FF' + UCSCHAR_RE = ( + u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF' + ) + +IUNRESERVED_RE = u'A-Za-z0-9\\._~\\-' + UCSCHAR_RE +IPCHAR = u'([' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':@]|%s)' % PCT_ENCODED + +isegments = { + 'isegment': IPCHAR + u'*', + # Non-zero length segment + 'isegment-nz': IPCHAR + u'+', + # Non-zero length segment without ":" + 'isegment-nz-nc': IPCHAR.replace(':', '') + u'+' +} + +IPATH_ROOTLESS = u'%(isegment-nz)s(/%(isegment)s)*' % isegments +IPATH_NOSCHEME = u'%(isegment-nz-nc)s(/%(isegment)s)*' % isegments +IPATH_ABSOLUTE = u'/(?:%s)?' % IPATH_ROOTLESS +IPATH_ABEMPTY = u'(?:/%(isegment)s)*' % isegments +IPATH_RE = u'^(?:%s|%s|%s|%s|%s)$' % ( + IPATH_ABEMPTY, IPATH_ABSOLUTE, IPATH_NOSCHEME, IPATH_ROOTLESS, PATH_EMPTY +) + +IREGULAR_NAME_RE = IREG_NAME = u'(?:{0}|[{1}])*'.format( + u'%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + IUNRESERVED_RE +) + +IHOST_RE = IHOST_PATTERN = u'({0}|{1}|{2})'.format( + IREG_NAME, + IPv4_RE, + IP_LITERAL_RE, +) + +IUSERINFO_RE = u'^(?:[' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':]|%s)+' % ( + PCT_ENCODED +) + +IFRAGMENT_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE + + u']|%s)*$' % PCT_ENCODED) +IQUERY_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE + + IPRIVATE + u']|%s)*$' % PCT_ENCODED) + +IRELATIVE_PART_RE = u'(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + IPATH_ABEMPTY, + IPATH_ABSOLUTE, + IPATH_NOSCHEME, + PATH_EMPTY, +) + +IHIER_PART_RE = u'(//%s%s|%s|%s|%s)' % ( + COMPONENT_PATTERN_DICT['authority'], + IPATH_ABEMPTY, + IPATH_ABSOLUTE, + IPATH_ROOTLESS, + PATH_EMPTY, +) diff --git a/libs/urllib3/packages/rfc3986/api.py b/libs/urllib3/packages/rfc3986/api.py new file mode 100644 index 000000000..ddc4a1cd2 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/api.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Module containing the simple and functional API for rfc3986. + +This module defines functions and provides access to the public attributes +and classes of rfc3986. +""" + +from .iri import IRIReference +from .parseresult import ParseResult +from .uri import URIReference + + +def uri_reference(uri, encoding='utf-8'): + """Parse a URI string into a URIReference. + + This is a convenience function. You could achieve the same end by using + ``URIReference.from_string(uri)``. + + :param str uri: The URI which needs to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: A parsed URI + :rtype: :class:`URIReference` + """ + return URIReference.from_string(uri, encoding) + + +def iri_reference(iri, encoding='utf-8'): + """Parse a IRI string into an IRIReference. + + This is a convenience function. You could achieve the same end by using + ``IRIReference.from_string(iri)``. + + :param str iri: The IRI which needs to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: A parsed IRI + :rtype: :class:`IRIReference` + """ + return IRIReference.from_string(iri, encoding) + + +def is_valid_uri(uri, encoding='utf-8', **kwargs): + """Determine if the URI given is valid. + + This is a convenience function. You could use either + ``uri_reference(uri).is_valid()`` or + ``URIReference.from_string(uri).is_valid()`` to achieve the same result. + + :param str uri: The URI to be validated. + :param str encoding: The encoding of the string provided + :param bool require_scheme: Set to ``True`` if you wish to require the + presence of the scheme component. + :param bool require_authority: Set to ``True`` if you wish to require the + presence of the authority component. + :param bool require_path: Set to ``True`` if you wish to require the + presence of the path component. + :param bool require_query: Set to ``True`` if you wish to require the + presence of the query component. + :param bool require_fragment: Set to ``True`` if you wish to require the + presence of the fragment component. + :returns: ``True`` if the URI is valid, ``False`` otherwise. + :rtype: bool + """ + return URIReference.from_string(uri, encoding).is_valid(**kwargs) + + +def normalize_uri(uri, encoding='utf-8'): + """Normalize the given URI. + + This is a convenience function. You could use either + ``uri_reference(uri).normalize().unsplit()`` or + ``URIReference.from_string(uri).normalize().unsplit()`` instead. + + :param str uri: The URI to be normalized. + :param str encoding: The encoding of the string provided + :returns: The normalized URI. + :rtype: str + """ + normalized_reference = URIReference.from_string(uri, encoding).normalize() + return normalized_reference.unsplit() + + +def urlparse(uri, encoding='utf-8'): + """Parse a given URI and return a ParseResult. + + This is a partial replacement of the standard library's urlparse function. + + :param str uri: The URI to be parsed. + :param str encoding: The encoding of the string provided. 
+ :returns: A parsed URI + :rtype: :class:`~rfc3986.parseresult.ParseResult` + """ + return ParseResult.from_string(uri, encoding, strict=False) diff --git a/libs/urllib3/packages/rfc3986/builder.py b/libs/urllib3/packages/rfc3986/builder.py new file mode 100644 index 000000000..793427999 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/builder.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing the logic for the URIBuilder object.""" +from . import compat +from . import normalizers +from . import uri + + +class URIBuilder(object): + """Object to aid in building up a URI Reference from parts. + + .. note:: + + This object should be instantiated by the user, but it's recommended + that it is not provided with arguments. Instead, use the available + method to populate the fields. + + """ + + def __init__(self, scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment=None): + """Initialize our URI builder. + + :param str scheme: + (optional) + :param str userinfo: + (optional) + :param str host: + (optional) + :param int port: + (optional) + :param str path: + (optional) + :param str query: + (optional) + :param str fragment: + (optional) + """ + self.scheme = scheme + self.userinfo = userinfo + self.host = host + self.port = port + self.path = path + self.query = query + self.fragment = fragment + + def __repr__(self): + """Provide a convenient view of our builder object.""" + formatstr = ('URIBuilder(scheme={b.scheme}, userinfo={b.userinfo}, ' + 'host={b.host}, port={b.port}, path={b.path}, ' + 'query={b.query}, fragment={b.fragment})') + return formatstr.format(b=self) + + def add_scheme(self, scheme): + """Add a scheme to our builder object. + + After normalizing, this will generate a new URIBuilder instance with + the specified scheme and all other attributes the same. + + .. code-block:: python + + >>> URIBuilder().add_scheme('HTTPS') + URIBuilder(scheme='https', userinfo=None, host=None, port=None, + path=None, query=None, fragment=None) + + """ + scheme = normalizers.normalize_scheme(scheme) + return URIBuilder( + scheme=scheme, + userinfo=self.userinfo, + host=self.host, + port=self.port, + path=self.path, + query=self.query, + fragment=self.fragment, + ) + + def add_credentials(self, username, password): + """Add credentials as the userinfo portion of the URI. + + .. 
code-block:: python + + >>> URIBuilder().add_credentials('root', 's3crete') + URIBuilder(scheme=None, userinfo='root:s3crete', host=None, + port=None, path=None, query=None, fragment=None) + + >>> URIBuilder().add_credentials('root', None) + URIBuilder(scheme=None, userinfo='root', host=None, + port=None, path=None, query=None, fragment=None) + """ + if username is None: + raise ValueError('Username cannot be None') + userinfo = normalizers.normalize_username(username) + + if password is not None: + userinfo = '{}:{}'.format( + userinfo, + normalizers.normalize_password(password), + ) + + return URIBuilder( + scheme=self.scheme, + userinfo=userinfo, + host=self.host, + port=self.port, + path=self.path, + query=self.query, + fragment=self.fragment, + ) + + def add_host(self, host): + """Add hostname to the URI. + + .. code-block:: python + + >>> URIBuilder().add_host('google.com') + URIBuilder(scheme=None, userinfo=None, host='google.com', + port=None, path=None, query=None, fragment=None) + + """ + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=normalizers.normalize_host(host), + port=self.port, + path=self.path, + query=self.query, + fragment=self.fragment, + ) + + def add_port(self, port): + """Add port to the URI. + + .. code-block:: python + + >>> URIBuilder().add_port(80) + URIBuilder(scheme=None, userinfo=None, host=None, port='80', + path=None, query=None, fragment=None) + + >>> URIBuilder().add_port(443) + URIBuilder(scheme=None, userinfo=None, host=None, port='443', + path=None, query=None, fragment=None) + + """ + port_int = int(port) + if port_int < 0: + raise ValueError( + 'ports are not allowed to be negative. You provided {}'.format( + port_int, + ) + ) + if port_int > 65535: + raise ValueError( + 'ports are not allowed to be larger than 65535. ' + 'You provided {}'.format( + port_int, + ) + ) + + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=self.host, + port='{}'.format(port_int), + path=self.path, + query=self.query, + fragment=self.fragment, + ) + + def add_path(self, path): + """Add a path to the URI. + + .. code-block:: python + + >>> URIBuilder().add_path('sigmavirus24/rfc3985') + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path='/sigmavirus24/rfc3986', query=None, fragment=None) + + >>> URIBuilder().add_path('/checkout.php') + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path='/checkout.php', query=None, fragment=None) + + """ + if not path.startswith('/'): + path = '/{}'.format(path) + + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=self.host, + port=self.port, + path=normalizers.normalize_path(path), + query=self.query, + fragment=self.fragment, + ) + + def add_query_from(self, query_items): + """Generate and add a query a dictionary or list of tuples. + + .. code-block:: python + + >>> URIBuilder().add_query_from({'a': 'b c'}) + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path=None, query='a=b+c', fragment=None) + + >>> URIBuilder().add_query_from([('a', 'b c')]) + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path=None, query='a=b+c', fragment=None) + + """ + query = normalizers.normalize_query(compat.urlencode(query_items)) + + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=self.host, + port=self.port, + path=self.path, + query=query, + fragment=self.fragment, + ) + + def add_query(self, query): + """Add a pre-formated query string to the URI. + + .. 
code-block:: python + + >>> URIBuilder().add_query('a=b&c=d') + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path=None, query='a=b&c=d', fragment=None) + + """ + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=self.host, + port=self.port, + path=self.path, + query=normalizers.normalize_query(query), + fragment=self.fragment, + ) + + def add_fragment(self, fragment): + """Add a fragment to the URI. + + .. code-block:: python + + >>> URIBuilder().add_fragment('section-2.6.1') + URIBuilder(scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment='section-2.6.1') + + """ + return URIBuilder( + scheme=self.scheme, + userinfo=self.userinfo, + host=self.host, + port=self.port, + path=self.path, + query=self.query, + fragment=normalizers.normalize_fragment(fragment), + ) + + def finalize(self): + """Create a URIReference from our builder. + + .. code-block:: python + + >>> URIBuilder().add_scheme('https').add_host('github.com' + ... ).add_path('sigmavirus24/rfc3986').finalize().unsplit() + 'https://github.com/sigmavirus24/rfc3986' + + >>> URIBuilder().add_scheme('https').add_host('github.com' + ... ).add_path('sigmavirus24/rfc3986').add_credentials( + ... 'sigmavirus24', 'not-re@l').finalize().unsplit() + 'https://sigmavirus24:not-re%[email protected]/sigmavirus24/rfc3986' + + """ + return uri.URIReference( + self.scheme, + normalizers.normalize_authority( + (self.userinfo, self.host, self.port) + ), + self.path, + self.query, + self.fragment, + ) diff --git a/libs/urllib3/packages/rfc3986/compat.py b/libs/urllib3/packages/rfc3986/compat.py new file mode 100644 index 000000000..8968c3843 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/compat.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
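An editorial sketch of the helpers defined in the module below (not part of
this patch; the import path assumes the vendored location in this tree and
the byte values assume UTF-8)::

    from urllib3.packages.rfc3986 import compat

    compat.to_str(b'caf\xc3\xa9')    # -> u'caf\xe9'
    compat.to_bytes(u'caf\xe9')      # -> b'caf\xc3\xa9'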
+"""Compatibility module for Python 2 and 3 support.""" +import sys + +try: + from urllib.parse import quote as urlquote +except ImportError: # Python 2.x + from urllib import quote as urlquote + +try: + from urllib.parse import urlencode +except ImportError: # Python 2.x + from urllib import urlencode + +__all__ = ( + 'to_bytes', + 'to_str', + 'urlquote', + 'urlencode', +) + +PY3 = (3, 0) <= sys.version_info < (4, 0) +PY2 = (2, 6) <= sys.version_info < (2, 8) + + +if PY3: + unicode = str # Python 3.x + + +def to_str(b, encoding='utf-8'): + """Ensure that b is text in the specified encoding.""" + if hasattr(b, 'decode') and not isinstance(b, unicode): + b = b.decode(encoding) + return b + + +def to_bytes(s, encoding='utf-8'): + """Ensure that s is converted to bytes from the encoding.""" + if hasattr(s, 'encode') and not isinstance(s, bytes): + s = s.encode(encoding) + return s diff --git a/libs/urllib3/packages/rfc3986/exceptions.py b/libs/urllib3/packages/rfc3986/exceptions.py new file mode 100644 index 000000000..da8ca7cb1 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/exceptions.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +"""Exceptions module for rfc3986.""" + +from . import compat + + +class RFC3986Exception(Exception): + """Base class for all rfc3986 exception classes.""" + + pass + + +class InvalidAuthority(RFC3986Exception): + """Exception when the authority string is invalid.""" + + def __init__(self, authority): + """Initialize the exception with the invalid authority.""" + super(InvalidAuthority, self).__init__( + u"The authority ({0}) is not valid.".format( + compat.to_str(authority))) + + +class InvalidPort(RFC3986Exception): + """Exception when the port is invalid.""" + + def __init__(self, port): + """Initialize the exception with the invalid port.""" + super(InvalidPort, self).__init__( + 'The port ("{0}") is not valid.'.format(port)) + + +class ResolutionError(RFC3986Exception): + """Exception to indicate a failure to resolve a URI.""" + + def __init__(self, uri): + """Initialize the error with the failed URI.""" + super(ResolutionError, self).__init__( + "{0} is not an absolute URI.".format(uri.unsplit())) + + +class ValidationError(RFC3986Exception): + """Exception raised during Validation of a URI.""" + + pass + + +class MissingComponentError(ValidationError): + """Exception raised when a required component is missing.""" + + def __init__(self, uri, *component_names): + """Initialize the error with the missing component name.""" + verb = 'was' + if len(component_names) > 1: + verb = 'were' + + self.uri = uri + self.components = sorted(component_names) + components = ', '.join(self.components) + super(MissingComponentError, self).__init__( + "{} {} required but missing".format(components, verb), + uri, + self.components, + ) + + +class UnpermittedComponentError(ValidationError): + """Exception raised when a component has an unpermitted value.""" + + def __init__(self, component_name, component_value, allowed_values): + """Initialize the error with the unpermitted component.""" + super(UnpermittedComponentError, self).__init__( + "{} was required to be one of {!r} but was {!r}".format( + component_name, list(sorted(allowed_values)), component_value, + ), + component_name, + component_value, + allowed_values, + ) + self.component_name = component_name + self.component_value = component_value + self.allowed_values = allowed_values + + +class PasswordForbidden(ValidationError): + """Exception raised when a URL has a password in the userinfo section.""" + + def 
__init__(self, uri): + """Initialize the error with the URI that failed validation.""" + unsplit = getattr(uri, 'unsplit', lambda: uri) + super(PasswordForbidden, self).__init__( + '"{}" contained a password when validation forbade it'.format( + unsplit() + ) + ) + self.uri = uri + + +class InvalidComponentsError(ValidationError): + """Exception raised when one or more components are invalid.""" + + def __init__(self, uri, *component_names): + """Initialize the error with the invalid component name(s).""" + verb = 'was' + if len(component_names) > 1: + verb = 'were' + + self.uri = uri + self.components = sorted(component_names) + components = ', '.join(self.components) + super(InvalidComponentsError, self).__init__( + "{} {} found to be invalid".format(components, verb), + uri, + self.components, + ) + + +class MissingDependencyError(RFC3986Exception): + """Exception raised when an IRI is encoded without the 'idna' module.""" diff --git a/libs/urllib3/packages/rfc3986/iri.py b/libs/urllib3/packages/rfc3986/iri.py new file mode 100644 index 000000000..9c01fe1cd --- /dev/null +++ b/libs/urllib3/packages/rfc3986/iri.py @@ -0,0 +1,147 @@ +"""Module containing the implementation of the IRIReference class.""" +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Copyright (c) 2015 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import namedtuple + +from . import compat +from . import exceptions +from . import misc +from . import normalizers +from . import uri + + +try: + import idna +except ImportError: # pragma: no cover + idna = None + + +class IRIReference(namedtuple('IRIReference', misc.URI_COMPONENTS), + uri.URIMixin): + """Immutable object representing a parsed IRI Reference. + + Can be encoded into an URIReference object via the procedure + specified in RFC 3987 Section 3.1 + + .. note:: + The IRI submodule is a new interface and may possibly change in + the future. Check for changes to the interface when upgrading. + """ + + slots = () + + def __new__(cls, scheme, authority, path, query, fragment, + encoding='utf-8'): + """Create a new IRIReference.""" + ref = super(IRIReference, cls).__new__( + cls, + scheme or None, + authority or None, + path or None, + query, + fragment) + ref.encoding = encoding + return ref + + def __eq__(self, other): + """Compare this reference to another.""" + other_ref = other + if isinstance(other, tuple): + other_ref = self.__class__(*other) + elif not isinstance(other, IRIReference): + try: + other_ref = self.__class__.from_string(other) + except TypeError: + raise TypeError( + 'Unable to compare {0}() to {1}()'.format( + type(self).__name__, type(other).__name__)) + + # See http://tools.ietf.org/html/rfc3986#section-6.2 + return tuple(self) == tuple(other_ref) + + def _match_subauthority(self): + return misc.ISUBAUTHORITY_MATCHER.match(self.authority) + + @classmethod + def from_string(cls, iri_string, encoding='utf-8'): + """Parse a IRI reference from the given unicode IRI string. 
+ + :param str iri_string: Unicode IRI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: :class:`IRIReference` or subclass thereof + """ + iri_string = compat.to_str(iri_string, encoding) + + split_iri = misc.IRI_MATCHER.match(iri_string).groupdict() + return cls( + split_iri['scheme'], split_iri['authority'], + normalizers.encode_component(split_iri['path'], encoding), + normalizers.encode_component(split_iri['query'], encoding), + normalizers.encode_component(split_iri['fragment'], encoding), + encoding, + ) + + def encode(self, idna_encoder=None): # noqa: C901 + """Encode an IRIReference into a URIReference instance. + + If the ``idna`` module is installed or the ``rfc3986[idna]`` + extra is used then unicode characters in the IRI host + component will be encoded with IDNA2008. + + :param idna_encoder: + Function that encodes each part of the host component + If not given will raise an exception if the IRI + contains a host component. + :rtype: uri.URIReference + :returns: A URI reference + """ + authority = self.authority + if authority: + if idna_encoder is None: + if idna is None: # pragma: no cover + raise exceptions.MissingDependencyError( + "Could not import the 'idna' module " + "and the IRI hostname requires encoding" + ) + + def idna_encoder(name): + if any(ord(c) > 128 for c in name): + try: + return idna.encode(name.lower(), + strict=True, + std3_rules=True) + except idna.IDNAError: + raise exceptions.InvalidAuthority(self.authority) + return name + + authority = "" + if self.host: + authority = ".".join([compat.to_str(idna_encoder(part)) + for part in self.host.split(".")]) + + if self.userinfo is not None: + authority = (normalizers.encode_component( + self.userinfo, self.encoding) + '@' + authority) + + if self.port is not None: + authority += ":" + str(self.port) + + return uri.URIReference(self.scheme, + authority, + path=self.path, + query=self.query, + fragment=self.fragment, + encoding=self.encoding) diff --git a/libs/urllib3/packages/rfc3986/misc.py b/libs/urllib3/packages/rfc3986/misc.py new file mode 100644 index 000000000..b735e0440 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/misc.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Module containing compiled regular expressions and constants. + +This module contains important constants, patterns, and compiled regular +expressions for parsing and validating URIs and their components. +""" + +import re + +from . 
import abnf_regexp + +# These are enumerated for the named tuple used as a superclass of +# URIReference +URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment'] + +important_characters = { + 'generic_delimiters': abnf_regexp.GENERIC_DELIMITERS, + 'sub_delimiters': abnf_regexp.SUB_DELIMITERS, + # We need to escape the '*' in this case + 're_sub_delimiters': abnf_regexp.SUB_DELIMITERS_RE, + 'unreserved_chars': abnf_regexp.UNRESERVED_CHARS, + # We need to escape the '-' in this case: + 're_unreserved': abnf_regexp.UNRESERVED_RE, +} + +# For details about delimiters and reserved characters, see: +# http://tools.ietf.org/html/rfc3986#section-2.2 +GENERIC_DELIMITERS = abnf_regexp.GENERIC_DELIMITERS_SET +SUB_DELIMITERS = abnf_regexp.SUB_DELIMITERS_SET +RESERVED_CHARS = abnf_regexp.RESERVED_CHARS_SET +# For details about unreserved characters, see: +# http://tools.ietf.org/html/rfc3986#section-2.3 +UNRESERVED_CHARS = abnf_regexp.UNRESERVED_CHARS_SET +NON_PCT_ENCODED = abnf_regexp.NON_PCT_ENCODED_SET + +URI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE) + +SUBAUTHORITY_MATCHER = re.compile(( + '^(?:(?P<userinfo>{0})@)?' # userinfo + '(?P<host>{1})' # host + ':?(?P<port>{2})?$' # port + ).format(abnf_regexp.USERINFO_RE, + abnf_regexp.HOST_PATTERN, + abnf_regexp.PORT_RE)) + + +HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$') +IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$') +IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$') + +# Used by host validator +IPv6_NO_RFC4007_MATCHER = re.compile(r'^\[%s\]$' % ( + abnf_regexp.IPv6_ADDRZ_RE +)) + +# Matcher used to validate path components +PATH_MATCHER = re.compile(abnf_regexp.PATH_RE) + + +# ################################## +# Query and Fragment Matcher Section +# ################################## + +QUERY_MATCHER = re.compile(abnf_regexp.QUERY_RE) + +FRAGMENT_MATCHER = QUERY_MATCHER + +# Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1 +SCHEME_MATCHER = re.compile('^{0}$'.format(abnf_regexp.SCHEME_RE)) + +RELATIVE_REF_MATCHER = re.compile(r'^%s(\?%s)?(#%s)?$' % ( + abnf_regexp.RELATIVE_PART_RE, + abnf_regexp.QUERY_RE, + abnf_regexp.FRAGMENT_RE, +)) + +# See http://tools.ietf.org/html/rfc3986#section-4.3 +ABSOLUTE_URI_MATCHER = re.compile(r'^%s:%s(\?%s)?$' % ( + abnf_regexp.COMPONENT_PATTERN_DICT['scheme'], + abnf_regexp.HIER_PART_RE, + abnf_regexp.QUERY_RE[1:-1], +)) + +# ############### +# IRIs / RFC 3987 +# ############### + +IRI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE, re.UNICODE) + +ISUBAUTHORITY_MATCHER = re.compile(( + u'^(?:(?P<userinfo>{0})@)?' 
# iuserinfo + u'(?P<host>{1})' # ihost + u':?(?P<port>{2})?$' # port + ).format(abnf_regexp.IUSERINFO_RE, + abnf_regexp.IHOST_RE, + abnf_regexp.PORT_RE), re.UNICODE) + + +# Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3 +def merge_paths(base_uri, relative_path): + """Merge a base URI's path with a relative URI's path.""" + if base_uri.path is None and base_uri.authority is not None: + return '/' + relative_path + else: + path = base_uri.path or '' + index = path.rfind('/') + return path[:index] + '/' + relative_path + + +UseExisting = object() diff --git a/libs/urllib3/packages/rfc3986/normalizers.py b/libs/urllib3/packages/rfc3986/normalizers.py new file mode 100644 index 000000000..2eb1bb36f --- /dev/null +++ b/libs/urllib3/packages/rfc3986/normalizers.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module with functions to normalize components.""" +import re + +from . import compat +from . import misc + + +def normalize_scheme(scheme): + """Normalize the scheme component.""" + return scheme.lower() + + +def normalize_authority(authority): + """Normalize an authority tuple to a string.""" + userinfo, host, port = authority + result = '' + if userinfo: + result += normalize_percent_characters(userinfo) + '@' + if host: + result += normalize_host(host) + if port: + result += ':' + port + return result + + +def normalize_username(username): + """Normalize a username to make it safe to include in userinfo.""" + return compat.urlquote(username) + + +def normalize_password(password): + """Normalize a password to make safe for userinfo.""" + return compat.urlquote(password) + + +def normalize_host(host): + """Normalize a host string.""" + if misc.IPv6_MATCHER.match(host): + percent = host.find('%') + if percent != -1: + percent_25 = host.find('%25') + + # Replace RFC 4007 IPv6 Zone ID delimiter '%' with '%25' + # from RFC 6874. If the host is '[<IPv6 addr>%25]' then we + # assume RFC 4007 and normalize to '[<IPV6 addr>%2525]' + if percent_25 == -1 or percent < percent_25 or \ + (percent == percent_25 and percent_25 == len(host) - 4): + host = host.replace('%', '%25', 1) + + # Don't normalize the casing of the Zone ID + return host[:percent].lower() + host[percent:] + + return host.lower() + + +def normalize_path(path): + """Normalize the path string.""" + if not path: + return path + + path = normalize_percent_characters(path) + return remove_dot_segments(path) + + +def normalize_query(query): + """Normalize the query string.""" + if not query: + return query + return normalize_percent_characters(query) + + +def normalize_fragment(fragment): + """Normalize the fragment string.""" + if not fragment: + return fragment + return normalize_percent_characters(fragment) + + +PERCENT_MATCHER = re.compile('%[A-Fa-f0-9]{2}') + + +def normalize_percent_characters(s): + """All percent characters should be upper-cased. + + For example, ``"%3afoo%DF%ab"`` should be turned into ``"%3Afoo%DF%AB"``. 
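+ Upper-casing is safe because the hex digits in a percent-encoding are case-insensitive; normalizing their case gives each encoding one canonical spelling for comparison.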
+ """ + matches = set(PERCENT_MATCHER.findall(s)) + for m in matches: + if not m.isupper(): + s = s.replace(m, m.upper()) + return s + + +def remove_dot_segments(s): + """Remove dot segments from the string. + + See also Section 5.2.4 of :rfc:`3986`. + """ + # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code + segments = s.split('/') # Turn the path into a list of segments + output = [] # Initialize the variable to use to store output + + for segment in segments: + # '.' is the current directory, so ignore it, it is superfluous + if segment == '.': + continue + # Anything other than '..', should be appended to the output + elif segment != '..': + output.append(segment) + # In this case segment == '..', if we can, we should pop the last + # element + elif output: + output.pop() + + # If the path starts with '/' and the output is empty or the first string + # is non-empty + if s.startswith('/') and (not output or output[0]): + output.insert(0, '') + + # If the path starts with '/.' or '/..' ensure we add one more empty + # string to add a trailing '/' + if s.endswith(('/.', '/..')): + output.append('') + + return '/'.join(output) + + +def encode_component(uri_component, encoding): + """Encode the specific component in the provided encoding.""" + if uri_component is None: + return uri_component + + # Try to see if the component we're encoding is already percent-encoded + # so we can skip all '%' characters but still encode all others. + percent_encodings = len(PERCENT_MATCHER.findall( + compat.to_str(uri_component, encoding))) + + uri_bytes = compat.to_bytes(uri_component, encoding) + is_percent_encoded = percent_encodings == uri_bytes.count(b'%') + + encoded_uri = bytearray() + + for i in range(0, len(uri_bytes)): + # Will return a single character bytestring on both Python 2 & 3 + byte = uri_bytes[i:i+1] + byte_ord = ord(byte) + if ((is_percent_encoded and byte == b'%') + or (byte_ord < 128 and byte.decode() in misc.NON_PCT_ENCODED)): + encoded_uri.extend(byte) + continue + encoded_uri.extend('%{0:02x}'.format(byte_ord).encode().upper()) + + return encoded_uri.decode(encoding) diff --git a/libs/urllib3/packages/rfc3986/parseresult.py b/libs/urllib3/packages/rfc3986/parseresult.py new file mode 100644 index 000000000..0a7345669 --- /dev/null +++ b/libs/urllib3/packages/rfc3986/parseresult.py @@ -0,0 +1,385 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2015 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing the urlparse compatibility logic.""" +from collections import namedtuple + +from . import compat +from . import exceptions +from . import misc +from . import normalizers +from . import uri + +__all__ = ('ParseResult', 'ParseResultBytes') + +PARSED_COMPONENTS = ('scheme', 'userinfo', 'host', 'port', 'path', 'query', + 'fragment') + + +class ParseResultMixin(object): + def _generate_authority(self, attributes): + # I swear I did not align the comparisons below. 
That's just how they + # happened to align based on pep8 and attribute lengths. + userinfo, host, port = (attributes[p] + for p in ('userinfo', 'host', 'port')) + if (self.userinfo != userinfo or + self.host != host or + self.port != port): + if port: + port = '{0}'.format(port) + return normalizers.normalize_authority( + (compat.to_str(userinfo, self.encoding), + compat.to_str(host, self.encoding), + port) + ) + return self.authority + + def geturl(self): + """Shim to match the standard library method.""" + return self.unsplit() + + @property + def hostname(self): + """Shim to match the standard library.""" + return self.host + + @property + def netloc(self): + """Shim to match the standard library.""" + return self.authority + + @property + def params(self): + """Shim to match the standard library.""" + return self.query + + +class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS), + ParseResultMixin): + """Implementation of urlparse compatibility class. + + This uses the URIReference logic to handle compatibility with the + urlparse.ParseResult class. + """ + + slots = () + + def __new__(cls, scheme, userinfo, host, port, path, query, fragment, + uri_ref, encoding='utf-8'): + """Create a new ParseResult.""" + parse_result = super(ParseResult, cls).__new__( + cls, + scheme or None, + userinfo or None, + host, + port or None, + path or None, + query, + fragment) + parse_result.encoding = encoding + parse_result.reference = uri_ref + return parse_result + + @classmethod + def from_parts(cls, scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment=None, encoding='utf-8'): + """Create a ParseResult instance from its parts.""" + authority = '' + if userinfo is not None: + authority += userinfo + '@' + if host is not None: + authority += host + if port is not None: + authority += ':{0}'.format(port) + uri_ref = uri.URIReference(scheme=scheme, + authority=authority, + path=path, + query=query, + fragment=fragment, + encoding=encoding).normalize() + userinfo, host, port = authority_from(uri_ref, strict=True) + return cls(scheme=uri_ref.scheme, + userinfo=userinfo, + host=host, + port=port, + path=uri_ref.path, + query=uri_ref.query, + fragment=uri_ref.fragment, + uri_ref=uri_ref, + encoding=encoding) + + @classmethod + def from_string(cls, uri_string, encoding='utf-8', strict=True, + lazy_normalize=True): + """Parse a URI from the given unicode URI string. + + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :param bool strict: Parse strictly according to :rfc:`3986` if True. + If False, parse similarly to the standard library's urlparse + function. 
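+ :param bool lazy_normalize: If True (the default), skip normalizing the parsed reference until it is explicitly requested.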
+ :returns: :class:`ParseResult` or subclass thereof + """ + reference = uri.URIReference.from_string(uri_string, encoding) + if not lazy_normalize: + reference = reference.normalize() + userinfo, host, port = authority_from(reference, strict) + + return cls(scheme=reference.scheme, + userinfo=userinfo, + host=host, + port=port, + path=reference.path, + query=reference.query, + fragment=reference.fragment, + uri_ref=reference, + encoding=encoding) + + @property + def authority(self): + """Return the normalized authority.""" + return self.reference.authority + + def copy_with(self, scheme=misc.UseExisting, userinfo=misc.UseExisting, + host=misc.UseExisting, port=misc.UseExisting, + path=misc.UseExisting, query=misc.UseExisting, + fragment=misc.UseExisting): + """Create a copy of this instance replacing with specified parts.""" + attributes = zip(PARSED_COMPONENTS, + (scheme, userinfo, host, port, path, query, fragment)) + attrs_dict = {} + for name, value in attributes: + if value is misc.UseExisting: + value = getattr(self, name) + attrs_dict[name] = value + authority = self._generate_authority(attrs_dict) + ref = self.reference.copy_with(scheme=attrs_dict['scheme'], + authority=authority, + path=attrs_dict['path'], + query=attrs_dict['query'], + fragment=attrs_dict['fragment']) + return ParseResult(uri_ref=ref, encoding=self.encoding, **attrs_dict) + + def encode(self, encoding=None): + """Convert to an instance of ParseResultBytes.""" + encoding = encoding or self.encoding + attrs = dict( + zip(PARSED_COMPONENTS, + (attr.encode(encoding) if hasattr(attr, 'encode') else attr + for attr in self))) + return ParseResultBytes( + uri_ref=self.reference, + encoding=encoding, + **attrs + ) + + def unsplit(self, use_idna=False): + """Create a URI string from the components. + + :returns: The parsed URI reconstituted as a string. 
+ :rtype: str + """ + parse_result = self + if use_idna and self.host: + hostbytes = self.host.encode('idna') + host = hostbytes.decode(self.encoding) + parse_result = self.copy_with(host=host) + return parse_result.reference.unsplit() + + +class ParseResultBytes(namedtuple('ParseResultBytes', PARSED_COMPONENTS), + ParseResultMixin): + """Compatibility shim for the urlparse.ParseResultBytes object.""" + + def __new__(cls, scheme, userinfo, host, port, path, query, fragment, + uri_ref, encoding='utf-8', lazy_normalize=True): + """Create a new ParseResultBytes instance.""" + parse_result = super(ParseResultBytes, cls).__new__( + cls, + scheme or None, + userinfo or None, + host, + port or None, + path or None, + query or None, + fragment or None) + parse_result.encoding = encoding + parse_result.reference = uri_ref + parse_result.lazy_normalize = lazy_normalize + return parse_result + + @classmethod + def from_parts(cls, scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment=None, encoding='utf-8', + lazy_normalize=True): + """Create a ParseResult instance from its parts.""" + authority = '' + if userinfo is not None: + authority += userinfo + '@' + if host is not None: + authority += host + if port is not None: + authority += ':{0}'.format(int(port)) + uri_ref = uri.URIReference(scheme=scheme, + authority=authority, + path=path, + query=query, + fragment=fragment, + encoding=encoding) + if not lazy_normalize: + uri_ref = uri_ref.normalize() + to_bytes = compat.to_bytes + userinfo, host, port = authority_from(uri_ref, strict=True) + return cls(scheme=to_bytes(scheme, encoding), + userinfo=to_bytes(userinfo, encoding), + host=to_bytes(host, encoding), + port=port, + path=to_bytes(path, encoding), + query=to_bytes(query, encoding), + fragment=to_bytes(fragment, encoding), + uri_ref=uri_ref, + encoding=encoding, + lazy_normalize=lazy_normalize) + + @classmethod + def from_string(cls, uri_string, encoding='utf-8', strict=True, + lazy_normalize=True): + """Parse a URI from the given unicode URI string. + + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :param bool strict: Parse strictly according to :rfc:`3986` if True. + If False, parse similarly to the standard library's urlparse + function. 
+ :returns: :class:`ParseResultBytes` or subclass thereof + """ + reference = uri.URIReference.from_string(uri_string, encoding) + if not lazy_normalize: + reference = reference.normalize() + userinfo, host, port = authority_from(reference, strict) + + to_bytes = compat.to_bytes + return cls(scheme=to_bytes(reference.scheme, encoding), + userinfo=to_bytes(userinfo, encoding), + host=to_bytes(host, encoding), + port=port, + path=to_bytes(reference.path, encoding), + query=to_bytes(reference.query, encoding), + fragment=to_bytes(reference.fragment, encoding), + uri_ref=reference, + encoding=encoding, + lazy_normalize=lazy_normalize) + + @property + def authority(self): + """Return the normalized authority.""" + return self.reference.authority.encode(self.encoding) + + def copy_with(self, scheme=misc.UseExisting, userinfo=misc.UseExisting, + host=misc.UseExisting, port=misc.UseExisting, + path=misc.UseExisting, query=misc.UseExisting, + fragment=misc.UseExisting, lazy_normalize=True): + """Create a copy of this instance replacing with specified parts.""" + attributes = zip(PARSED_COMPONENTS, + (scheme, userinfo, host, port, path, query, fragment)) + attrs_dict = {} + for name, value in attributes: + if value is misc.UseExisting: + value = getattr(self, name) + if not isinstance(value, bytes) and hasattr(value, 'encode'): + value = value.encode(self.encoding) + attrs_dict[name] = value + authority = self._generate_authority(attrs_dict) + to_str = compat.to_str + ref = self.reference.copy_with( + scheme=to_str(attrs_dict['scheme'], self.encoding), + authority=to_str(authority, self.encoding), + path=to_str(attrs_dict['path'], self.encoding), + query=to_str(attrs_dict['query'], self.encoding), + fragment=to_str(attrs_dict['fragment'], self.encoding) + ) + if not lazy_normalize: + ref = ref.normalize() + return ParseResultBytes( + uri_ref=ref, + encoding=self.encoding, + lazy_normalize=lazy_normalize, + **attrs_dict + ) + + def unsplit(self, use_idna=False): + """Create a URI bytes object from the components. + + :returns: The parsed URI reconstituted as a string. 
+ :rtype: bytes + """ + parse_result = self + if use_idna and self.host: + # self.host is bytes, to encode to idna, we need to decode it + # first + host = self.host.decode(self.encoding) + hostbytes = host.encode('idna') + parse_result = self.copy_with(host=hostbytes) + if self.lazy_normalize: + parse_result = parse_result.copy_with(lazy_normalize=False) + uri = parse_result.reference.unsplit() + return uri.encode(self.encoding) + + +def split_authority(authority): + # Initialize our expected return values + userinfo = host = port = None + # Initialize an extra var we may need to use + extra_host = None + # Set-up rest in case there is no userinfo portion + rest = authority + + if '@' in authority: + userinfo, rest = authority.rsplit('@', 1) + + # Handle IPv6 host addresses + if rest.startswith('['): + host, rest = rest.split(']', 1) + host += ']' + + if ':' in rest: + extra_host, port = rest.split(':', 1) + elif not host and rest: + host = rest + + if extra_host and not host: + host = extra_host + + return userinfo, host, port + + +def authority_from(reference, strict): + try: + subauthority = reference.authority_info() + except exceptions.InvalidAuthority: + if strict: + raise + userinfo, host, port = split_authority(reference.authority) + else: + # Thanks to Richard Barrell for this idea: + # https://twitter.com/0x2ba22e11/status/617338811975139328 + userinfo, host, port = (subauthority.get(p) + for p in ('userinfo', 'host', 'port')) + + if port: + try: + port = int(port) + except ValueError: + raise exceptions.InvalidPort(port) + return userinfo, host, port diff --git a/libs/urllib3/packages/rfc3986/uri.py b/libs/urllib3/packages/rfc3986/uri.py new file mode 100644 index 000000000..d1d71505e --- /dev/null +++ b/libs/urllib3/packages/rfc3986/uri.py @@ -0,0 +1,153 @@ +"""Module containing the implementation of the URIReference class.""" +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Copyright (c) 2015 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import namedtuple + +from . import compat +from . import misc +from . import normalizers +from ._mixin import URIMixin + + +class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS), URIMixin): + """Immutable object representing a parsed URI Reference. + + .. note:: + + This class is not intended to be directly instantiated by the user. + + This object exposes attributes for the following components of a + URI: + + - scheme + - authority + - path + - query + - fragment + + .. attribute:: scheme + + The scheme that was parsed for the URI Reference. For example, + ``http``, ``https``, ``smtp``, ``imap``, etc. + + .. attribute:: authority + + Component of the URI that contains the user information, host, + and port sub-components. For example, + ``google.com``, ``127.0.0.1:5000``, ``username@[::1]``, + ``username:[email protected]:443``, etc. + + .. attribute:: path + + The path that was parsed for the given URI Reference. For example, + ``/``, ``/index.php``, etc. + + .. 
attribute:: query + + The query component for a given URI Reference. For example, ``a=b``, + ``a=b%20c``, ``a=b+c``, ``a=b,c=d,e=%20f``, etc. + + .. attribute:: fragment + + The fragment component of a URI. For example, ``section-3.1``. + + This class also provides extra attributes for easier access to information + like the subcomponents of the authority component. + + .. attribute:: userinfo + + The user information parsed from the authority. + + .. attribute:: host + + The hostname, IPv4, or IPv6 address parsed from the authority. + + .. attribute:: port + + The port parsed from the authority. + """ + + slots = () + + def __new__(cls, scheme, authority, path, query, fragment, + encoding='utf-8'): + """Create a new URIReference.""" + ref = super(URIReference, cls).__new__( + cls, + scheme or None, + authority or None, + path or None, + query, + fragment) + ref.encoding = encoding + return ref + + __hash__ = tuple.__hash__ + + def __eq__(self, other): + """Compare this reference to another.""" + other_ref = other + if isinstance(other, tuple): + other_ref = URIReference(*other) + elif not isinstance(other, URIReference): + try: + other_ref = URIReference.from_string(other) + except TypeError: + raise TypeError( + 'Unable to compare URIReference() to {0}()'.format( + type(other).__name__)) + + # See http://tools.ietf.org/html/rfc3986#section-6.2 + naive_equality = tuple(self) == tuple(other_ref) + return naive_equality or self.normalized_equality(other_ref) + + def normalize(self): + """Normalize this reference as described in Section 6.2.2. + + This is not an in-place normalization. Instead this creates a new + URIReference. + + :returns: A new reference object with normalized components. + :rtype: URIReference + """ + # See http://tools.ietf.org/html/rfc3986#section-6.2.2 for logic in + # this method. + return URIReference(normalizers.normalize_scheme(self.scheme or ''), + normalizers.normalize_authority( + (self.userinfo, self.host, self.port)), + normalizers.normalize_path(self.path or ''), + normalizers.normalize_query(self.query), + normalizers.normalize_fragment(self.fragment), + self.encoding) + + @classmethod + def from_string(cls, uri_string, encoding='utf-8'): + """Parse a URI reference from the given unicode URI string. + + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :returns: :class:`URIReference` or subclass thereof + """ + uri_string = compat.to_str(uri_string, encoding) + + split_uri = misc.URI_MATCHER.match(uri_string).groupdict() + return cls( + split_uri['scheme'], split_uri['authority'], + normalizers.encode_component(split_uri['path'], encoding), + normalizers.encode_component(split_uri['query'], encoding), + normalizers.encode_component(split_uri['fragment'], encoding), + encoding, + ) diff --git a/libs/urllib3/packages/rfc3986/validators.py b/libs/urllib3/packages/rfc3986/validators.py new file mode 100644 index 000000000..7fc97215b --- --- /dev/null +++ b/libs/urllib3/packages/rfc3986/validators.py @@ -0,0 +1,450 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Ian Stapleton Cordasco +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module containing the validation logic for rfc3986.""" +from . import exceptions +from . import misc +from . import normalizers + + +class Validator(object): + """Object used to configure validation of all objects in rfc3986. + + .. versionadded:: 1.0 + + Example usage:: + + >>> from rfc3986 import api, validators + >>> uri = api.uri_reference('https://github.com/') + >>> validator = validators.Validator().require_presence_of( + ... 'scheme', 'host', 'path', + ... ).allow_schemes( + ... 'http', 'https', + ... ).allow_hosts( + ... '127.0.0.1', 'github.com', + ... ) + >>> validator.validate(uri) + >>> invalid_uri = rfc3986.uri_reference('imap://mail.google.com') + >>> validator.validate(invalid_uri) + Traceback (most recent call last): + ... + rfc3986.exceptions.MissingComponentError: ('path was required but + missing', URIReference(scheme=u'imap', authority=u'mail.google.com', + path=None, query=None, fragment=None), ['path']) + + """ + + COMPONENT_NAMES = frozenset([ + 'scheme', + 'userinfo', + 'host', + 'port', + 'path', + 'query', + 'fragment', + ]) + + def __init__(self): + """Initialize our default validations.""" + self.allowed_schemes = set() + self.allowed_hosts = set() + self.allowed_ports = set() + self.allow_password = True + self.required_components = { + 'scheme': False, + 'userinfo': False, + 'host': False, + 'port': False, + 'path': False, + 'query': False, + 'fragment': False, + } + self.validated_components = self.required_components.copy() + + def allow_schemes(self, *schemes): + """Require the scheme to be one of the provided schemes. + + .. versionadded:: 1.0 + + :param schemes: + Schemes, without ``://`` that are allowed. + :returns: + The validator instance. + :rtype: + Validator + """ + for scheme in schemes: + self.allowed_schemes.add(normalizers.normalize_scheme(scheme)) + return self + + def allow_hosts(self, *hosts): + """Require the host to be one of the provided hosts. + + .. versionadded:: 1.0 + + :param hosts: + Hosts that are allowed. + :returns: + The validator instance. + :rtype: + Validator + """ + for host in hosts: + self.allowed_hosts.add(normalizers.normalize_host(host)) + return self + + def allow_ports(self, *ports): + """Require the port to be one of the provided ports. + + .. versionadded:: 1.0 + + :param ports: + Ports that are allowed. + :returns: + The validator instance. + :rtype: + Validator + """ + for port in ports: + port_int = int(port, base=10) + if 0 <= port_int <= 65535: + self.allowed_ports.add(port) + return self + + def allow_use_of_password(self): + """Allow passwords to be present in the URI. + + .. versionadded:: 1.0 + + :returns: + The validator instance. + :rtype: + Validator + """ + self.allow_password = True + return self + + def forbid_use_of_password(self): + """Prevent passwords from being included in the URI. + + .. versionadded:: 1.0 + + :returns: + The validator instance. + :rtype: + Validator + """ + self.allow_password = False + return self + + def check_validity_of(self, *components): + """Check the validity of the components provided. + + This can be specified repeatedly. + + .. 
versionadded:: 1.1 + + :param components: + Names of components from :attr:`Validator.COMPONENT_NAMES`. + :returns: + The validator instance. + :rtype: + Validator + """ + components = [c.lower() for c in components] + for component in components: + if component not in self.COMPONENT_NAMES: + raise ValueError( + '"{}" is not a valid component'.format(component) + ) + self.validated_components.update({ + component: True for component in components + }) + return self + + def require_presence_of(self, *components): + """Require the components provided. + + This can be specified repeatedly. + + .. versionadded:: 1.0 + + :param components: + Names of components from :attr:`Validator.COMPONENT_NAMES`. + :returns: + The validator instance. + :rtype: + Validator + """ + components = [c.lower() for c in components] + for component in components: + if component not in self.COMPONENT_NAMES: + raise ValueError( + '"{}" is not a valid component'.format(component) + ) + self.required_components.update({ + component: True for component in components + }) + return self + + def validate(self, uri): + """Check a URI for conditions specified on this validator. + + .. versionadded:: 1.0 + + :param uri: + Parsed URI to validate. + :type uri: + rfc3986.uri.URIReference + :raises MissingComponentError: + When a required component is missing. + :raises UnpermittedComponentError: + When a component is not one of those allowed. + :raises PasswordForbidden: + When a password is present in the userinfo component but is + not permitted by configuration. + :raises InvalidComponentsError: + When a component was found to be invalid. + """ + if not self.allow_password: + check_password(uri) + + required_components = [ + component + for component, required in self.required_components.items() + if required + ] + validated_components = [ + component + for component, required in self.validated_components.items() + if required + ] + if required_components: + ensure_required_components_exist(uri, required_components) + if validated_components: + ensure_components_are_valid(uri, validated_components) + + ensure_one_of(self.allowed_schemes, uri, 'scheme') + ensure_one_of(self.allowed_hosts, uri, 'host') + ensure_one_of(self.allowed_ports, uri, 'port') + + +def check_password(uri): + """Assert that there is no password present in the uri.""" + userinfo = uri.userinfo + if not userinfo: + return + credentials = userinfo.split(':', 1) + if len(credentials) <= 1: + return + raise exceptions.PasswordForbidden(uri) + + +def ensure_one_of(allowed_values, uri, attribute): + """Assert that the uri's attribute is one of the allowed values.""" + value = getattr(uri, attribute) + if value is not None and allowed_values and value not in allowed_values: + raise exceptions.UnpermittedComponentError( + attribute, value, allowed_values, + ) + + +def ensure_required_components_exist(uri, required_components): + """Assert that all required components are present in the URI.""" + missing_components = sorted([ + component + for component in required_components + if getattr(uri, component) is None + ]) + if missing_components: + raise exceptions.MissingComponentError(uri, *missing_components) + + +def is_valid(value, matcher, require): + """Determine if a value is valid based on the provided matcher. + + :param str value: + Value to validate. + :param matcher: + Compiled regular expression to use to validate the value. + :param require: + Whether or not the value is required. 
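+ :returns: A truthy value when the component is valid, a falsy one otherwise.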
+ """ + if require: + return (value is not None + and matcher.match(value)) + + # require is False and value is not None + return value is None or matcher.match(value) + + +def authority_is_valid(authority, host=None, require=False): + """Determine if the authority string is valid. + + :param str authority: + The authority to validate. + :param str host: + (optional) The host portion of the authority to validate. + :param bool require: + (optional) Specify if authority must not be None. + :returns: + ``True`` if valid, ``False`` otherwise + :rtype: + bool + """ + validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require) + if validated and host is not None: + return host_is_valid(host, require) + return validated + + +def host_is_valid(host, require=False): + """Determine if the host string is valid. + + :param str host: + The host to validate. + :param bool require: + (optional) Specify if host must not be None. + :returns: + ``True`` if valid, ``False`` otherwise + :rtype: + bool + """ + validated = is_valid(host, misc.HOST_MATCHER, require) + if validated and host is not None and misc.IPv4_MATCHER.match(host): + return valid_ipv4_host_address(host) + elif validated and host is not None and misc.IPv6_MATCHER.match(host): + return misc.IPv6_NO_RFC4007_MATCHER.match(host) is not None + return validated + + +def scheme_is_valid(scheme, require=False): + """Determine if the scheme is valid. + + :param str scheme: + The scheme string to validate. + :param bool require: + (optional) Set to ``True`` to require the presence of a scheme. + :returns: + ``True`` if the scheme is valid. ``False`` otherwise. + :rtype: + bool + """ + return is_valid(scheme, misc.SCHEME_MATCHER, require) + + +def path_is_valid(path, require=False): + """Determine if the path component is valid. + + :param str path: + The path string to validate. + :param bool require: + (optional) Set to ``True`` to require the presence of a path. + :returns: + ``True`` if the path is valid. ``False`` otherwise. + :rtype: + bool + """ + return is_valid(path, misc.PATH_MATCHER, require) + + +def query_is_valid(query, require=False): + """Determine if the query component is valid. + + :param str query: + The query string to validate. + :param bool require: + (optional) Set to ``True`` to require the presence of a query. + :returns: + ``True`` if the query is valid. ``False`` otherwise. + :rtype: + bool + """ + return is_valid(query, misc.QUERY_MATCHER, require) + + +def fragment_is_valid(fragment, require=False): + """Determine if the fragment component is valid. + + :param str fragment: + The fragment string to validate. + :param bool require: + (optional) Set to ``True`` to require the presence of a fragment. + :returns: + ``True`` if the fragment is valid. ``False`` otherwise. + :rtype: + bool + """ + return is_valid(fragment, misc.FRAGMENT_MATCHER, require) + + +def valid_ipv4_host_address(host): + """Determine if the given host is a valid IPv4 address.""" + # If the host exists, and it might be IPv4, check each byte in the + # address. 
+ return all([0 <= int(byte, base=10) <= 255 for byte in host.split('.')]) + + +_COMPONENT_VALIDATORS = { + 'scheme': scheme_is_valid, + 'path': path_is_valid, + 'query': query_is_valid, + 'fragment': fragment_is_valid, +} + +_SUBAUTHORITY_VALIDATORS = set(['userinfo', 'host', 'port']) + + +def subauthority_component_is_valid(uri, component): + """Determine if the userinfo, host, and port are valid.""" + try: + subauthority_dict = uri.authority_info() + except exceptions.InvalidAuthority: + return False + + # If we can parse the authority into sub-components and we're not + # validating the port, we can assume it's valid. + if component == 'host': + return host_is_valid(subauthority_dict['host']) + elif component != 'port': + return True + + try: + port = int(subauthority_dict['port']) + except TypeError: + # If the port wasn't provided it'll be None and int(None) raises a + # TypeError + return True + + return (0 <= port <= 65535) + + +def ensure_components_are_valid(uri, validated_components): + """Assert that all components are valid in the URI.""" + invalid_components = set([]) + for component in validated_components: + if component in _SUBAUTHORITY_VALIDATORS: + if not subauthority_component_is_valid(uri, component): + invalid_components.add(component) + # Python's peephole optimizer means that while this continue *is* + # actually executed, coverage.py cannot detect that. See also, + # https://bitbucket.org/ned/coveragepy/issues/198/continue-marked-as-not-covered + continue # nocov: Python 2.7, 3.3, 3.4 + + validator = _COMPONENT_VALIDATORS[component] + if not validator(getattr(uri, component)): + invalid_components.add(component) + + if invalid_components: + raise exceptions.InvalidComponentsError(uri, *invalid_components) diff --git a/libs/urllib3/poolmanager.py b/libs/urllib3/poolmanager.py index 32bd97302..a6ade6e90 100644 --- a/libs/urllib3/poolmanager.py +++ b/libs/urllib3/poolmanager.py @@ -20,7 +20,8 @@ __all__ = ['PoolManager', 'ProxyManager', 'proxy_from_url'] log = logging.getLogger(__name__) SSL_KEYWORDS = ('key_file', 'cert_file', 'cert_reqs', 'ca_certs', - 'ssl_version', 'ca_cert_dir', 'ssl_context') + 'ssl_version', 'ca_cert_dir', 'ssl_context', + 'key_password') # All known keyword arguments that could be provided to the pool manager, its # pools, or the underlying connections. This is used to construct a pool key. @@ -34,6 +35,7 @@ _key_fields = ( 'key_block', # bool 'key_source_address', # str 'key_key_file', # str + 'key_key_password', # str 'key_cert_file', # str 'key_cert_reqs', # str 'key_ca_certs', # str @@ -48,7 +50,7 @@ _key_fields = ( 'key__socks_options', # dict 'key_assert_hostname', # bool or string 'key_assert_fingerprint', # str - 'key_server_hostname', #str + 'key_server_hostname', # str ) #: The namedtuple class used to construct keys for the connection pool. 
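The 'key_password' keyword added above deserves a quick illustration. Pools are keyed by their TLS settings, so listing 'key_key_password' in _key_fields keeps connections opened with different key passphrases from being shared. A minimal sketch of the keyword in use, assuming only the urllib3 1.25 plumbing shown in this hunk; the file paths and passphrase are placeholders:

import urllib3

# A client certificate whose private key is encrypted can now be
# unlocked by passing key_password alongside key_file.
http = urllib3.PoolManager(
    cert_file="client-cert.pem",   # placeholder path
    key_file="client-key.pem",     # placeholder path to an encrypted key
    key_password="s3cret",         # passphrase protecting that key
    cert_reqs="CERT_REQUIRED",
    ca_certs="ca-bundle.pem",      # placeholder CA bundle
)
resp = http.request("GET", "https://example.com/")
print(resp.status)

Because key_password is part of the pool key, a second PoolManager call with a different passphrase gets its own connection pool rather than reusing this one.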
diff --git a/libs/urllib3/response.py b/libs/urllib3/response.py index c112690b0..4f857932c 100644 --- a/libs/urllib3/response.py +++ b/libs/urllib3/response.py @@ -6,6 +6,11 @@ import logging from socket import timeout as SocketTimeout from socket import error as SocketError +try: + import brotli +except ImportError: + brotli = None + from ._collections import HTTPHeaderDict from .exceptions import ( BodyNotHttplibCompatible, ProtocolError, DecodeError, ReadTimeoutError, @@ -90,6 +95,25 @@ class GzipDecoder(object): self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS) +if brotli is not None: + class BrotliDecoder(object): + # Supports both 'brotlipy' and 'Brotli' packages + # since they share an import name. The top branches + # are for 'brotlipy' and bottom branches for 'Brotli' + def __init__(self): + self._obj = brotli.Decompressor() + + def decompress(self, data): + if hasattr(self._obj, 'decompress'): + return self._obj.decompress(data) + return self._obj.process(data) + + def flush(self): + if hasattr(self._obj, 'flush'): + return self._obj.flush() + return b'' + + class MultiDecoder(object): """ From RFC7231: @@ -118,6 +142,9 @@ def _get_decoder(mode): if mode == 'gzip': return GzipDecoder() + if brotli is not None and mode == 'br': + return BrotliDecoder() + return DeflateDecoder() @@ -155,6 +182,8 @@ class HTTPResponse(io.IOBase): """ CONTENT_DECODERS = ['gzip', 'deflate'] + if brotli is not None: + CONTENT_DECODERS += ['br'] REDIRECT_STATUSES = [301, 302, 303, 307, 308] def __init__(self, body='', headers=None, status=0, version=0, reason=None, @@ -311,24 +340,32 @@ class HTTPResponse(io.IOBase): if content_encoding in self.CONTENT_DECODERS: self._decoder = _get_decoder(content_encoding) elif ',' in content_encoding: - encodings = [e.strip() for e in content_encoding.split(',') if e.strip() in self.CONTENT_DECODERS] + encodings = [ + e.strip() for e in content_encoding.split(',') + if e.strip() in self.CONTENT_DECODERS] if len(encodings): self._decoder = _get_decoder(content_encoding) + DECODER_ERROR_CLASSES = (IOError, zlib.error) + if brotli is not None: + DECODER_ERROR_CLASSES += (brotli.error,) + def _decode(self, data, decode_content, flush_decoder): """ Decode the data passed in and potentially flush the decoder. """ + if not decode_content: + return data + try: - if decode_content and self._decoder: + if self._decoder: data = self._decoder.decompress(data) - except (IOError, zlib.error) as e: + except self.DECODER_ERROR_CLASSES as e: content_encoding = self.headers.get('content-encoding', '').lower() raise DecodeError( "Received response with content-encoding: %s, but " "failed to decode it." 
% content_encoding, e) - - if flush_decoder and decode_content: + if flush_decoder: data += self._flush_decoder() return data @@ -508,9 +545,10 @@ class HTTPResponse(io.IOBase): headers = r.msg if not isinstance(headers, HTTPHeaderDict): - if PY3: # Python 3 + if PY3: headers = HTTPHeaderDict(headers.items()) - else: # Python 2 + else: + # Python 2.7 headers = HTTPHeaderDict.from_httplib(headers) # HTTPResponse objects in Python 3 don't have a .strict attribute @@ -703,3 +741,20 @@ class HTTPResponse(io.IOBase): return self.retries.history[-1].redirect_location else: return self._request_url + + def __iter__(self): + buffer = [b""] + for chunk in self.stream(decode_content=True): + if b"\n" in chunk: + chunk = chunk.split(b"\n") + yield b"".join(buffer) + chunk[0] + b"\n" + for x in chunk[1:-1]: + yield x + b"\n" + if chunk[-1]: + buffer = [chunk[-1]] + else: + buffer = [] + else: + buffer.append(chunk) + if buffer: + yield b"".join(buffer) diff --git a/libs/urllib3/util/__init__.py b/libs/urllib3/util/__init__.py index 2f2770b62..2914bb468 100644 --- a/libs/urllib3/util/__init__.py +++ b/libs/urllib3/util/__init__.py @@ -12,6 +12,7 @@ from .ssl_ import ( resolve_cert_reqs, resolve_ssl_version, ssl_wrap_socket, + PROTOCOL_TLS, ) from .timeout import ( current_time, @@ -35,6 +36,7 @@ __all__ = ( 'IS_PYOPENSSL', 'IS_SECURETRANSPORT', 'SSLContext', + 'PROTOCOL_TLS', 'Retry', 'Timeout', 'Url', diff --git a/libs/urllib3/util/request.py b/libs/urllib3/util/request.py index 3ddfcd559..280b8530c 100644 --- a/libs/urllib3/util/request.py +++ b/libs/urllib3/util/request.py @@ -5,6 +5,13 @@ from ..packages.six import b, integer_types from ..exceptions import UnrewindableBodyError ACCEPT_ENCODING = 'gzip,deflate' +try: + import brotli as _unused_module_brotli # noqa: F401 +except ImportError: + pass +else: + ACCEPT_ENCODING += ',br' + _FAILEDTELL = object() diff --git a/libs/urllib3/util/selectors.py b/libs/urllib3/util/selectors.py deleted file mode 100644 index d75cb266b..000000000 --- a/libs/urllib3/util/selectors.py +++ /dev/null @@ -1,581 +0,0 @@ -# Backport of selectors.py from Python 3.5+ to support Python < 3.4 -# Also has the behavior specified in PEP 475 which is to retry syscalls -# in the case of an EINTR error. This module is required because selectors34 -# does not follow this behavior and instead returns that no dile descriptor -# events have occurred rather than retry the syscall. The decision to drop -# support for select.devpoll is made to maintain 100% test coverage. - -import errno -import math -import select -import socket -import sys -import time -from collections import namedtuple, Mapping - -try: - monotonic = time.monotonic -except (AttributeError, ImportError): # Python 3.3< - monotonic = time.time - -EVENT_READ = (1 << 0) -EVENT_WRITE = (1 << 1) - -HAS_SELECT = True # Variable that shows whether the platform has a selector. -_SYSCALL_SENTINEL = object() # Sentinel in case a system call returns None. -_DEFAULT_SELECTOR = None - - -class SelectorError(Exception): - def __init__(self, errcode): - super(SelectorError, self).__init__() - self.errno = errcode - - def __repr__(self): - return "<SelectorError errno={0}>".format(self.errno) - - def __str__(self): - return self.__repr__() - - -def _fileobj_to_fd(fileobj): - """ Return a file descriptor from a file object. If - given an integer will simply return that integer back. 
""" - if isinstance(fileobj, int): - fd = fileobj - else: - try: - fd = int(fileobj.fileno()) - except (AttributeError, TypeError, ValueError): - raise ValueError("Invalid file object: {0!r}".format(fileobj)) - if fd < 0: - raise ValueError("Invalid file descriptor: {0}".format(fd)) - return fd - - -# Determine which function to use to wrap system calls because Python 3.5+ -# already handles the case when system calls are interrupted. -if sys.version_info >= (3, 5): - def _syscall_wrapper(func, _, *args, **kwargs): - """ This is the short-circuit version of the below logic - because in Python 3.5+ all system calls automatically restart - and recalculate their timeouts. """ - try: - return func(*args, **kwargs) - except (OSError, IOError, select.error) as e: - errcode = None - if hasattr(e, "errno"): - errcode = e.errno - raise SelectorError(errcode) -else: - def _syscall_wrapper(func, recalc_timeout, *args, **kwargs): - """ Wrapper function for syscalls that could fail due to EINTR. - All functions should be retried if there is time left in the timeout - in accordance with PEP 475. """ - timeout = kwargs.get("timeout", None) - if timeout is None: - expires = None - recalc_timeout = False - else: - timeout = float(timeout) - if timeout < 0.0: # Timeout less than 0 treated as no timeout. - expires = None - else: - expires = monotonic() + timeout - - args = list(args) - if recalc_timeout and "timeout" not in kwargs: - raise ValueError( - "Timeout must be in args or kwargs to be recalculated") - - result = _SYSCALL_SENTINEL - while result is _SYSCALL_SENTINEL: - try: - result = func(*args, **kwargs) - # OSError is thrown by select.select - # IOError is thrown by select.epoll.poll - # select.error is thrown by select.poll.poll - # Aren't we thankful for Python 3.x rework for exceptions? - except (OSError, IOError, select.error) as e: - # select.error wasn't a subclass of OSError in the past. - errcode = None - if hasattr(e, "errno"): - errcode = e.errno - elif hasattr(e, "args"): - errcode = e.args[0] - - # Also test for the Windows equivalent of EINTR. - is_interrupt = (errcode == errno.EINTR or (hasattr(errno, "WSAEINTR") and - errcode == errno.WSAEINTR)) - - if is_interrupt: - if expires is not None: - current_time = monotonic() - if current_time > expires: - raise OSError(errno=errno.ETIMEDOUT) - if recalc_timeout: - if "timeout" in kwargs: - kwargs["timeout"] = expires - current_time - continue - if errcode: - raise SelectorError(errcode) - else: - raise - return result - - -SelectorKey = namedtuple('SelectorKey', ['fileobj', 'fd', 'events', 'data']) - - -class _SelectorMapping(Mapping): - """ Mapping of file objects to selector keys """ - - def __init__(self, selector): - self._selector = selector - - def __len__(self): - return len(self._selector._fd_to_key) - - def __getitem__(self, fileobj): - try: - fd = self._selector._fileobj_lookup(fileobj) - return self._selector._fd_to_key[fd] - except KeyError: - raise KeyError("{0!r} is not registered.".format(fileobj)) - - def __iter__(self): - return iter(self._selector._fd_to_key) - - -class BaseSelector(object): - """ Abstract Selector class - - A selector supports registering file objects to be monitored - for specific I/O events. - - A file object is a file descriptor or any object with a - `fileno()` method. An arbitrary object can be attached to the - file object which can be used for example to store context info, - a callback, etc. 
- - A selector can use various implementations (select(), poll(), epoll(), - and kqueue()) depending on the platform. The 'DefaultSelector' class uses - the most efficient implementation for the current platform. - """ - def __init__(self): - # Maps file descriptors to keys. - self._fd_to_key = {} - - # Read-only mapping returned by get_map() - self._map = _SelectorMapping(self) - - def _fileobj_lookup(self, fileobj): - """ Return a file descriptor from a file object. - This wraps _fileobj_to_fd() to do an exhaustive - search in case the object is invalid but we still - have it in our map. Used by unregister() so we can - unregister an object that was previously registered - even if it is closed. It is also used by _SelectorMapping - """ - try: - return _fileobj_to_fd(fileobj) - except ValueError: - - # Search through all our mapped keys. - for key in self._fd_to_key.values(): - if key.fileobj is fileobj: - return key.fd - - # Raise ValueError after all. - raise - - def register(self, fileobj, events, data=None): - """ Register a file object for a set of events to monitor. """ - if (not events) or (events & ~(EVENT_READ | EVENT_WRITE)): - raise ValueError("Invalid events: {0!r}".format(events)) - - key = SelectorKey(fileobj, self._fileobj_lookup(fileobj), events, data) - - if key.fd in self._fd_to_key: - raise KeyError("{0!r} (FD {1}) is already registered" - .format(fileobj, key.fd)) - - self._fd_to_key[key.fd] = key - return key - - def unregister(self, fileobj): - """ Unregister a file object from being monitored. """ - try: - key = self._fd_to_key.pop(self._fileobj_lookup(fileobj)) - except KeyError: - raise KeyError("{0!r} is not registered".format(fileobj)) - - # Getting the fileno of a closed socket on Windows errors with EBADF. - except socket.error as e: # Platform-specific: Windows. - if e.errno != errno.EBADF: - raise - else: - for key in self._fd_to_key.values(): - if key.fileobj is fileobj: - self._fd_to_key.pop(key.fd) - break - else: - raise KeyError("{0!r} is not registered".format(fileobj)) - return key - - def modify(self, fileobj, events, data=None): - """ Change a registered file object monitored events and data. """ - # NOTE: Some subclasses optimize this operation even further. - try: - key = self._fd_to_key[self._fileobj_lookup(fileobj)] - except KeyError: - raise KeyError("{0!r} is not registered".format(fileobj)) - - if events != key.events: - self.unregister(fileobj) - key = self.register(fileobj, events, data) - - elif data != key.data: - # Use a shortcut to update the data. - key = key._replace(data=data) - self._fd_to_key[key.fd] = key - - return key - - def select(self, timeout=None): - """ Perform the actual selection until some monitored file objects - are ready or the timeout expires. """ - raise NotImplementedError() - - def close(self): - """ Close the selector. This must be called to ensure that all - underlying resources are freed. """ - self._fd_to_key.clear() - self._map = None - - def get_key(self, fileobj): - """ Return the key associated with a registered file object. """ - mapping = self.get_map() - if mapping is None: - raise RuntimeError("Selector is closed") - try: - return mapping[fileobj] - except KeyError: - raise KeyError("{0!r} is not registered".format(fileobj)) - - def get_map(self): - """ Return a mapping of file objects to selector keys """ - return self._map - - def _key_from_fd(self, fd): - """ Return the key associated to a given file descriptor - Return None if it is not found. 
""" - try: - return self._fd_to_key[fd] - except KeyError: - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - -# Almost all platforms have select.select() -if hasattr(select, "select"): - class SelectSelector(BaseSelector): - """ Select-based selector. """ - def __init__(self): - super(SelectSelector, self).__init__() - self._readers = set() - self._writers = set() - - def register(self, fileobj, events, data=None): - key = super(SelectSelector, self).register(fileobj, events, data) - if events & EVENT_READ: - self._readers.add(key.fd) - if events & EVENT_WRITE: - self._writers.add(key.fd) - return key - - def unregister(self, fileobj): - key = super(SelectSelector, self).unregister(fileobj) - self._readers.discard(key.fd) - self._writers.discard(key.fd) - return key - - def _select(self, r, w, timeout=None): - """ Wrapper for select.select because timeout is a positional arg """ - return select.select(r, w, [], timeout) - - def select(self, timeout=None): - # Selecting on empty lists on Windows errors out. - if not len(self._readers) and not len(self._writers): - return [] - - timeout = None if timeout is None else max(timeout, 0.0) - ready = [] - r, w, _ = _syscall_wrapper(self._select, True, self._readers, - self._writers, timeout) - r = set(r) - w = set(w) - for fd in r | w: - events = 0 - if fd in r: - events |= EVENT_READ - if fd in w: - events |= EVENT_WRITE - - key = self._key_from_fd(fd) - if key: - ready.append((key, events & key.events)) - return ready - - -if hasattr(select, "poll"): - class PollSelector(BaseSelector): - """ Poll-based selector """ - def __init__(self): - super(PollSelector, self).__init__() - self._poll = select.poll() - - def register(self, fileobj, events, data=None): - key = super(PollSelector, self).register(fileobj, events, data) - event_mask = 0 - if events & EVENT_READ: - event_mask |= select.POLLIN - if events & EVENT_WRITE: - event_mask |= select.POLLOUT - self._poll.register(key.fd, event_mask) - return key - - def unregister(self, fileobj): - key = super(PollSelector, self).unregister(fileobj) - self._poll.unregister(key.fd) - return key - - def _wrap_poll(self, timeout=None): - """ Wrapper function for select.poll.poll() so that - _syscall_wrapper can work with only seconds. """ - if timeout is not None: - if timeout <= 0: - timeout = 0 - else: - # select.poll.poll() has a resolution of 1 millisecond, - # round away from zero to wait *at least* timeout seconds. 
- timeout = math.ceil(timeout * 1e3) - - result = self._poll.poll(timeout) - return result - - def select(self, timeout=None): - ready = [] - fd_events = _syscall_wrapper(self._wrap_poll, True, timeout=timeout) - for fd, event_mask in fd_events: - events = 0 - if event_mask & ~select.POLLIN: - events |= EVENT_WRITE - if event_mask & ~select.POLLOUT: - events |= EVENT_READ - - key = self._key_from_fd(fd) - if key: - ready.append((key, events & key.events)) - - return ready - - -if hasattr(select, "epoll"): - class EpollSelector(BaseSelector): - """ Epoll-based selector """ - def __init__(self): - super(EpollSelector, self).__init__() - self._epoll = select.epoll() - - def fileno(self): - return self._epoll.fileno() - - def register(self, fileobj, events, data=None): - key = super(EpollSelector, self).register(fileobj, events, data) - events_mask = 0 - if events & EVENT_READ: - events_mask |= select.EPOLLIN - if events & EVENT_WRITE: - events_mask |= select.EPOLLOUT - _syscall_wrapper(self._epoll.register, False, key.fd, events_mask) - return key - - def unregister(self, fileobj): - key = super(EpollSelector, self).unregister(fileobj) - try: - _syscall_wrapper(self._epoll.unregister, False, key.fd) - except SelectorError: - # This can occur when the fd was closed since registry. - pass - return key - - def select(self, timeout=None): - if timeout is not None: - if timeout <= 0: - timeout = 0.0 - else: - # select.epoll.poll() has a resolution of 1 millisecond - # but luckily takes seconds so we don't need a wrapper - # like PollSelector. Just for better rounding. - timeout = math.ceil(timeout * 1e3) * 1e-3 - timeout = float(timeout) - else: - timeout = -1.0 # epoll.poll() must have a float. - - # We always want at least 1 to ensure that select can be called - # with no file descriptors registered. Otherwise will fail. 
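The millisecond rounding used by the removed poll/epoll paths can be checked in isolation; a small standalone sketch of the epoll variant (round away from zero at millisecond resolution, then convert back to seconds):

    import math

    def epoll_timeout(timeout):
        # mirrors the branch above: -1.0 means block indefinitely,
        # otherwise round up so we wait *at least* the requested time
        if timeout is None:
            return -1.0
        if timeout <= 0:
            return 0.0
        return float(math.ceil(timeout * 1e3) * 1e-3)

    assert epoll_timeout(0.0004) == 0.001
    assert epoll_timeout(None) == -1.0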
- max_events = max(len(self._fd_to_key), 1) - - ready = [] - fd_events = _syscall_wrapper(self._epoll.poll, True, - timeout=timeout, - maxevents=max_events) - for fd, event_mask in fd_events: - events = 0 - if event_mask & ~select.EPOLLIN: - events |= EVENT_WRITE - if event_mask & ~select.EPOLLOUT: - events |= EVENT_READ - - key = self._key_from_fd(fd) - if key: - ready.append((key, events & key.events)) - return ready - - def close(self): - self._epoll.close() - super(EpollSelector, self).close() - - -if hasattr(select, "kqueue"): - class KqueueSelector(BaseSelector): - """ Kqueue / Kevent-based selector """ - def __init__(self): - super(KqueueSelector, self).__init__() - self._kqueue = select.kqueue() - - def fileno(self): - return self._kqueue.fileno() - - def register(self, fileobj, events, data=None): - key = super(KqueueSelector, self).register(fileobj, events, data) - if events & EVENT_READ: - kevent = select.kevent(key.fd, - select.KQ_FILTER_READ, - select.KQ_EV_ADD) - - _syscall_wrapper(self._kqueue.control, False, [kevent], 0, 0) - - if events & EVENT_WRITE: - kevent = select.kevent(key.fd, - select.KQ_FILTER_WRITE, - select.KQ_EV_ADD) - - _syscall_wrapper(self._kqueue.control, False, [kevent], 0, 0) - - return key - - def unregister(self, fileobj): - key = super(KqueueSelector, self).unregister(fileobj) - if key.events & EVENT_READ: - kevent = select.kevent(key.fd, - select.KQ_FILTER_READ, - select.KQ_EV_DELETE) - try: - _syscall_wrapper(self._kqueue.control, False, [kevent], 0, 0) - except SelectorError: - pass - if key.events & EVENT_WRITE: - kevent = select.kevent(key.fd, - select.KQ_FILTER_WRITE, - select.KQ_EV_DELETE) - try: - _syscall_wrapper(self._kqueue.control, False, [kevent], 0, 0) - except SelectorError: - pass - - return key - - def select(self, timeout=None): - if timeout is not None: - timeout = max(timeout, 0) - - max_events = len(self._fd_to_key) * 2 - ready_fds = {} - - kevent_list = _syscall_wrapper(self._kqueue.control, True, - None, max_events, timeout) - - for kevent in kevent_list: - fd = kevent.ident - event_mask = kevent.filter - events = 0 - if event_mask == select.KQ_FILTER_READ: - events |= EVENT_READ - if event_mask == select.KQ_FILTER_WRITE: - events |= EVENT_WRITE - - key = self._key_from_fd(fd) - if key: - if key.fd not in ready_fds: - ready_fds[key.fd] = (key, events & key.events) - else: - old_events = ready_fds[key.fd][1] - ready_fds[key.fd] = (key, (events | old_events) & key.events) - - return list(ready_fds.values()) - - def close(self): - self._kqueue.close() - super(KqueueSelector, self).close() - - -if not hasattr(select, 'select'): # Platform-specific: AppEngine - HAS_SELECT = False - - -def _can_allocate(struct): - """ Checks that select structs can be allocated by the underlying - operating system, not just advertised by the select module. We don't - check select() because we'll be hopeful that most platforms that - don't have it available will not advertise it. (ie: GAE) """ - try: - # select.poll() objects won't fail until used. - if struct == 'poll': - p = select.poll() - p.poll(0) - - # All others will fail on allocation. - else: - getattr(select, struct)().close() - return True - except (OSError, AttributeError) as e: - return False - - -# Choose the best implementation, roughly: -# kqueue == epoll > poll > select. Devpoll not supported. 
(See above) -# select() also can't accept a FD > FD_SETSIZE (usually around 1024) -def DefaultSelector(): - """ This function serves as a first call for DefaultSelector to - detect if the select module is being monkey-patched incorrectly - by eventlet, greenlet, and preserve proper behavior. """ - global _DEFAULT_SELECTOR - if _DEFAULT_SELECTOR is None: - if _can_allocate('kqueue'): - _DEFAULT_SELECTOR = KqueueSelector - elif _can_allocate('epoll'): - _DEFAULT_SELECTOR = EpollSelector - elif _can_allocate('poll'): - _DEFAULT_SELECTOR = PollSelector - elif hasattr(select, 'select'): - _DEFAULT_SELECTOR = SelectSelector - else: # Platform-specific: AppEngine - raise ValueError('Platform does not have a selector') - return _DEFAULT_SELECTOR() diff --git a/libs/urllib3/util/ssl_.py b/libs/urllib3/util/ssl_.py index 5ae435827..f271ce930 100644 --- a/libs/urllib3/util/ssl_.py +++ b/libs/urllib3/util/ssl_.py @@ -2,13 +2,14 @@ from __future__ import absolute_import import errno import warnings import hmac -import socket +import re from binascii import hexlify, unhexlify from hashlib import md5, sha1, sha256 from ..exceptions import SSLError, InsecurePlatformWarning, SNIMissingWarning from ..packages import six +from ..packages.rfc3986 import abnf_regexp SSLContext = None @@ -40,14 +41,33 @@ def _const_compare_digest_backport(a, b): _const_compare_digest = getattr(hmac, 'compare_digest', _const_compare_digest_backport) +# Borrow rfc3986's regular expressions for IPv4 +# and IPv6 addresses for use in is_ipaddress() +_IP_ADDRESS_REGEX = re.compile( + r'^(?:%s|%s|%s)$' % ( + abnf_regexp.IPv4_RE, + abnf_regexp.IPv6_RE, + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + ) +) try: # Test for SSL features import ssl - from ssl import wrap_socket, CERT_NONE, PROTOCOL_SSLv23 + from ssl import wrap_socket, CERT_REQUIRED from ssl import HAS_SNI # Has SNI? except ImportError: pass +try: # Platform-specific: Python 3.6 + from ssl import PROTOCOL_TLS + PROTOCOL_SSLv23 = PROTOCOL_TLS +except ImportError: + try: + from ssl import PROTOCOL_SSLv23 as PROTOCOL_TLS + PROTOCOL_SSLv23 = PROTOCOL_TLS + except ImportError: + PROTOCOL_SSLv23 = PROTOCOL_TLS = 2 + try: from ssl import OP_NO_SSLv2, OP_NO_SSLv3, OP_NO_COMPRESSION @@ -56,25 +76,6 @@ except ImportError: OP_NO_COMPRESSION = 0x20000 -# Python 2.7 doesn't have inet_pton on non-Linux so we fallback on inet_aton in -# those cases. This means that we can only detect IPv4 addresses in this case. -if hasattr(socket, 'inet_pton'): - inet_pton = socket.inet_pton -else: - # Maybe we can use ipaddress if the user has urllib3[secure]? - try: - import ipaddress - - def inet_pton(_, host): - if isinstance(host, bytes): - host = host.decode('ascii') - return ipaddress.ip_address(host) - - except ImportError: # Platform-specific: Non-Linux - def inet_pton(_, host): - return socket.inet_aton(host) - - # A secure default. # Sources for more information on TLS ciphers: # @@ -83,37 +84,35 @@ else: # - https://hynek.me/articles/hardening-your-web-servers-ssl-ciphers/ # # The general intent is: -# - Prefer TLS 1.3 cipher suites # - prefer cipher suites that offer perfect forward secrecy (DHE/ECDHE), # - prefer ECDHE over DHE for better performance, # - prefer any AES-GCM and ChaCha20 over any AES-CBC for better performance and # security, # - prefer AES-GCM over ChaCha20 because hardware-accelerated AES is common, -# - disable NULL authentication, MD5 MACs and DSS for security reasons. +# - disable NULL authentication, MD5 MACs, DSS, and other +# insecure ciphers for security reasons. 
+# - NOTE: TLS 1.3 cipher suites are managed through a different interface +# not exposed by CPython (yet!) and are enabled by default if they're available. DEFAULT_CIPHERS = ':'.join([ - 'TLS13-AES-256-GCM-SHA384', - 'TLS13-CHACHA20-POLY1305-SHA256', - 'TLS13-AES-128-GCM-SHA256', + 'ECDHE+AESGCM', + 'ECDHE+CHACHA20', + 'DHE+AESGCM', + 'DHE+CHACHA20', 'ECDH+AESGCM', - 'ECDH+CHACHA20', 'DH+AESGCM', - 'DH+CHACHA20', - 'ECDH+AES256', - 'DH+AES256', - 'ECDH+AES128', + 'ECDH+AES', 'DH+AES', 'RSA+AESGCM', 'RSA+AES', '!aNULL', '!eNULL', '!MD5', + '!DSS', ]) try: from ssl import SSLContext # Modern SSL? except ImportError: - import sys - class SSLContext(object): # Platform-specific: Python 2 def __init__(self, protocol_version): self.protocol = protocol_version @@ -199,7 +198,7 @@ def resolve_cert_reqs(candidate): constant which can directly be passed to wrap_socket. """ if candidate is None: - return CERT_NONE + return CERT_REQUIRED if isinstance(candidate, str): res = getattr(ssl, candidate, None) @@ -215,7 +214,7 @@ def resolve_ssl_version(candidate): like resolve_cert_reqs """ if candidate is None: - return PROTOCOL_SSLv23 + return PROTOCOL_TLS if isinstance(candidate, str): res = getattr(ssl, candidate, None) @@ -261,7 +260,7 @@ def create_urllib3_context(ssl_version=None, cert_reqs=None, Constructed SSLContext object with specified options :rtype: SSLContext """ - context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23) + context = SSLContext(ssl_version or PROTOCOL_TLS) context.set_ciphers(ciphers or DEFAULT_CIPHERS) @@ -291,7 +290,7 @@ def create_urllib3_context(ssl_version=None, cert_reqs=None, def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, ca_certs=None, server_hostname=None, ssl_version=None, ciphers=None, ssl_context=None, - ca_cert_dir=None): + ca_cert_dir=None, key_password=None): """ All arguments except for server_hostname, ssl_context, and ca_cert_dir have the same meaning as they do when using :func:`ssl.wrap_socket`. @@ -307,6 +306,8 @@ def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, A directory containing CA certificates in multiple separate files, as supported by OpenSSL's -CApath flag or the capath argument to SSLContext.load_verify_locations(). + :param key_password: + Optional password if the keyfile is encrypted. """ context = ssl_context if context is None: @@ -328,14 +329,21 @@ def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, raise SSLError(e) raise - # Don't load system certs unless there were no CA certs or - # SSLContext object specified manually. elif ssl_context is None and hasattr(context, 'load_default_certs'): # try to load OS default certs; works well on Windows (require Python3.4+) context.load_default_certs() + # Attempt to detect if we get the goofy behavior of the + # keyfile being encrypted and OpenSSL asking for the + # passphrase via the terminal and instead error out. 
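The detection referenced in the comment above is a plain text scan for the PEM ENCRYPTED marker (the _is_key_file_encrypted() helper used just below and defined at the end of this file). A quick illustration of what it keys on, with fake key material:

    fake_key = (
        "-----BEGIN RSA PRIVATE KEY-----\n"
        "Proc-Type: 4,ENCRYPTED\n"
        "DEK-Info: AES-128-CBC,0123456789ABCDEF\n"
        "-----END RSA PRIVATE KEY-----\n"
    )
    # the scan succeeds on the Proc-Type header of an encrypted PEM key
    print(any("ENCRYPTED" in line for line in fake_key.splitlines()))  # True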
+ if keyfile and key_password is None and _is_key_file_encrypted(keyfile): + raise SSLError("Client private key is encrypted, password is required") + if certfile: - context.load_cert_chain(certfile, keyfile) + if key_password is None: + context.load_cert_chain(certfile, keyfile) + else: + context.load_cert_chain(certfile, keyfile, key_password) # If we detect server_hostname is an IP address then the SNI # extension should not be used according to RFC3546 Section 3.1 @@ -361,7 +369,8 @@ def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, def is_ipaddress(hostname): - """Detects whether the hostname given is an IP address. + """Detects whether the hostname given is an IPv4 or IPv6 address. + Also detects IPv6 addresses with Zone IDs. :param str hostname: Hostname to examine. :return: True if the hostname is an IP address, False otherwise. @@ -369,16 +378,15 @@ def is_ipaddress(hostname): if six.PY3 and isinstance(hostname, bytes): # IDN A-label bytes are ASCII compatible. hostname = hostname.decode('ascii') + return _IP_ADDRESS_REGEX.match(hostname) is not None - families = [socket.AF_INET] - if hasattr(socket, 'AF_INET6'): - families.append(socket.AF_INET6) - for af in families: - try: - inet_pton(af, hostname) - except (socket.error, ValueError, OSError): - pass - else: - return True +def _is_key_file_encrypted(key_file): + """Detects if a key file is encrypted or not.""" + with open(key_file, 'r') as f: + for line in f: + # Look for Proc-Type: 4,ENCRYPTED + if 'ENCRYPTED' in line: + return True + return False diff --git a/libs/urllib3/util/timeout.py b/libs/urllib3/util/timeout.py index cec817e6e..a4d004a84 100644 --- a/libs/urllib3/util/timeout.py +++ b/libs/urllib3/util/timeout.py @@ -131,7 +131,8 @@ class Timeout(object): raise ValueError("Attempted to set %s timeout to %s, but the " "timeout cannot be set to a value less " "than or equal to 0." % (name, value)) - except TypeError: # Python 3 + except TypeError: + # Python 3 raise ValueError("Timeout value %s was %s, but it must be an " "int, float or None." % (name, value)) diff --git a/libs/urllib3/util/url.py b/libs/urllib3/util/url.py index 6b6f9968d..0bc6ced75 100644 --- a/libs/urllib3/util/url.py +++ b/libs/urllib3/util/url.py @@ -1,7 +1,12 @@ from __future__ import absolute_import +import re from collections import namedtuple from ..exceptions import LocationParseError +from ..packages import six, rfc3986 +from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError +from ..packages.rfc3986.validators import Validator +from ..packages.rfc3986 import abnf_regexp, normalizers, compat, misc url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment'] @@ -10,10 +15,16 @@ url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment'] # urllib3 infers URLs without a scheme (None) to be http. NORMALIZABLE_SCHEMES = ('http', 'https', None) +# Regex for detecting URLs with schemes. RFC 3986 Section 3.1 +SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)") + +PATH_CHARS = abnf_regexp.UNRESERVED_CHARS_SET | abnf_regexp.SUB_DELIMITERS_SET | {':', '@', '/'} +QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {'?'} + class Url(namedtuple('Url', url_attrs)): """ - Datastructure for representing an HTTP URL. Used as a return value for + Data structure for representing an HTTP URL. Used as a return value for :func:`parse_url`. Both the scheme and host are normalized as they are both case-insensitive according to RFC 3986. 
""" @@ -23,10 +34,8 @@ class Url(namedtuple('Url', url_attrs)): query=None, fragment=None): if path and not path.startswith('/'): path = '/' + path - if scheme: + if scheme is not None: scheme = scheme.lower() - if host and scheme in NORMALIZABLE_SCHEMES: - host = host.lower() return super(Url, cls).__new__(cls, scheme, auth, host, port, path, query, fragment) @@ -72,23 +81,23 @@ class Url(namedtuple('Url', url_attrs)): 'http://username:[email protected]:80/path?query#fragment' """ scheme, auth, host, port, path, query, fragment = self - url = '' + url = u'' # We use "is not None" we want things to happen with empty strings (or 0 port) if scheme is not None: - url += scheme + '://' + url += scheme + u'://' if auth is not None: - url += auth + '@' + url += auth + u'@' if host is not None: url += host if port is not None: - url += ':' + str(port) + url += u':' + str(port) if path is not None: url += path if query is not None: - url += '?' + query + url += u'?' + query if fragment is not None: - url += '#' + fragment + url += u'#' + fragment return url @@ -98,6 +107,8 @@ class Url(namedtuple('Url', url_attrs)): def split_first(s, delims): """ + .. deprecated:: 1.25 + Given a string and an iterable of delimiters, split on the first found delimiter. Return two split parts and the matched delimiter. @@ -129,10 +140,44 @@ def split_first(s, delims): return s[:min_idx], s[min_idx + 1:], min_delim +def _encode_invalid_chars(component, allowed_chars, encoding='utf-8'): + """Percent-encodes a URI component without reapplying + onto an already percent-encoded component. Based on + rfc3986.normalizers.encode_component() + """ + if component is None: + return component + + # Try to see if the component we're encoding is already percent-encoded + # so we can skip all '%' characters but still encode all others. + percent_encodings = len(normalizers.PERCENT_MATCHER.findall( + compat.to_str(component, encoding))) + + uri_bytes = component.encode('utf-8', 'surrogatepass') + is_percent_encoded = percent_encodings == uri_bytes.count(b'%') + + encoded_component = bytearray() + + for i in range(0, len(uri_bytes)): + # Will return a single character bytestring on both Python 2 & 3 + byte = uri_bytes[i:i+1] + byte_ord = ord(byte) + if ((is_percent_encoded and byte == b'%') + or (byte_ord < 128 and byte.decode() in allowed_chars)): + encoded_component.extend(byte) + continue + encoded_component.extend('%{0:02x}'.format(byte_ord).encode().upper()) + + return encoded_component.decode(encoding) + + def parse_url(url): """ Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is performed to parse incomplete urls. Fields not provided will be None. + This parser is RFC 3986 compliant. + + :param str url: URL to parse into a :class:`.Url` namedtuple. Partly backwards-compatible with :mod:`urlparse`. @@ -145,81 +190,95 @@ def parse_url(url): >>> parse_url('/foo?bar') Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...) """ - - # While this code has overlap with stdlib's urlparse, it is much - # simplified for our needs and less annoying. - # Additionally, this implementations does silly things to be optimal - # on CPython. 
- if not url: # Empty return Url() - scheme = None - auth = None - host = None - port = None - path = None - fragment = None - query = None - - # Scheme - if '://' in url: - scheme, url = url.split('://', 1) - - # Find the earliest Authority Terminator - # (http://tools.ietf.org/html/rfc3986#section-3.2) - url, path_, delim = split_first(url, ['/', '?', '#']) - - if delim: - # Reassemble the path - path = delim + path_ - - # Auth - if '@' in url: - # Last '@' denotes end of auth part - auth, url = url.rsplit('@', 1) - - # IPv6 - if url and url[0] == '[': - host, url = url.split(']', 1) - host += ']' - - # Port - if ':' in url: - _host, port = url.split(':', 1) - - if not host: - host = _host - - if port: - # If given, ports must be integers. No whitespace, no plus or - # minus prefixes, no non-integer digits such as ^2 (superscript). - if not port.isdigit(): - raise LocationParseError(url) - try: - port = int(port) - except ValueError: - raise LocationParseError(url) - else: - # Blank ports are cool, too. (rfc3986#section-3.2.3) - port = None + is_string = not isinstance(url, six.binary_type) - elif not host and url: - host = url + # RFC 3986 doesn't like URLs that have a host but don't start + # with a scheme and we support URLs like that so we need to + # detect that problem and add an empty scheme indication. + # We don't get hurt on path-only URLs here as it's stripped + # off and given an empty scheme anyways. + if not SCHEME_REGEX.search(url): + url = "//" + url + def idna_encode(name): + if name and any([ord(x) > 128 for x in name]): + try: + import idna + except ImportError: + raise LocationParseError("Unable to parse URL without the 'idna' module") + try: + return idna.encode(name.lower(), strict=True, std3_rules=True) + except idna.IDNAError: + raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name) + return name + + try: + split_iri = misc.IRI_MATCHER.match(compat.to_str(url)).groupdict() + iri_ref = rfc3986.IRIReference( + split_iri['scheme'], split_iri['authority'], + _encode_invalid_chars(split_iri['path'], PATH_CHARS), + _encode_invalid_chars(split_iri['query'], QUERY_CHARS), + _encode_invalid_chars(split_iri['fragment'], FRAGMENT_CHARS) + ) + has_authority = iri_ref.authority is not None + uri_ref = iri_ref.encode(idna_encoder=idna_encode) + except (ValueError, RFC3986Exception): + return six.raise_from(LocationParseError(url), None) + + # rfc3986 strips the authority if it's invalid + if has_authority and uri_ref.authority is None: + raise LocationParseError(url) + + # Only normalize schemes we understand to not break http+unix + # or other schemes that don't follow RFC 3986. + if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES: + uri_ref = uri_ref.normalize() + + # Validate all URIReference components and ensure that all + # components that were set before are still set after + # normalization has completed. + validator = Validator() + try: + validator.check_validity_of( + *validator.COMPONENT_NAMES + ).validate(uri_ref) + except ValidationError: + return six.raise_from(LocationParseError(url), None) + + # For the sake of backwards compatibility we put empty + # string values for path if there are any defined values + # beyond the path in the URL. + # TODO: Remove this when we break backwards compatibility. + path = uri_ref.path if not path: - return Url(scheme, auth, host, port, path, query, fragment) - - # Fragment - if '#' in path: - path, fragment = path.split('#', 1) - - # Query - if '?' 
in path: - path, query = path.split('?', 1) - - return Url(scheme, auth, host, port, path, query, fragment) + if (uri_ref.query is not None + or uri_ref.fragment is not None): + path = "" + else: + path = None + + # Ensure that each part of the URL is a `str` for + # backwards compatibility. + def to_input_type(x): + if x is None: + return None + elif not is_string and not isinstance(x, six.binary_type): + return x.encode('utf-8') + return x + + return Url( + scheme=to_input_type(uri_ref.scheme), + auth=to_input_type(uri_ref.userinfo), + host=to_input_type(uri_ref.host), + port=int(uri_ref.port) if uri_ref.port is not None else None, + path=to_input_type(path), + query=to_input_type(uri_ref.query), + fragment=to_input_type(uri_ref.fragment) + ) def get_host(url): diff --git a/libs/bs4/AUTHORS.txt b/libs2/bs4/AUTHORS.txt index 2ac8fcc8c..2ac8fcc8c 100644 --- a/libs/bs4/AUTHORS.txt +++ b/libs2/bs4/AUTHORS.txt diff --git a/libs/bs4/COPYING.txt b/libs2/bs4/COPYING.txt index b91188869..b91188869 100644 --- a/libs/bs4/COPYING.txt +++ b/libs2/bs4/COPYING.txt diff --git a/libs/bs4/NEWS.txt b/libs2/bs4/NEWS.txt index 3726c570a..3726c570a 100644 --- a/libs/bs4/NEWS.txt +++ b/libs2/bs4/NEWS.txt diff --git a/libs/bs4/README.txt b/libs2/bs4/README.txt index 305c51e05..305c51e05 100644 --- a/libs/bs4/README.txt +++ b/libs2/bs4/README.txt diff --git a/libs/bs4/TODO.txt b/libs2/bs4/TODO.txt index e26d6264d..e26d6264d 100644 --- a/libs/bs4/TODO.txt +++ b/libs2/bs4/TODO.txt diff --git a/libs/bs4/__init__.py b/libs2/bs4/__init__.py index 7a80452f7..7a80452f7 100644 --- a/libs/bs4/__init__.py +++ b/libs2/bs4/__init__.py diff --git a/libs/bs4/builder/__init__.py b/libs2/bs4/builder/__init__.py index fdb3362fc..fdb3362fc 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs2/bs4/builder/__init__.py diff --git a/libs/bs4/builder/_html5lib.py b/libs2/bs4/builder/_html5lib.py index 5f5489358..5f5489358 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs2/bs4/builder/_html5lib.py diff --git a/libs/bs4/builder/_htmlparser.py b/libs2/bs4/builder/_htmlparser.py index 67890b3a3..67890b3a3 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs2/bs4/builder/_htmlparser.py diff --git a/libs/bs4/builder/_lxml.py b/libs2/bs4/builder/_lxml.py index d2ca2872d..d2ca2872d 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs2/bs4/builder/_lxml.py diff --git a/libs/bs4/dammit.py b/libs2/bs4/dammit.py index 7965565f5..7965565f5 100644 --- a/libs/bs4/dammit.py +++ b/libs2/bs4/dammit.py diff --git a/libs/bs4/diagnose.py b/libs2/bs4/diagnose.py index 8768332f5..8768332f5 100644 --- a/libs/bs4/diagnose.py +++ b/libs2/bs4/diagnose.py diff --git a/libs/bs4/element.py b/libs2/bs4/element.py index 9ef75f814..9ef75f814 100644 --- a/libs/bs4/element.py +++ b/libs2/bs4/element.py diff --git a/libs2/bs4/formatter.py b/libs2/bs4/formatter.py new file mode 100644 index 000000000..7dbaa3850 --- /dev/null +++ b/libs2/bs4/formatter.py @@ -0,0 +1,99 @@ +from bs4.dammit import EntitySubstitution + +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. + + Some parts of this strategy come from the distinction between + HTML4, HTML5, and XML. Others are configurable by the user. + """ + # Registries of XML and HTML formatters. 
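Stepping back to the RFC 3986-based parse_url() above: scheme-less URLs are still accepted (the SCHEME_REGEX branch prepends "//"), and hosts are lowercased by normalization. Expected behavior, assuming urllib3 and its vendored rfc3986 package are importable:

    from urllib3.util.url import parse_url

    u = parse_url("Example.com:8080/path?q=1#frag")
    print(u.host, u.port, u.path, u.query, u.fragment)
    # example.com 8080 /path q=1 frag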
+ XML_FORMATTERS = {} + HTML_FORMATTERS = {} + + HTML = 'html' + XML = 'xml' + + HTML_DEFAULTS = dict( + cdata_containing_tags=set(["script", "style"]), + ) + + def _default(self, language, value, kwarg): + if value is not None: + return value + if language == self.XML: + return set() + return self.HTML_DEFAULTS[kwarg] + + def __init__( + self, language=None, entity_substitution=None, + void_element_close_prefix='/', cdata_containing_tags=None, + ): + """ + + :param void_element_close_prefix: By default, represent void + elements as <tag/> rather than <tag> + """ + self.language = language + self.entity_substitution = entity_substitution + self.void_element_close_prefix = void_element_close_prefix + self.cdata_containing_tags = self._default( + language, cdata_containing_tags, 'cdata_containing_tags' + ) + + def substitute(self, ns): + """Process a string that needs to undergo entity substitution.""" + if not self.entity_substitution: + return ns + from .element import NavigableString + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in self.cdata_containing_tags): + # Do nothing. + return ns + # Substitute. + return self.entity_substitution(ns) + + def attribute_value(self, value): + """Process the value of an attribute.""" + return self.substitute(value) + + def attributes(self, tag): + """Reorder a tag's attributes however you want.""" + return sorted(tag.attrs.items()) + + +class HTMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) + + +class XMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) + + +# Set up aliases for the default formatters. 
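The aliases set up just below are what a string formatter=... argument resolves to when a tree is rendered. For example, assuming this bs4 package is importable:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>"A & B"</p>', "html.parser")
    print(soup.decode(formatter="minimal"))  # <p>"A &amp; B"</p>
    print(soup.decode(formatter="html"))     # <p>&quot;A &amp; B&quot;</p>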
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html,
+ void_element_close_prefix = None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+ entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = Formatter(
+ Formatter.XML, entity_substitution=None
+)
diff --git a/libs/bs4/testing.py b/libs2/bs4/testing.py index 6ba2506c4..6ba2506c4 100644 --- a/libs/bs4/testing.py +++ b/libs2/bs4/testing.py
diff --git a/libs/bs4/tests/__init__.py b/libs2/bs4/tests/__init__.py index 142c8cc3f..142c8cc3f 100644 --- a/libs/bs4/tests/__init__.py +++ b/libs2/bs4/tests/__init__.py
diff --git a/libs/bs4/tests/test_builder_registry.py b/libs2/bs4/tests/test_builder_registry.py index 90cad8293..90cad8293 100644 --- a/libs/bs4/tests/test_builder_registry.py +++ b/libs2/bs4/tests/test_builder_registry.py
diff --git a/libs/bs4/tests/test_docs.py b/libs2/bs4/tests/test_docs.py index 5b9f67709..5b9f67709 100644 --- a/libs/bs4/tests/test_docs.py +++ b/libs2/bs4/tests/test_docs.py
diff --git a/libs/bs4/tests/test_html5lib.py b/libs2/bs4/tests/test_html5lib.py index 0f89d6244..0f89d6244 100644 --- a/libs/bs4/tests/test_html5lib.py +++ b/libs2/bs4/tests/test_html5lib.py
diff --git a/libs/bs4/tests/test_htmlparser.py b/libs2/bs4/tests/test_htmlparser.py index d5cf0253f..d5cf0253f 100644 --- a/libs/bs4/tests/test_htmlparser.py +++ b/libs2/bs4/tests/test_htmlparser.py
diff --git a/libs/bs4/tests/test_lxml.py b/libs2/bs4/tests/test_lxml.py index a05870b91..a05870b91 100644 --- a/libs/bs4/tests/test_lxml.py +++ b/libs2/bs4/tests/test_lxml.py
diff --git a/libs/bs4/tests/test_soup.py b/libs2/bs4/tests/test_soup.py index f3e69edf3..f3e69edf3 100644 --- a/libs/bs4/tests/test_soup.py +++ b/libs2/bs4/tests/test_soup.py
diff --git a/libs/bs4/tests/test_tree.py b/libs2/bs4/tests/test_tree.py index c0e7c4080..c0e7c4080 100644 --- a/libs/bs4/tests/test_tree.py +++ b/libs2/bs4/tests/test_tree.py
diff --git a/libs/concurrent/__init__.py b/libs2/concurrent/__init__.py index b36383a61..b36383a61 100644 --- a/libs/concurrent/__init__.py +++ b/libs2/concurrent/__init__.py
diff --git a/libs/concurrent/futures/__init__.py b/libs2/concurrent/futures/__init__.py index 428b14bdf..428b14bdf 100644 --- a/libs/concurrent/futures/__init__.py +++ b/libs2/concurrent/futures/__init__.py
diff --git a/libs/concurrent/futures/_base.py b/libs2/concurrent/futures/_base.py index 2936c46b1..2936c46b1 100644 --- a/libs/concurrent/futures/_base.py +++ b/libs2/concurrent/futures/_base.py
diff --git a/libs/concurrent/futures/process.py b/libs2/concurrent/futures/process.py index 72528410c..72528410c 100644 --- a/libs/concurrent/futures/process.py +++ b/libs2/concurrent/futures/process.py
diff --git a/libs/concurrent/futures/thread.py b/libs2/concurrent/futures/thread.py index 85ab4b743..85ab4b743 100644 --- a/libs/concurrent/futures/thread.py +++ b/libs2/concurrent/futures/thread.py
diff --git a/libs/enum/LICENSE b/libs2/enum/LICENSE index 9003b8850..9003b8850 100644 --- a/libs/enum/LICENSE +++ b/libs2/enum/LICENSE
diff --git a/libs/enum/README b/libs2/enum/README index
aa2333d8d..aa2333d8d 100644 --- a/libs/enum/README +++ b/libs2/enum/README diff --git a/libs/enum/__init__.py b/libs2/enum/__init__.py index d6ffb3a40..d6ffb3a40 100644 --- a/libs/enum/__init__.py +++ b/libs2/enum/__init__.py diff --git a/libs/enum/doc/enum.pdf b/libs2/enum/doc/enum.pdf index 3fb6ec264..3fb6ec264 100644 --- a/libs/enum/doc/enum.pdf +++ b/libs2/enum/doc/enum.pdf diff --git a/libs/enum/doc/enum.rst b/libs2/enum/doc/enum.rst index 3afc23821..3afc23821 100644 --- a/libs/enum/doc/enum.rst +++ b/libs2/enum/doc/enum.rst diff --git a/libs/enum/test.py b/libs2/enum/test.py index d9edfaee4..d9edfaee4 100644 --- a/libs/enum/test.py +++ b/libs2/enum/test.py diff --git a/libs/yaml/__init__.py b/libs2/yaml/__init__.py index e7a419dd2..e7a419dd2 100644 --- a/libs/yaml/__init__.py +++ b/libs2/yaml/__init__.py diff --git a/libs/yaml/composer.py b/libs2/yaml/composer.py index df85ef653..df85ef653 100644 --- a/libs/yaml/composer.py +++ b/libs2/yaml/composer.py diff --git a/libs/yaml/constructor.py b/libs2/yaml/constructor.py index 516dad1ce..516dad1ce 100644 --- a/libs/yaml/constructor.py +++ b/libs2/yaml/constructor.py diff --git a/libs/yaml/cyaml.py b/libs2/yaml/cyaml.py index ebb895935..ebb895935 100644 --- a/libs/yaml/cyaml.py +++ b/libs2/yaml/cyaml.py diff --git a/libs/yaml/dumper.py b/libs2/yaml/dumper.py index f9cd49fda..f9cd49fda 100644 --- a/libs/yaml/dumper.py +++ b/libs2/yaml/dumper.py diff --git a/libs/yaml/emitter.py b/libs2/yaml/emitter.py index 9561a8274..9561a8274 100644 --- a/libs/yaml/emitter.py +++ b/libs2/yaml/emitter.py diff --git a/libs/yaml/error.py b/libs2/yaml/error.py index 577686db5..577686db5 100644 --- a/libs/yaml/error.py +++ b/libs2/yaml/error.py diff --git a/libs/yaml/events.py b/libs2/yaml/events.py index f79ad389c..f79ad389c 100644 --- a/libs/yaml/events.py +++ b/libs2/yaml/events.py diff --git a/libs/yaml/loader.py b/libs2/yaml/loader.py index a79182eaf..a79182eaf 100644 --- a/libs/yaml/loader.py +++ b/libs2/yaml/loader.py diff --git a/libs/yaml/nodes.py b/libs2/yaml/nodes.py index c4f070c41..c4f070c41 100644 --- a/libs/yaml/nodes.py +++ b/libs2/yaml/nodes.py diff --git a/libs/yaml/parser.py b/libs2/yaml/parser.py index f9e3057f3..f9e3057f3 100644 --- a/libs/yaml/parser.py +++ b/libs2/yaml/parser.py diff --git a/libs/yaml/reader.py b/libs2/yaml/reader.py index b2f10b091..b2f10b091 100644 --- a/libs/yaml/reader.py +++ b/libs2/yaml/reader.py diff --git a/libs/yaml/representer.py b/libs2/yaml/representer.py index 9dca41af7..9dca41af7 100644 --- a/libs/yaml/representer.py +++ b/libs2/yaml/representer.py diff --git a/libs/yaml/resolver.py b/libs2/yaml/resolver.py index 528fbc0ea..528fbc0ea 100644 --- a/libs/yaml/resolver.py +++ b/libs2/yaml/resolver.py diff --git a/libs/yaml/scanner.py b/libs2/yaml/scanner.py index 5126cf07b..5126cf07b 100644 --- a/libs/yaml/scanner.py +++ b/libs2/yaml/scanner.py diff --git a/libs/yaml/serializer.py b/libs2/yaml/serializer.py index 0bf1e96dc..0bf1e96dc 100644 --- a/libs/yaml/serializer.py +++ b/libs2/yaml/serializer.py diff --git a/libs/yaml/tokens.py b/libs2/yaml/tokens.py index 4d0b48a39..4d0b48a39 100644 --- a/libs/yaml/tokens.py +++ b/libs2/yaml/tokens.py diff --git a/libs3/bs4/__init__.py b/libs3/bs4/__init__.py new file mode 100644 index 000000000..95ca229c1 --- /dev/null +++ b/libs3/bs4/__init__.py @@ -0,0 +1,616 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document 
into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + +""" + +__author__ = "Leonard Richardson ([email protected])" +__version__ = "4.8.0" +__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import os +import re +import sys +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's <br> tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. 
+ + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + + """ + + if 'convertEntities' in kwargs: + del kwargs['convertEntities'] + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, str): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. 
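A sketch of the renaming shim above in action; the old Beautiful Soup 3 keyword still works but warns (assuming this bs4 package is importable):

    import warnings
    from bs4 import BeautifulSoup

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        BeautifulSoup("<a>x</a>", "html.parser", fromEncoding="utf-8")
    print(caught[0].message)
    # The "fromEncoding" argument to the BeautifulSoup constructor
    # has been renamed to "from_encoding."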
+ builder_class = builder + builder = None + elif builder is None: + if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self._namespaces = dict() + self.parse_only = parse_only + + self.builder.initialize_soup(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, str) and not '<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. + if (isinstance(markup, str) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception as e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + if isinstance(markup, str): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a filename, not markup. You should' + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' 
% markup) + self._check_markup_is_url(markup) + + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, str): + space = ' ' + cant_start_with = ("http:", "https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. 
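The URL guard above in practice: short, single-token markup that starts with http: or https: triggers a warning instead of being silently parsed as a one-string document:

    from bs4 import BeautifulSoup

    BeautifulSoup("http://example.com/page", "html.parser")
    # UserWarning: "http://example.com/page" looks like a URL. Beautiful Soup
    # is not an HTTP client. ...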
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.current_data = [] + self.currentTag = None + self.tagStack = [] + self.preserve_whitespace_tag_stack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): + """Create a new tag associated with this soup.""" + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + return subclass(s) + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag is not None: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + + def endData(self, containerClass=NavigableString): + if self.current_data: + current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if previous_element is None: + previous_element = o.previous_element + + fix = parent.next_element is not None + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + # Check if we are inserting into an already parsed node. 
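The whitespace handling in endData() collapses all-space strings except inside whitespace-preserving tags; for instance, assuming this package is importable:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>   </p><pre>   </pre>", "html.parser")
    print(repr(soup.p.string))    # ' '    (collapsed to a single space)
    print(repr(soup.pre.string))  # '   '  (pre preserves whitespace)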
+ if fix:
+ self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el):
+ """Make sure linkage of this fragment is sound."""
+
+ first = el.contents[0]
+ child = el.contents[-1]
+ descendant = child
+
+ if child is first and el.parent is not None:
+ # Parent should be linked to first child
+ el.next_element = child
+ # We are no longer linked to whatever this element is
+ prev_el = child.previous_element
+ if prev_el is not None and prev_el is not el:
+ prev_el.next_element = None
+ # First child should be linked to the parent, and no previous siblings.
+ child.previous_element = el
+ child.previous_sibling = None
+
+ # We have no sibling as we've been appended as the last.
+ child.next_sibling = None
+
+ # This index is a tag, dig deeper for a "last descendant"
+ if isinstance(child, Tag) and child.contents:
+ descendant = child._last_descendant(False)
+
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling. It should have no next sibling.
+ descendant.next_element = None
+ descendant.next_sibling = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ descendant.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
+
+ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+ stack up to but *not* including the most recent instance of
+ the given tag."""
+ #print "Popping to %s" % name
+ if name == self.ROOT_TAG_NAME:
+ # The BeautifulSoup object itself can never be popped.
+ return
+
+ most_recently_popped = None
+
+ stack_size = len(self.tagStack)
+ for i in range(stack_size - 1, 0, -1):
+ t = self.tagStack[i]
+ if (name == t.name and nsprefix == t.prefix):
+ if inclusivePop:
+ most_recently_popped = self.popTag()
+ break
+ most_recently_popped = self.popTag()
+
+ return most_recently_popped
+
+ def handle_starttag(self, name, namespace, nsprefix, attrs):
+ """Push a start tag on to the stack.
+
+ If this method returns None, the tag was rejected by the
+ SoupStrainer. You should proceed as if the tag had not occurred
+ in the document. For instance, if this was a self-closing tag,
+ don't call handle_endtag.
+ """
+
+ # print "Start tag %s: %s" % (name, attrs)
+ self.endData()
+
+ if (self.parse_only and len(self.tagStack) <= 1
+ and (self.parse_only.text
+ or not self.parse_only.search_tag(name, attrs))):
+ return None
+
+ tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self._most_recent_element)
+ if tag is None:
+ return tag
+ if self._most_recent_element is not None:
+ self._most_recent_element.next_element = tag
+ self._most_recent_element = tag
+ self.pushTag(tag)
+ return tag
+
+ def handle_endtag(self, name, nsprefix=None):
+ #print "End tag: " + name
+ self.endData()
+ self._popToTag(name, nsprefix)
+
+ def handle_data(self, data):
+ self.current_data.append(data)
+
+ def decode(self, pretty_print=False,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ formatter="minimal"):
+ """Returns a string or Unicode representation of this document.
+ To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = '<?xml version="1.0"%s?>\n' % encoding_part + else: + prefix = '' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print(soup.prettify()) diff --git a/libs3/bs4/builder/__init__.py b/libs3/bs4/builder/__init__.py new file mode 100644 index 000000000..cc497cf0b --- /dev/null +++ b/libs3/bs4/builder/__init__.py @@ -0,0 +1,367 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + nonwhitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. 
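The registry's lookup() keeps the most recently registered builder that matches every requested feature. A self-contained sketch with stand-in builder classes (the class names are hypothetical):

    registry = TreeBuilderRegistry()

    class FastBuilder(object):
        features = ["html", "fast"]

    class StrictBuilder(object):
        features = ["html", "strict"]

    registry.register(FastBuilder)
    registry.register(StrictBuilder)

    assert registry.lookup("html", "fast") is FastBuilder
    assert registry.lookup() is StrictBuilder  # newest registration wins
    assert registry.lookup("xml") is None      # no builder advertises 'xml'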
+builder_registry = TreeBuilderRegistry()
+
+class TreeBuilder(object):
+ """Turn a document into a Beautiful Soup object tree."""
+
+ NAME = "[Unknown tree builder]"
+ ALTERNATE_NAMES = []
+ features = []
+
+ is_xml = False
+ picklable = False
+ empty_element_tags = None # A tag will be considered an empty-element
+ # tag when and only when it has no contents.
+
+ # A value for these tag/attribute combinations is a space- or
+ # comma-separated list of CDATA, rather than a single CDATA.
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ USE_DEFAULT = object()
+
+ def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+ """Constructor.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
+
+ :param preserve_whitespace_tags:
+ """
+ self.soup = None
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
+ def reset(self):
+ pass
+
+ def can_be_empty_element(self, tag_name):
+ """Might a tag with this name be an empty-element tag?
+
+ The final markup may or may not actually present this tag as
+ self-closing.
+
+ For instance: an HTMLBuilder does not consider a <p> tag to be
+ an empty-element tag (it's not in
+ HTMLBuilder.empty_element_tags). This means an empty <p> tag
+ will be presented as "<p></p>", not "<p />".
+
+ The default implementation has no opinion about which tags are
+ empty-element tags, so a tag will be presented as an
+ empty-element tag if and only if it has no contents.
+ "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+ be left alone.
+ """
+ if self.empty_element_tags is None:
+ return True
+ return tag_name in self.empty_element_tags
+
+ def feed(self, markup):
+ raise NotImplementedError()
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ return markup, None, None, False
+
+ def test_fragment_to_document(self, fragment):
+ """Wrap an HTML fragment to make it look like a document.
+
+ Different parsers do this differently. For instance, lxml
+ introduces an empty <head> tag, and html5lib
+ doesn't. Abstracting this away lets us write simple tests
+ which run HTML fragments through the parser and compare the
+ results against other HTML fragments.
+
+ This method should not be used outside of tests.
+ """
+ return fragment
+
+ def set_up_substitutions(self, tag):
+ return False
+
+ def _replace_cdata_list_attribute_values(self, tag_name, attrs):
+ """Replaces class="foo bar" with class=["foo", "bar"]
+
+ Modifies its input in place.
+ """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, str): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. + values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. + """ + + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' + ]) + + # The HTML standard defines these as block-level elements. Beautiful + # Soup does not treat these elements differently from other elements, + # but it may do so eventually, and this information is available if + # you need to use it. + block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. 
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {
+ "*" : ['class', 'accesskey', 'dropzone'],
+ "a" : ['rel', 'rev'],
+ "link" : ['rel', 'rev'],
+ "td" : ["headers"],
+ "th" : ["headers"],
+ "form" : ["accept-charset"],
+ "object" : ["archive"],
+
+ # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+ "area" : ["rel"],
+ "icon" : ["sizes"],
+ "iframe" : ["sandbox"],
+ "output" : ["for"],
+ }
+
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
+ def set_up_substitutions(self, tag):
+ # We are only interested in <meta> tags
+ if tag.name != 'meta':
+ return False
+
+ http_equiv = tag.get('http-equiv')
+ content = tag.get('content')
+ charset = tag.get('charset')
+
+ # We are interested in <meta> tags that say what encoding the
+ # document was originally in. This means HTML 5-style <meta>
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style <meta> tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a standin object that can take on any
+ # encoding.
+ meta_encoding = None
+ if charset is not None:
+ # HTML 5 style:
+ # <meta charset="utf8">
+ meta_encoding = charset
+ tag['charset'] = CharsetMetaAttributeValue(charset)
+
+ elif (content is not None and http_equiv is not None
+ and http_equiv.lower() == 'content-type'):
+ # HTML 4 style:
+ # <meta http-equiv="content-type" content="text/html; charset=utf8">
+ tag['content'] = ContentMetaAttributeValue(content)
+
+ return (meta_encoding is not None)
+
+def register_treebuilders_from(module):
+ """Copy TreeBuilders from the given module into this module."""
+ # I'm fairly sure this is not the best way to do this.
+ this_module = sys.modules['bs4.builder']
+ for name in module.__all__:
+ obj = getattr(module, name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ # Register the builder while we're at it.
+ this_module.builder_registry.register(obj)
+
+class ParserRejectedMarkup(Exception):
+ pass
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last resort.
+from . import _htmlparser
+register_treebuilders_from(_htmlparser)
+try:
+ from . import _html5lib
+ register_treebuilders_from(_html5lib)
+except ImportError:
+ # They don't have html5lib installed.
+ pass
+try:
+ from . import _lxml
+ register_treebuilders_from(_lxml)
+except ImportError:
+ # They don't have lxml installed.
+ pass
diff --git a/libs3/bs4/builder/_html5lib.py b/libs3/bs4/builder/_html5lib.py
new file mode 100644
index 000000000..090bb61a8
--- /dev/null
+++ b/libs3/bs4/builder/_html5lib.py
@@ -0,0 +1,426 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT" + +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +import re +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + nonwhitespace_re, +) +import html5lib +from html5lib.constants import ( + namespaces, + prefixes, + ) +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError as e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + + extra_kwargs = dict() + if not isinstance(markup, str): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, str): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<html><head></head><body>%s</body></html>' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) + else: + rv.append("|%s<!DOCTYPE >" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its 
value into a list. + list_attr = self.element.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = nonwhitespace_re.findall(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. + child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + node.parent = self + else: + child = node.element + node.parent = self + + if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + def setAttributes(self, attributes): + + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. 
+ # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. + new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. 
+ element.contents = [] + element.next_element = final_next_element + + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/libs3/bs4/builder/_htmlparser.py b/libs3/bs4/builder/_htmlparser.py new file mode 100644 index 000000000..ea549c356 --- /dev/null +++ b/libs3/bs4/builder/_htmlparser.py @@ -0,0 +1,350 @@ +# encoding: utf-8 +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from html.parser import HTMLParser + +try: + from html.parser import HTMLParseError +except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. +major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # <tag/>. 
+
+ # is_startend() tells handle_starttag not to close the tag
+ # just because its name matches a known empty-element tag. We
+ # know that this is an empty-element tag and we want to call
+ # handle_endtag ourselves.
+ tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+ self.handle_endtag(name)
+
+ def handle_starttag(self, name, attrs, handle_empty_element=True):
+ # XXX namespace
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ attrvalue = '""'
+ #print "START", name
+ tag = self.soup.handle_starttag(name, None, None, attr_dict)
+ if tag and tag.is_empty_element and handle_empty_element:
+ # Unlike other parsers, html.parser doesn't send separate end tag
+ # events for empty-element tags. (It's handled in
+ # handle_startendtag, but only if the original markup looked like
+ # <tag/>.)
+ #
+ # So we need to call handle_endtag() ourselves. Since we
+ # know the start event is identical to the end event, we
+ # don't want handle_endtag() to cross off any previous end
+ # events for tags of this name.
+ self.handle_endtag(name, check_already_closed=False)
+
+ # But we might encounter an explicit closing tag for this tag
+ # later on. If so, we want to ignore it.
+ self.already_closed_empty_element.append(name)
+
+ def handle_endtag(self, name, check_already_closed=True):
+ #print "END", name
+ if check_already_closed and name in self.already_closed_empty_element:
+ # This is a redundant end tag for an empty-element tag.
+ # We've already called handle_endtag() for it, so just
+ # check it off the list.
+ # print "ALREADY CLOSED", name
+ self.already_closed_empty_element.remove(name)
+ else:
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ # XXX workaround for a bug in HTMLParser. Remove this once
+ # it's fixed in all supported versions.
+ # http://bugs.python.org/issue13633
+ if name.startswith('x'):
+ real_name = int(name.lstrip('x'), 16)
+ elif name.startswith('X'):
+ real_name = int(name.lstrip('X'), 16)
+ else:
+ real_name = int(name)
+
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError as e:
+ pass
+ if not data:
+ try:
+ data = chr(real_name)
+ except (ValueError, OverflowError) as e:
+ pass
+ data = data or "\N{REPLACEMENT CHARACTER}"
+ self.handle_data(data)
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was a character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
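+ # Illustrative: parsing '<p>&amp; &foo</p>' therefore produces the
+ # text '& &foo' -- the known entity is decoded, while the unknown
+ # one passes through as literal text.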
+ data = "&%s" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. "<!DOCTYPE>" + data = '' + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + parser_kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, str): + yield (markup, None, None, False) + return + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + parser.close() + except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + parser.already_closed_empty_element = [] + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like <p></p> as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? 
+ ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: <span attr="value" /> + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/libs3/bs4/builder/_lxml.py b/libs3/bs4/builder/_lxml.py new file mode 100644 index 000000000..a490e2301 --- /dev/null +++ b/libs3/bs4/builder/_lxml.py @@ -0,0 +1,296 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + +from io import BytesIO +from io import StringIO +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in list(d.items())) + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. + features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. 
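+ # (Inverted, the mapping runs from namespace URL back to prefix,
+ # which is the shape _prefix_for_namespace() searches; e.g.
+ # _invert(DEFAULT_NSMAPS) == {'http://www.w3.org/XML/1998/namespace': 'xml'}.)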
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value + + def default_parser(self, encoding): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser + + def __init__(self, parser=None, empty_element_tags=None, **kwargs): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """ + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. + """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. 
+ yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. 
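+ # (Illustrative: for <root xmlns:a="u"><a:child/></root>, the stack
+ # grows from [default] to [default, {'u': 'a'}] at <root>, gains a
+ # None placeholder at <a:child>, and each end() pops one entry.)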
+ self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '<html><body>%s</body></html>' % fragment diff --git a/libs3/bs4/dammit.py b/libs3/bs4/dammit.py new file mode 100644 index 000000000..c7ac4d431 --- /dev/null +++ b/libs3/bs4/dammit.py @@ -0,0 +1,850 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import codecs +from html.entities import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. 
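+ # The net effect, for illustration: CHARACTER_TO_HTML_ENTITY maps
+ # '\xe9' to 'eacute', HTML_ENTITY_TO_CHARACTER maps 'eacute' back
+ # to '\xe9', and the compiled regex matches any character that has
+ # a named entity (straight quotes excluded).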
+ extra = [(39, 'apos')]
+ for codepoint, name in list(codepoint2name.items()) + extra:
+ character = chr(codepoint)
+ if codepoint not in (34, 39):
+ # There's no point in turning the quotation mark into
+ # &quot; or the single quote into &apos;, unless it
+ # happens within an attribute value, which is handled
+ # elsewhere.
+ characters_for_re.append(character)
+ lookup[character] = name
+ # But we do want to recognize those entities on the way in and
+ # convert them to Unicode characters.
+ reverse_lookup[name] = character
+ re_definition = "[%s]" % "".join(characters_for_re)
+ return lookup, reverse_lookup, re.compile(re_definition)
+ (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+ CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
+
+ CHARACTER_TO_XML_ENTITY = {
+ "'": "apos",
+ '"': "quot",
+ "&": "amp",
+ "<": "lt",
+ ">": "gt",
+ }
+
+ BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
+ ")")
+
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
+ @classmethod
+ def _substitute_html_entity(cls, matchobj):
+ entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+ return "&%s;" % entity
+
+ @classmethod
+ def _substitute_xml_entity(cls, matchobj):
+ """Used with a regular expression to substitute the
+ appropriate XML entity for an XML special character."""
+ entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+ return "&%s;" % entity
+
+ @classmethod
+ def quoted_attribute_value(self, value):
+ """Make a value into a quoted XML attribute, possibly escaping it.
+
+ Most strings will be quoted using double quotes.
+
+ Bob's Bar -> "Bob's Bar"
+
+ If a string contains double quotes, it will be quoted using
+ single quotes.
+
+ Welcome to "my bar" -> 'Welcome to "my bar"'
+
+ If a string contains both single and double quotes, the
+ double quotes will be escaped, and the string will be quoted
+ using double quotes.
+
+ Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
+ """
+ quote_with = '"'
+ if '"' in value:
+ if "'" in value:
+ # The string contains both single and double
+ # quotes. Turn the double quotes into
+ # entities. We quote the double quotes rather than
+ # the single quotes because the entity name is
+ # "&quot;" whether this is HTML or XML. If we
+ # quoted the single quotes, we'd have to decide
+ # between &apos; and &squot;.
+ replace_with = "&quot;"
+ value = value.replace('"', replace_with)
+ else:
+ # There are double quotes but no single quotes.
+ # We can use single quotes to quote the attribute.
+ quote_with = "'"
+ return quote_with + value + quote_with
+
+ @classmethod
+ def substitute_xml(cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
+ :param value: A string to be substituted. The less-than sign
+ will become &lt;, the greater-than sign will become &gt;,
+ and any ampersands will become &amp;. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
+ :param value: A string to be substituted.
The less-than sign will
+ become &lt;, the greater-than sign will become &gt;, and any
+ ampersands that are not part of an entity definition will
+ become &amp;.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets, and ampersands that aren't part of
+ # entities.
+ value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
+ @classmethod
+ def substitute_html(cls, s):
+ """Replace certain Unicode characters with named HTML entities.
+
+ This differs from data.encode(encoding, 'xmlcharrefreplace')
+ in that the goal is to make the result more readable (to those
+ with ASCII displays) rather than to recover from
+ errors. There's absolutely nothing wrong with a UTF-8 string
+ containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+ character with "&eacute;" will make it more readable to some
+ people.
+ """
+ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+ cls._substitute_html_entity, s)
+
+
+class EncodingDetector:
+ """Suggests a number of possible encodings for a bytestring.
+
+ Order of precedence:
+
+ 1. Encodings you specifically tell EncodingDetector to try first
+ (the override_encodings argument to the constructor).
+
+ 2. An encoding declared within the bytestring itself, either in an
+ XML declaration (if the bytestring is to be interpreted as an XML
+ document), or in a <meta> tag (if the bytestring is to be
+ interpreted as an HTML document.)
+
+ 3. An encoding detected through textual analysis by chardet,
+ cchardet, or a similar external library.
+
+ 4. UTF-8.
+
+ 5. Windows-1252.
+ """
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
+ self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
+ self.chardet_encoding = None
+ self.is_html = is_html
+ self.declared_encoding = None
+
+ # First order of business: strip a byte-order mark.
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+ def _usable(self, encoding, tried):
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
+ return False
+
+ @property
+ def encodings(self):
+ """Yield a number of encodings that might work for this markup."""
+ tried = set()
+ for e in self.override_encodings:
+ if self._usable(e, tried):
+ yield e
+
+ # Did the document originally start with a byte-order mark
+ # that indicated its encoding?
+ if self._usable(self.sniffed_encoding, tried):
+ yield self.sniffed_encoding
+
+ # Look within the document for an XML or HTML encoding
+ # declaration.
+ if self.declared_encoding is None:
+ self.declared_encoding = self.find_declared_encoding(
+ self.markup, self.is_html)
+ if self._usable(self.declared_encoding, tried):
+ yield self.declared_encoding
+
+ # Use third-party character set detection to guess at the
+ # encoding.
+ if self.chardet_encoding is None:
+ self.chardet_encoding = chardet_dammit(self.markup)
+ if self._usable(self.chardet_encoding, tried):
+ yield self.chardet_encoding
+
+ # As a last-ditch effort, try utf-8 and windows-1252.
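+ # (Illustrative: for b'\xff\xfe<a></a>' with no overrides, this
+ # property yields 'utf-16le' from the BOM sniff, then chardet's
+ # guess if one is available, then 'utf-8' and 'windows-1252'.)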
+ for e in ('utf-8', 'windows-1252'):
+ if self._usable(e, tried):
+ yield e
+
+ @classmethod
+ def strip_byte_order_mark(cls, data):
+ """If a byte-order mark is present, strip it and return the encoding it implies."""
+ encoding = None
+ if isinstance(data, str):
+ # Unicode data cannot have a byte-order mark.
+ return data, encoding
+ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+ and (data[2:4] != b'\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+ and (data[2:4] != b'\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == b'\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == b'\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == b'\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ return data, encoding
+
+ @classmethod
+ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+ """Given a document, tries to find its declared encoding.
+
+ An XML encoding is declared at the beginning of the document.
+
+ An HTML encoding is declared in a <meta> tag, hopefully near the
+ beginning of the document.
+ """
+ if search_entire_document:
+ xml_endpos = html_endpos = len(markup)
+ else:
+ xml_endpos = 1024
+ html_endpos = max(2048, int(len(markup) * 0.05))
+
+ declared_encoding = None
+ declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ if not declared_encoding_match and is_html:
+ declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ if declared_encoding_match is not None:
+ declared_encoding = declared_encoding_match.groups()[0].decode(
+ 'ascii', 'replace')
+ if declared_encoding:
+ return declared_encoding.lower()
+ return None
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
+
+ ENCODINGS_WITH_SMART_QUOTES = [
+ "windows-1252",
+ "iso-8859-1",
+ "iso-8859-2",
+ ]
+
+ def __init__(self, markup, override_encodings=[],
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+ self.smart_quotes_to = smart_quotes_to
+ self.tried_encodings = []
+ self.contains_replacement_characters = False
+ self.is_html = is_html
+ self.log = logging.getLogger(__name__)
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
+
+ # Short-circuit if the data is in Unicode to begin with.
+ if isinstance(markup, str) or markup == '':
+ self.markup = markup
+ self.unicode_markup = str(markup)
+ self.original_encoding = None
+ return
+
+ # The encoding detector may have stripped a byte-order mark.
+ # Use the stripped markup from this point on.
+ self.markup = self.detector.markup
+
+ u = None
+ for encoding in self.detector.encodings:
+ markup = self.detector.markup
+ u = self._convert_from(encoding)
+ if u is not None:
+ break
+
+ if not u:
+ # None of the encodings worked. As an absolute last resort,
+ # try them again with character replacement.
+ + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + self.log.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER." + ) + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + return str(data, encoding, errors) + + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
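+ # (Illustrative: with smart_quotes_to='html', the Windows-1252
+ # bytes b'\x93hi\x94' come out as '&ldquo;hi&rdquo;'; with
+ # smart_quotes_to='xml', as '&#x201C;hi&#x201D;'.)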
+ MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. 
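+ # (Illustrative: UnicodeDammit.detwingle() below uses this to rewrite
+ # a stray \x93 embedded in UTF-8 text as b'\xe2\x80\x9c', the UTF-8
+ # encoding of the same LEFT DOUBLE QUOTATION MARK.)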
+    # A map used when removing rogue Windows-1252/ISO-8859-1
+    # characters in otherwise UTF-8 documents.
+    #
+    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
+    # Windows-1252.
+    WINDOWS_1252_TO_UTF8 = {
+        0x80 : b'\xe2\x82\xac', # €
+        0x82 : b'\xe2\x80\x9a', # ‚
+        0x83 : b'\xc6\x92',     # ƒ
+        0x84 : b'\xe2\x80\x9e', # „
+        0x85 : b'\xe2\x80\xa6', # …
+        0x86 : b'\xe2\x80\xa0', # †
+        0x87 : b'\xe2\x80\xa1', # ‡
+        0x88 : b'\xcb\x86',     # ˆ
+        0x89 : b'\xe2\x80\xb0', # ‰
+        0x8a : b'\xc5\xa0',     # Š
+        0x8b : b'\xe2\x80\xb9', # ‹
+        0x8c : b'\xc5\x92',     # Œ
+        0x8e : b'\xc5\xbd',     # Ž
+        0x91 : b'\xe2\x80\x98', # ‘
+        0x92 : b'\xe2\x80\x99', # ’
+        0x93 : b'\xe2\x80\x9c', # “
+        0x94 : b'\xe2\x80\x9d', # ”
+        0x95 : b'\xe2\x80\xa2', # •
+        0x96 : b'\xe2\x80\x93', # –
+        0x97 : b'\xe2\x80\x94', # —
+        0x98 : b'\xcb\x9c',     # ˜
+        0x99 : b'\xe2\x84\xa2', # ™
+        0x9a : b'\xc5\xa1',     # š
+        0x9b : b'\xe2\x80\xba', # ›
+        0x9c : b'\xc5\x93',     # œ
+        0x9e : b'\xc5\xbe',     # ž
+        0x9f : b'\xc5\xb8',     # Ÿ
+        0xa0 : b'\xc2\xa0',     # (no-break space)
+        0xa1 : b'\xc2\xa1',     # ¡
+        0xa2 : b'\xc2\xa2',     # ¢
+        0xa3 : b'\xc2\xa3',     # £
+        0xa4 : b'\xc2\xa4',     # ¤
+        0xa5 : b'\xc2\xa5',     # ¥
+        0xa6 : b'\xc2\xa6',     # ¦
+        0xa7 : b'\xc2\xa7',     # §
+        0xa8 : b'\xc2\xa8',     # ¨
+        0xa9 : b'\xc2\xa9',     # ©
+        0xaa : b'\xc2\xaa',     # ª
+        0xab : b'\xc2\xab',     # «
+        0xac : b'\xc2\xac',     # ¬
+        0xad : b'\xc2\xad',     # (soft hyphen)
+        0xae : b'\xc2\xae',     # ®
+        0xaf : b'\xc2\xaf',     # ¯
+        0xb0 : b'\xc2\xb0',     # °
+        0xb1 : b'\xc2\xb1',     # ±
+        0xb2 : b'\xc2\xb2',     # ²
+        0xb3 : b'\xc2\xb3',     # ³
+        0xb4 : b'\xc2\xb4',     # ´
+        0xb5 : b'\xc2\xb5',     # µ
+        0xb6 : b'\xc2\xb6',     # ¶
+        0xb7 : b'\xc2\xb7',     # ·
+        0xb8 : b'\xc2\xb8',     # ¸
+        0xb9 : b'\xc2\xb9',     # ¹
+        0xba : b'\xc2\xba',     # º
+        0xbb : b'\xc2\xbb',     # »
+        0xbc : b'\xc2\xbc',     # ¼
+        0xbd : b'\xc2\xbd',     # ½
+        0xbe : b'\xc2\xbe',     # ¾
+        0xbf : b'\xc2\xbf',     # ¿
+        0xc0 : b'\xc3\x80',     # À
+        0xc1 : b'\xc3\x81',     # Á
+        0xc2 : b'\xc3\x82',     # Â
+        0xc3 : b'\xc3\x83',     # Ã
+        0xc4 : b'\xc3\x84',     # Ä
+        0xc5 : b'\xc3\x85',     # Å
+        0xc6 : b'\xc3\x86',     # Æ
+        0xc7 : b'\xc3\x87',     # Ç
+        0xc8 : b'\xc3\x88',     # È
+        0xc9 : b'\xc3\x89',     # É
+        0xca : b'\xc3\x8a',     # Ê
+        0xcb : b'\xc3\x8b',     # Ë
+        0xcc : b'\xc3\x8c',     # Ì
+        0xcd : b'\xc3\x8d',     # Í
+        0xce : b'\xc3\x8e',     # Î
+        0xcf : b'\xc3\x8f',     # Ï
+        0xd0 : b'\xc3\x90',     # Ð
+        0xd1 : b'\xc3\x91',     # Ñ
+        0xd2 : b'\xc3\x92',     # Ò
+        0xd3 : b'\xc3\x93',     # Ó
+        0xd4 : b'\xc3\x94',     # Ô
+        0xd5 : b'\xc3\x95',     # Õ
+        0xd6 : b'\xc3\x96',     # Ö
+        0xd7 : b'\xc3\x97',     # ×
+        0xd8 : b'\xc3\x98',     # Ø
+        0xd9 : b'\xc3\x99',     # Ù
+        0xda : b'\xc3\x9a',     # Ú
+        0xdb : b'\xc3\x9b',     # Û
+        0xdc : b'\xc3\x9c',     # Ü
+        0xdd : b'\xc3\x9d',     # Ý
+        0xde : b'\xc3\x9e',     # Þ
+        0xdf : b'\xc3\x9f',     # ß
+        0xe0 : b'\xc3\xa0',     # à
+        0xe1 : b'\xc3\xa1',     # á
+        0xe2 : b'\xc3\xa2',     # â
+        0xe3 : b'\xc3\xa3',     # ã
+        0xe4 : b'\xc3\xa4',     # ä
+        0xe5 : b'\xc3\xa5',     # å
+        0xe6 : b'\xc3\xa6',     # æ
+        0xe7 : b'\xc3\xa7',     # ç
+        0xe8 : b'\xc3\xa8',     # è
+        0xe9 : b'\xc3\xa9',     # é
+        0xea : b'\xc3\xaa',     # ê
+        0xeb : b'\xc3\xab',     # ë
+        0xec : b'\xc3\xac',     # ì
+        0xed : b'\xc3\xad',     # í
+        0xee : b'\xc3\xae',     # î
+        0xef : b'\xc3\xaf',     # ï
+        0xf0 : b'\xc3\xb0',     # ð
+        0xf1 : b'\xc3\xb1',     # ñ
+        0xf2 : b'\xc3\xb2',     # ò
+        0xf3 : b'\xc3\xb3',     # ó
+        0xf4 : b'\xc3\xb4',     # ô
+        0xf5 : b'\xc3\xb5',     # õ
+        0xf6 : b'\xc3\xb6',     # ö
+        0xf7 : b'\xc3\xb7',     # ÷
+        0xf8 : b'\xc3\xb8',     # ø
+        0xf9 : b'\xc3\xb9',     # ù
+        0xfa : b'\xc3\xba',     # ú
+        0xfb : b'\xc3\xbb',     # û
+        0xfc : b'\xc3\xbc',     # ü
+        0xfd : b'\xc3\xbd',     # ý
+        0xfe : b'\xc3\xbe',     # þ
+        }
+
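This is the table `detwingle()` (defined below) substitutes from when it finds stray Windows-1252 bytes inside otherwise valid UTF-8. A sketch of the intended repair (illustrative data, after the upstream documentation's snowmen example):

    from bs4.dammit import UnicodeDammit

    snowmen = ("\N{SNOWMAN}" * 3).encode("utf8")
    quote = ("\N{LEFT DOUBLE QUOTATION MARK}Hi!"
             "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
    doc = snowmen + quote            # mixed encodings: doc.decode("utf8") raises
    fixed = UnicodeDammit.detwingle(doc)
    fixed.decode("utf8")             # '☃☃☃“Hi!”'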
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
+    ]
+
+    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
+    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
+
+    @classmethod
+    def detwingle(cls, in_bytes, main_encoding="utf8",
+                  embedded_encoding="windows-1252"):
+        """Fix characters from one encoding embedded in some other encoding.
+
+        Currently the only situation supported is Windows-1252 (or its
+        subset ISO-8859-1), embedded in UTF-8.
+
+        The input must be a bytestring. If you've already converted
+        the document to Unicode, you're too late.
+
+        The output is a bytestring in which `embedded_encoding`
+        characters have been converted to their `main_encoding`
+        equivalents.
+        """
+        if embedded_encoding.replace('_', '-').lower() not in (
+            'windows-1252', 'windows_1252'):
+            raise NotImplementedError(
+                "Windows-1252 and ISO-8859-1 are the only currently supported "
+                "embedded encodings.")
+
+        if main_encoding.lower() not in ('utf8', 'utf-8'):
+            raise NotImplementedError(
+                "UTF-8 is the only currently supported main encoding.")
+
+        byte_chunks = []
+
+        chunk_start = 0
+        pos = 0
+        while pos < len(in_bytes):
+            byte = in_bytes[pos]
+            if not isinstance(byte, int):
+                # Python 2.x
+                byte = ord(byte)
+            if (byte >= cls.FIRST_MULTIBYTE_MARKER
+                and byte <= cls.LAST_MULTIBYTE_MARKER):
+                # This is the start of a UTF-8 multibyte character. Skip
+                # to the end.
+                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
+                    if byte >= start and byte <= end:
+                        pos += size
+                        break
+            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
+                # We found a Windows-1252 character!
+                # Save the string up to this point as a chunk.
+                byte_chunks.append(in_bytes[chunk_start:pos])
+
+                # Now translate the Windows-1252 character into UTF-8
+                # and add it as another, one-byte chunk.
+                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
+                pos += 1
+                chunk_start = pos
+            else:
+                # Go on to the next character.
+                pos += 1
+        if chunk_start == 0:
+            # The string is unchanged.
+            return in_bytes
+        else:
+            # Store the final chunk.
+            byte_chunks.append(in_bytes[chunk_start:])
+        return b''.join(byte_chunks)
+
diff --git a/libs3/bs4/diagnose.py b/libs3/bs4/diagnose.py
new file mode 100644
index 000000000..b5f6e6c8b
--- /dev/null
+++ b/libs3/bs4/diagnose.py
@@ -0,0 +1,224 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+import cProfile
+from io import StringIO
+from html.parser import HTMLParser
+import bs4
+from bs4 import BeautifulSoup, __version__
+from bs4.builder import builder_registry
+
+import os
+import pstats
+import random
+import tempfile
+import time
+import traceback
+import sys
+import cProfile
+
+def diagnose(data):
+    """Diagnostic suite for isolating common problems."""
+    print("Diagnostic running on Beautiful Soup %s" % __version__)
+    print("Python version %s" % sys.version)
+
+    basic_parsers = ["html.parser", "html5lib", "lxml"]
+    for name in basic_parsers:
+        for builder in builder_registry.builders:
+            if name in builder.features:
+                break
+        else:
+            basic_parsers.remove(name)
+            print((
+                "I noticed that %s is not installed. Installing it may help."
% + name)) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: + print ( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: + print ( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print() + + for parser in basic_parsers: + print("Trying to parse your markup with %s" % parser) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True + except Exception as e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) + + print("-" * 80) + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print(("%s, %4s, %s" % (event, element.tag, element.text))) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in list(range(length))) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. 
+ tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. + tag_name = random.choice(tag_names) + elements.append("</%s>" % tag_name) + return "<html>" + "\n".join(elements) + "</html>" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) + data = rdoc(num_elements) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception as e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print("Raw lxml parsed the markup in %.2fs." % (b-a)) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) + +def profile(num_elements=100000, parser="lxml"): + + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/libs3/bs4/element.py b/libs3/bs4/element.py new file mode 100644 index 000000000..f16b1663e --- /dev/null +++ b/libs3/bs4/element.py @@ -0,0 +1,1579 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable +import re +import sys +import warnings +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +class NamespacedAttribute(str): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = str.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = str.__new__(cls, name) + else: + obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(str): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. 
+ + When Beautiful Soup parses the markup '<meta charset="utf8">', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + <meta http-equiv="content-type" content="text/html; charset=utf8"> + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return str.__new__(str, original_value) + + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s, formatter): + """Format the given string using the given formatter.""" + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look up + an XMLFormatter or HTMLFormatter in the appropriate registry. + """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if callable(formatter): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. 
+ return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree.") + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract() + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + my_parent = self.parent + if self.parent is None: + raise ValueError( + "Cannot replace an element with its contents when that" + "element is not part of a tree.") + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap + replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent is not None: + del self.parent.contents[self.parent.index(self)] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if (self.previous_element is not None and + self.previous_element is not next_element): + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if (self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling): + self.previous_sibling.next_sibling = self.next_sibling + if (self.next_sibling is not None + and self.next_sibling is not self.previous_sibling): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + "Finds the last element beneath this object to be parsed." + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + # BS3: Not part of the API! 
+ _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + if new_child is None: + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, str) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return + position = min(position, len(self.contents)) + if hasattr(new_child, 'parent') and new_child.parent is not None: + # We're 'inserting' an element that's already one + # of this object's children. + if new_child.parent is self: + current_index = self.index(new_child) + if current_index < position: + # We're moving this element further down the list + # of this object's children. That means that when + # we extract this element, our target index will + # jump down one. + position -= 1 + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant(False) + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant(False) + + if position >= len(self.contents): + new_child.next_sibling = None + + parent = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. + new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def extend(self, tags): + """Appends the given tags to the contents of this tag.""" + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + The elements will have the same parent, and the given elements + will be immediately before this one. + """ + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. 
+ if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements + will be immediately after this one. + """ + # Do all error checking before modifying the tree. + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.previous_elements, + **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = 
find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1, **kwargs) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if text is None and 'string' in kwargs: + text = kwargs['string'] + del kwargs['string'] + + if isinstance(name, SoupStrainer): + strainer = name + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + + if text is None and not limit and not attrs and not kwargs: + if name is True or name is None: + # Optimization to find all tags. + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) + elif isinstance(name, str): + # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) + else: + prefix = None + local_name = name + result = (element for element in generator + if isinstance(element, Tag) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) + ) + return ResultSet(strainer, result) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. + @property + def next_elements(self): + i = self.next_element + while i is not None: + yield i + i = i.next_element + + @property + def next_siblings(self): + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + i = self.parent + while i is not None: + yield i + i = i.parent + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. 
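The BS3-era wrapper methods below simply return these generators; the properties themselves are what tree navigation iterates over. A sketch, assuming `soup` is any parsed BeautifulSoup tree containing a <b> tag:

    tag = soup.find("b")
    ancestors = [p.name for p in tag.parents]   # e.g. ['p', 'body', 'html', '[document]']
    following = next(tag.next_elements, None)   # next node in document order, or None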
+ def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + + +class NavigableString(str, PageElement): + + PREFIX = '' + SUFFIX = '' + + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + + known_xml = None + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, str): + u = str.__new__(cls, value) + else: + u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u + + def __copy__(self): + """A copy of a NavigableString has the same contents and class + as the original, but it is not connected to the parse tree. + """ + return type(self)(self) + + def __getnewargs__(self): + return (str(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) + + def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter.""" + output = self.format_string(self, formatter) + return self.PREFIX + output + self.SUFFIX + + @property + def name(self): + return None + + @name.setter + def name(self, name): + raise AttributeError("A NavigableString cannot be given a name.") + +class PreformattedString(NavigableString): + """A NavigableString not subject to the normal formatting rules. + + The string will be passed into the formatter (to trigger side effects), + but the return value will be ignored. + """ + + def output_ready(self, formatter=None): + """CData strings are passed into the formatter, purely + for any side effects. The return value is ignored. + """ + if formatter is not None: + ignore = self.format_string(self, formatter) + return self.PREFIX + self + self.SUFFIX + +class CData(PreformattedString): + + PREFIX = '<![CDATA[' + SUFFIX = ']]>' + +class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" + + PREFIX = '<?' + SUFFIX = '>' + +class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" + PREFIX = '<?' + SUFFIX = '?>' + +class Comment(PreformattedString): + + PREFIX = '<!--' + SUFFIX = '-->' + + +class Declaration(PreformattedString): + PREFIX = '<?' 
+ SUFFIX = '?>' + + +class Doctype(PreformattedString): + + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name or '' + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' "%s"' % system_id + elif system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + + PREFIX = '<!DOCTYPE ' + SUFFIX = '>\n' + + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def __init__(self, parser=None, builder=None, name=None, namespace=None, + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None): + "Basic constructor." + + if parser is None: + self.parser_class = None + else: + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected. + self.parser_class = parser.__class__ + if name is None: + raise ValueError("No value provided for new tag's name.") + self.name = name + self.namespace = namespace + self.prefix = prefix + if attrs is None: + attrs = {} + elif attrs: + if builder is not None and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) + else: + attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + if builder is None: + # In the absence of a TreeBuilder, assume this tag is nothing + # special. + self.can_be_empty_element = False + self.cdata_list_attributes = None + else: + # Set up any substitutions for this tag, such as the charset in a META tag. + builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. + self.can_be_empty_element = builder.can_be_empty_element(name) + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + parserClass = _alias("parser_class") # BS3 + + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)(None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. 
+ + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this tag. + + :Return: If this tag has a single string child, return value + is that string. If this tag has no children, or more than one + child, return value is None. If this tag has one child tag, + return value is the 'string' attribute of the child tag, + recursively. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False, types=(NavigableString, CData)): + """Yield all strings of certain classes, possibly stripping them. + + By default, yields only NavigableString and CData objects. So + no comments, processing instructions, etc. + """ + for descendant in self.descendants: + if ( + (types is None and not isinstance(descendant, NavigableString)) + or + (types is not None and type(descendant) not in types)): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + + strings = property(_all_strings) + + @property + def stripped_strings(self): + for string in self._all_strings(True): + yield string + + def get_text(self, separator="", strip=False, + types=(NavigableString, CData)): + """ + Get all child strings, concatenated using the given separator. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next_element + i.__dict__.clear() + i.contents = [] + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def smooth(self): + """Smooth out this element's children by consolidating consecutive strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. 
+ for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. + """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list.""" + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + + def has_attr(self, key): + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __bool__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag == "contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding="unicode-escape"): + """Renders this tag as a string.""" + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. 
a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. + return self.encode(encoding) + + def __unicode__(self): + return self.decode() + + def __str__(self): + if PY3K: + return self.decode() + else: + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. + u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + """ + + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, str): + val = str(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + str(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = formatter.void_element_close_prefix or '' + else: + closeTag = '</%s%s>' % (prefix, self.name) + + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) + if pretty_print: + space = indent_space + indent_contents = indent_level + 1 + else: + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter + ) + + if self.hidden: + # This is the 'document root' object. + s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. + s.append(indent_space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. 
+ s.append("\n") + s = ''.join(s) + return s + + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) + + def prettify(self, encoding=None, formatter="minimal"): + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. decode_contents() is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + """ + # First off, turn a string formatter into a Formatter object. This + # will stop the lookup from happening over and over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) + if text and indent_level and not preserve_whitespace: + text = text.strip() + if text: + if pretty_print and not preserve_whitespace: + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print and not preserve_whitespace: + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a bytestring. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The bytestring will be in this encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. + """ + + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. 
The + same is true of the tag name.""" + + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # CSS selector code + def select_one(self, selector, namespaces=None, **kwargs): + """Perform a CSS selection operation on the current element.""" + value = self.select(selector, namespaces, 1, **kwargs) + if value: + return value[0] + return None + + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. + + This uses the SoupSieve library. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param kwargs: Any extra arguments you'd like to pass in to + soupsieve.select(). + """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + def has_key(self, key): + """This was kind of misleading because has_key() (attributes) + was different from __in__ (contents). has_key() is gone in + Python 3, anyway.""" + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in list(attrs.items()): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. 
+ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, str)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. + return str(str(value)) + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + call_function_with_tag_data = ( + isinstance(self.name, Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, str): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against, already_tried=None): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. 
+ for item in markup:
+ if self._matches(item, match_against):
+ return True
+ # We didn't match any particular value of the multivalue
+ # attribute, but maybe we match the attribute value when
+ # considered as a string.
+ if self._matches(' '.join(markup), match_against):
+ return True
+ return False
+
+ if match_against is True:
+ # True matches any non-None value.
+ return markup is not None
+
+ if isinstance(match_against, Callable):
+ return match_against(markup)
+
+ # Custom callables take the tag as an argument, but all
+ # other ways of matching match the tag name as a string.
+ original_markup = markup
+ if isinstance(markup, Tag):
+ markup = markup.name
+
+ # Ensure that `markup` is either a Unicode string, or None.
+ markup = self._normalize_search_value(markup)
+
+ if markup is None:
+ # None matches None, False, an empty string, an empty list, and so on.
+ return not match_against
+
+ if (hasattr(match_against, '__iter__')
+ and not isinstance(match_against, str)):
+ # We're asked to match against an iterable of items.
+ # The markup must match at least one item in the
+ # iterable. We'll try each one in turn.
+ #
+ # To avoid infinite recursion we need to keep track of
+ # items we've already seen.
+ if not already_tried:
+ already_tried = set()
+ for item in match_against:
+ if item.__hash__:
+ key = item
+ else:
+ key = id(item)
+ if key in already_tried:
+ continue
+ else:
+ already_tried.add(key)
+ if self._matches(original_markup, item, already_tried):
+ return True
+ else:
+ return False
+
+ # Beyond this point we might need to run the test twice: once against
+ # the tag's name and once against its prefixed name.
+ match = False
+
+ if not match and isinstance(match_against, str):
+ # Exact string match
+ match = markup == match_against
+
+ if not match and hasattr(match_against, 'search'):
+ # Regexp match
+ return match_against.search(markup)
+
+ if (not match
+ and isinstance(original_markup, Tag)
+ and original_markup.prefix):
+ # Try the whole thing again with the prefixed tag name.
+ return self._matches(
+ original_markup.prefix + ':' + original_markup.name, match_against
+ )
+
+ return match
+
+
+class ResultSet(list):
+ """A ResultSet is just a list that keeps track of the SoupStrainer
+ that created it."""
+ def __init__(self, source, result=()):
+ super(ResultSet, self).__init__(result)
+ self.source = source
+
+ def __getattr__(self, key):
+ raise AttributeError(
+ "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+ )
diff --git a/libs3/bs4/formatter.py b/libs3/bs4/formatter.py
new file mode 100644
index 000000000..7dbaa3850
--- /dev/null
+++ b/libs3/bs4/formatter.py
@@ -0,0 +1,99 @@
+from bs4.dammit import EntitySubstitution
+
+class Formatter(EntitySubstitution):
+ """Describes a strategy to use when outputting a parse tree to a string.
+
+ Some parts of this strategy come from the distinction between
+ HTML4, HTML5, and XML. Others are configurable by the user.
+ """
+ # Registries of XML and HTML formatters.
+ XML_FORMATTERS = {}
+ HTML_FORMATTERS = {}
+
+ HTML = 'html'
+ XML = 'xml'
+
+ HTML_DEFAULTS = dict(
+ cdata_containing_tags=set(["script", "style"]),
+ )
+
+ def _default(self, language, value, kwarg):
+ if value is not None:
+ return value
+ if language == self.XML:
+ return set()
+ return self.HTML_DEFAULTS[kwarg]
+
+ def __init__(
+ self, language=None, entity_substitution=None,
+ void_element_close_prefix='/', cdata_containing_tags=None,
+ ):
+ """
+
+ :param void_element_close_prefix: By default, represent void
+ elements as <tag/> rather than <tag>
+ """
+ self.language = language
+ self.entity_substitution = entity_substitution
+ self.void_element_close_prefix = void_element_close_prefix
+ self.cdata_containing_tags = self._default(
+ language, cdata_containing_tags, 'cdata_containing_tags'
+ )
+
+ def substitute(self, ns):
+ """Process a string that needs to undergo entity substitution."""
+ if not self.entity_substitution:
+ return ns
+ from .element import NavigableString
+ if (isinstance(ns, NavigableString)
+ and ns.parent is not None
+ and ns.parent.name in self.cdata_containing_tags):
+ # Do nothing.
+ return ns
+ # Substitute.
+ return self.entity_substitution(ns)
+
+ def attribute_value(self, value):
+ """Process the value of an attribute."""
+ return self.substitute(value)
+
+ def attributes(self, tag):
+ """Reorder a tag's attributes however you want."""
+ return sorted(tag.attrs.items())
+
+
+class HTMLFormatter(Formatter):
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+
+
+class XMLFormatter(Formatter):
+ REGISTRY = {}
+ def __init__(self, *args, **kwargs):
+ return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+
+
+# Set up aliases for the default formatters.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html,
+ void_element_close_prefix=None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+ entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+ entity_substitution=EntitySubstitution.substitute_xml
+)
+XMLFormatter.REGISTRY[None] = Formatter(
+ Formatter.XML, entity_substitution=None
+)
diff --git a/libs3/bs4/testing.py b/libs3/bs4/testing.py
new file mode 100644
index 000000000..cc9966601
--- /dev/null
+++ b/libs3/bs4/testing.py
@@ -0,0 +1,992 @@
+# encoding: utf-8
+"""Helper classes for tests."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+import pickle
+import copy
+import functools
+import unittest
+from unittest import TestCase
+from bs4 import BeautifulSoup
+from bs4.element import (
+ CharsetMetaAttributeValue,
+ Comment,
+ ContentMetaAttributeValue,
+ Doctype,
+ SoupStrainer,
+ Tag
+)
+
+from bs4.builder import HTMLParserTreeBuilder
+default_builder = HTMLParserTreeBuilder
+
+BAD_DOCUMENT = """A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&amp;T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+"""
+
+
+class SoupTest(unittest.TestCase):
+
+ @property
+ def default_builder(self):
+ return default_builder
+
+ def soup(self, markup, **kwargs):
+ """Build a Beautiful Soup object from markup."""
+ builder = kwargs.pop('builder', self.default_builder)
+ return BeautifulSoup(markup, builder=builder, **kwargs)
+
+ def document_for(self, markup, **kwargs):
+ """Turn an HTML fragment into a document.
+
+ The details depend on the builder.
+ """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e + + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. + if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == 
last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + +class HTMLTreeBuilderSmokeTest(object): + + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. + """ + for name in [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + 'spacer', 'frame' + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + self.assertEqual(True, new_tag.is_empty_element) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. 
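+ # (Clarifying note added in editing: _document_with_doctype() below
+ # builds markup like '<!DOCTYPE html>\n<p>foo</p>', so the <p> tag
+ # should still be reachable after parsing.)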
+ self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment): + """Generate and parse a document with the given doctype.""" + doctype = '<!DOCTYPE %s>' % doctype_fragment + markup = doctype + '\n<p>foo</p>' + soup = self.soup(markup) + return doctype, soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = """<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A <p> tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("<p/>") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "<p></p>") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("<p>", "<p></p>") + self.assertSoupEquals("<b>", "<b></b>") + + self.assertSoupEquals("<br>", "<br/>") + + def test_br_is_always_empty_element_tag(self): + """A <br> tag is designated as an empty-element tag. 
+
+ Some parsers treat <br></br> as one <br/> tag, some parsers as
+ two tags, but it should always be an empty-element tag.
+ """
+ soup = self.soup("<br></br>")
+ self.assertTrue(soup.br.is_empty_element)
+ self.assertEqual(str(soup.br), "<br/>")
+
+ def test_nested_formatting_elements(self):
+ self.assertSoupEquals("<em><em></em></em>")
+
+ def test_double_head(self):
+ html = '''<!DOCTYPE html>
+<html>
+<head>
+<title>Ordinary HEAD element test</title>
+</head>
+<script type="text/javascript">
+alert("Help!");
+</script>
+<body>
+Hello, world!
+</body>
+</html>
+'''
+ soup = self.soup(html)
+ self.assertEqual("text/javascript", soup.find('script')['type'])
+
+ def test_comment(self):
+ # Comments are represented as Comment objects.
+ markup = "<p>foo<!--foobar-->baz</p>"
+ self.assertSoupEquals(markup)
+
+ soup = self.soup(markup)
+ comment = soup.find(text="foobar")
+ self.assertEqual(comment.__class__, Comment)
+
+ # The comment is properly integrated into the tree.
+ foo = soup.find(text="foo")
+ self.assertEqual(comment, foo.next_element)
+ baz = soup.find(text="baz")
+ self.assertEqual(comment, baz.previous_element)
+
+ def test_preserved_whitespace_in_pre_and_textarea(self):
+ """Whitespace must be preserved in <pre> and <textarea> tags,
+ even if that would mean not prettifying the markup.
+ """
+ pre_markup = "<pre> </pre>"
+ textarea_markup = "<textarea> woo\nwoo </textarea>"
+ self.assertSoupEquals(pre_markup)
+ self.assertSoupEquals(textarea_markup)
+
+ soup = self.soup(pre_markup)
+ self.assertEqual(soup.pre.prettify(), pre_markup)
+
+ soup = self.soup(textarea_markup)
+ self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+ soup = self.soup("<textarea></textarea>")
+ self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
+
+ def test_nested_inline_elements(self):
+ """Inline elements can be nested indefinitely."""
+ b_tag = "<b>Inside a B tag</b>"
+ self.assertSoupEquals(b_tag)
+
+ nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
+ self.assertSoupEquals(nested_b_tag)
+
+ double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
+ self.assertSoupEquals(double_nested_b_tag)
+
+ def test_nested_block_level_elements(self):
+ """Block elements can be nested."""
+ soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
+ blockquote = soup.blockquote
+ self.assertEqual(blockquote.p.b.string, 'Foo')
+ self.assertEqual(blockquote.b.string, 'Foo')
+
+ def test_correctly_nested_tables(self):
+ """One table can go inside another one."""
+ markup = ('<table id="1">'
+ '<tr>'
+ "<td>Here's another table:"
+ '<table id="2">'
+ '<tr><td>foo</td></tr>'
+ '</table></td>')
+
+ self.assertSoupEquals(
+ markup,
+ '<table id="1"><tr><td>Here\'s another table:'
+ '<table id="2"><tr><td>foo</td></tr></table>'
+ '</td></tr></table>')
+
+ self.assertSoupEquals(
+ "<table><thead><tr><td>Foo</td></tr></thead>"
+ "<tbody><tr><td>Bar</td></tr></tbody>"
+ "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+ def test_multivalued_attribute_with_whitespace(self):
+ # Whitespace separating the values of a multi-valued attribute
+ # should be ignored.
+
+ markup = '<div class=" foo bar "></a>'
+ soup = self.soup(markup)
+ self.assertEqual(['foo', 'bar'], soup.div['class'])
+
+ # If you search by the literal name of the class it's like the whitespace
+ # wasn't there.
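+ # (Editorial aside: since 'class' is stored as the list ['foo', 'bar'],
+ # class_="foo" or class_="bar" alone would match this <div> as well.)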
+ self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
+
+ def test_deeply_nested_multivalued_attribute(self):
+ # html5lib can set the attributes of the same tag many times
+ # as it rearranges the tree. This has caused problems with
+ # multivalued attributes.
+ markup = '<table><div><div class="css"></div></div></table>'
+ soup = self.soup(markup)
+ self.assertEqual(["css"], soup.div.div['class'])
+
+ def test_multivalued_attribute_on_html(self):
+ # html5lib uses a different API to set the attributes of the
+ # <html> tag. This has caused problems with multivalued
+ # attributes.
+ markup = '<html class="a b"></html>'
+ soup = self.soup(markup)
+ self.assertEqual(["a", "b"], soup.html['class'])
+
+ def test_angle_brackets_in_attribute_values_are_escaped(self):
+ self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
+
+ def test_strings_resembling_character_entity_references(self):
+ # "&T" and "&p" look like incomplete character entities, but they are
+ # not.
+ self.assertSoupEquals(
+ "<p>&bull; AT&T is in the s&p 500</p>",
+ "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
+ )
+
+ def test_apos_entity(self):
+ self.assertSoupEquals(
+ "<p>Bob&apos;s Bar</p>",
+ "<p>Bob's Bar</p>",
+ )
+
+ def test_entities_in_foreign_document_encoding(self):
+ # &#147; and &#148; are invalid numeric entities referencing
+ # Windows-1252 characters. &#45; references a character common
+ # to Windows-1252 and Unicode, and &#9731; references a
+ # character only found in Unicode.
+ #
+ # All of these entities should be converted to Unicode
+ # characters.
+ markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
+ soup = self.soup(markup)
+ self.assertEqual("“Hello” -☃", soup.p.string)
+
+ def test_entities_in_attributes_converted_to_unicode(self):
+ expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+ self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
+ self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
+
+ def test_entities_in_text_converted_to_unicode(self):
+ expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+ self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
+ self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+
+ def test_quot_entity_converted_to_quotation_mark(self):
+ self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+ '<p>I said "good day!"</p>')
+
+ def test_out_of_range_entity(self):
+ expect = "\N{REPLACEMENT CHARACTER}"
+ self.assertSoupEquals("&#10000000000000;", expect)
+ self.assertSoupEquals("&#x10000000000000;", expect)
+ self.assertSoupEquals("&#1000000000;", expect)
+
+ def test_multipart_strings(self):
+ "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+ soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+ self.assertEqual("p", soup.h2.string.next_element.name)
+ self.assertEqual("p", soup.p.name)
+ self.assertConnectedness(soup)
+
+ def test_empty_element_tags(self):
+ """Verify consistent handling of empty-element tags,
+ no matter how they come in through the markup.
+ """
+ self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
+ self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
+
+ def test_head_tag_between_head_and_body(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """<html><head></head> + <link></link> + <body>foo</body> +</html> +""" + soup = self.soup(content) + self.assertNotEqual(None, soup.html.body) + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """<!DOCTYPE html> +<html> + <body> + <article id="a" > + <div><a href="1"></div> + <footer> + <a href="2"></a> + </footer> + </article> + </body> +</html> +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'<a class="foo bar">' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. + # + + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! 
+ markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+ soup = self.soup(markup)
+ self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
+
+ def test_soupstrainer(self):
+ """Parsers should be able to work with SoupStrainers."""
+ strainer = SoupStrainer("b")
+ soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
+ parse_only=strainer)
+ self.assertEqual(soup.decode(), "<b>bold</b>")
+
+ def test_single_quote_attribute_values_become_double_quotes(self):
+ self.assertSoupEquals("<foo attr='bar'></foo>",
+ '<foo attr="bar"></foo>')
+
+ def test_attribute_values_with_nested_quotes_are_left_alone(self):
+ text = """<foo attr='bar "brawls" happen'>a</foo>"""
+ self.assertSoupEquals(text)
+
+ def test_attribute_values_with_double_nested_quotes_get_quoted(self):
+ text = """<foo attr='bar "brawls" happen'>a</foo>"""
+ soup = self.soup(text)
+ soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
+ self.assertSoupEquals(
+ soup.foo.decode(),
+ """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
+
+ def test_ampersand_in_attribute_value_gets_escaped(self):
+ self.assertSoupEquals('<this is="really messed up & stuff"></this>',
+ '<this is="really messed up &amp; stuff"></this>')
+
+ self.assertSoupEquals(
+ '<a href="http://example.org?a=1&b=2;3">foo</a>',
+ '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
+
+ def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
+ self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
+
+ def test_entities_in_strings_converted_during_parsing(self):
+ # Both XML and HTML entities are converted to Unicode characters
+ # during parsing.
+ text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+ self.assertSoupEquals(text, expected)
+
+ def test_smart_quotes_converted_on_the_way_in(self):
+ # Microsoft smart quotes are converted to Unicode characters during
+ # parsing.
+ quote = b"<p>\x91Foo\x92</p>"
+ soup = self.soup(quote)
+ self.assertEqual(
+ soup.p.string,
+ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+ def test_non_breaking_spaces_converted_on_the_way_in(self):
+ soup = self.soup("<a>&nbsp;&nbsp;</a>")
+ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
+
+ def test_entities_converted_on_the_way_out(self):
+ text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+ soup = self.soup(text)
+ self.assertEqual(soup.p.encode("utf-8"), expected)
+
+ def test_real_iso_latin_document(self):
+ # Smoke test of interrelated functionality, using an
+ # easy-to-understand document.
+
+ # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+ unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+ # That's because we're going to encode it into ISO-Latin-1, and use
+ # that to test.
+ iso_latin_html = unicode_html.encode("iso-8859-1")
+
+ # Parse the ISO-Latin-1 HTML.
+ soup = self.soup(iso_latin_html)
+ # Encode it to UTF-8.
+ result = soup.encode("utf-8")
+
+ # What do we expect the result to look like? Well, it would
+ # look like unicode_html, except that the META tag would say
+ # UTF-8 instead of ISO-Latin-1.
+ expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+ # And, of course, it would be in UTF-8, not Unicode.
+ expected = expected.encode("utf-8")
+
+ # Ta-da!
+ self.assertEqual(result, expected)
+
+ def test_real_shift_jis_document(self):
+ # Smoke test to make sure the parser can handle a document in
+ # Shift-JIS encoding, without choking.
+ shift_jis_html = (
+ b'<html><head></head><body><pre>'
+ b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+ b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+ b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+ b'</pre></body></html>')
+ unicode_html = shift_jis_html.decode("shift-jis")
+ soup = self.soup(unicode_html)
+
+ # Make sure the parse tree is correctly encoded to various
+ # encodings.
+ self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+ self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+
+ def test_real_hebrew_document(self):
+ # A real-world test to make sure we can convert ISO-8859-8 (a
+ # Hebrew encoding) to UTF-8.
+ hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
+ soup = self.soup(
+ hebrew_document, from_encoding="iso8859-8")
+ # Some tree builders call it iso8859-8, others call it iso-8859-8.
+ # That's not a difference we really care about.
+ assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
+ self.assertEqual(
+ soup.encode('utf-8'),
+ hebrew_document.decode("iso8859-8").encode("utf-8"))
+
+ def test_meta_tag_reflects_current_encoding(self):
+ # Here's the <meta> tag saying that a document is
+ # encoded in Shift-JIS.
+ meta_tag = ('<meta content="text/html; charset=x-sjis" '
+ 'http-equiv="Content-type"/>')
+
+ # Here's a document incorporating that meta tag.
+ shift_jis_html = (
+ '<html><head>\n%s\n'
+ '<meta http-equiv="Content-language" content="ja"/>'
+ '</head><body>Shift-JIS markup goes here.') % meta_tag
+ soup = self.soup(shift_jis_html)
+
+ # Parse the document, and the charset is seemingly unaffected.
+ parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+ content = parsed_meta['content']
+ self.assertEqual('text/html; charset=x-sjis', content)
+
+ # But that value is actually a ContentMetaAttributeValue object.
+ self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+ # And it will take on a value that reflects its current
+ # encoding.
+ self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
+
+ # For the rest of the story, see TestSubstitutions in
+ # test_tree.py.
+
+ def test_html5_style_meta_tag_reflects_current_encoding(self):
+ # Here's the <meta> tag saying that a document is
+ # encoded in Shift-JIS.
+ meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+ # Here's a document incorporating that meta tag.
+ shift_jis_html = (
+ '<html><head>\n%s\n'
+ '<meta http-equiv="Content-language" content="ja"/>'
+ '</head><body>Shift-JIS markup goes here.') % meta_tag
+ soup = self.soup(shift_jis_html)
+
+ # Parse the document, and the charset is seemingly unaffected.
+ parsed_meta = soup.find('meta', id="encoding")
+ charset = parsed_meta['charset']
+ self.assertEqual('x-sjis', charset)
+
+ # But that value is actually a CharsetMetaAttributeValue object.
+ self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+ # And it will take on a value that reflects its current
+ # encoding.
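+ # (Editorial note: CharsetMetaAttributeValue substitutes the encoding
+ # actually used for output -- here "utf8" -- for the original "x-sjis".)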
+ self.assertEqual('utf8', charset.encode("utf8")) + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("<a>text</a>") + data.a['foo'] = 'bar' + self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class XMLTreeBuilderSmokeTest(object): + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def test_docstring_generated(self): + soup = self.soup("<root/>") + self.assertEqual( + soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') + + def test_xml_declaration(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_processing_instruction(self): + markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + def test_nested_namespaces(self): + doc = b"""<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<parent xmlns="http://ns1/"> +<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> +<grandchild ns3:attr="value" xmlns="http://ns4/"/> +</child> +</parent>""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + + def test_formatter_processes_script_tag_for_xml_documents(self): + doc = """ + <script type="text/javascript"> + </script> +""" + soup = BeautifulSoup(doc, "lxml-xml") + # lxml would have stripped this while parsing, but we can add + # it later. 
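+ # (Clarifying note added in editing: in an XML document <script> is not
+ # treated as a CDATA container, so the angle brackets inserted below are
+ # expected to come back out entity-encoded, e.g. "&lt; &lt; hey &gt; &gt;".)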
+ soup.script.string = 'console.log("< < hey > > ");'
+ encoded = soup.encode()
+ self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+ def test_can_parse_unicode_document(self):
+ markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ soup = self.soup(markup)
+ self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
+
+ def test_popping_namespaced_tag(self):
+ markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
+ soup = self.soup(markup)
+ self.assertEqual(
+ str(soup.rss), markup)
+
+ def test_docstring_includes_correct_encoding(self):
+ soup = self.soup("<root/>")
+ self.assertEqual(
+ soup.encode("latin1"),
+ b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
+
+ def test_large_xml_document(self):
+ """A large XML document should come out the same as it went in."""
+ markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ + b'0' * (2**12)
+ + b'</root>')
+ soup = self.soup(markup)
+ self.assertEqual(soup.encode("utf-8"), markup)
+
+
+ def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
+ self.assertSoupEquals("<p>", "<p/>")
+ self.assertSoupEquals("<p>foo</p>")
+
+ def test_namespaces_are_preserved(self):
+ markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+ soup = self.soup(markup)
+ root = soup.root
+ self.assertEqual("http://example.com/", root['xmlns:a'])
+ self.assertEqual("http://example.net/", root['xmlns:b'])
+
+ def test_closing_namespaced_tag(self):
+ markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
+ soup = self.soup(markup)
+ self.assertEqual(str(soup.p), markup)
+
+ def test_namespaced_attributes(self):
+ markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+ soup = self.soup(markup)
+ self.assertEqual(str(soup.foo), markup)
+
+ def test_namespaced_attributes_xml_namespace(self):
+ markup = '<foo xml:lang="fr">bar</foo>'
+ soup = self.soup(markup)
+ self.assertEqual(str(soup.foo), markup)
+
+ def test_find_by_prefixed_name(self):
+ doc = """<?xml version="1.0" encoding="utf-8"?>
+<Document xmlns="http://example.com/ns0"
+ xmlns:ns1="http://example.com/ns1"
+ xmlns:ns2="http://example.com/ns2">
+ <ns1:tag>foo</ns1:tag>
+ <ns1:tag>bar</ns1:tag>
+ <ns2:tag key="value">baz</ns2:tag>
+</Document>
+"""
+ soup = self.soup(doc)
+
+ # There are three <tag> tags.
+ self.assertEqual(3, len(soup.find_all('tag')))
+
+ # But two of them are ns1:tag and one of them is ns2:tag.
+ self.assertEqual(2, len(soup.find_all('ns1:tag')))
+ self.assertEqual(1, len(soup.find_all('ns2:tag')))
+
+ self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+ self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+
+ def test_copy_tag_preserves_namespace(self):
+ xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://example.com/ns0"/>"""
+
+ soup = self.soup(xml)
+ tag = soup.document
+ duplicate = copy.copy(tag)
+
+ # The two tags have the same namespace prefix.
+ self.assertEqual(tag.prefix, duplicate.prefix) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. + pass + + def test_html_tags_have_namespace(self): + markup = "<a>" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '<svg><circle/></svg>' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '<math><msqrt>5</msqrt></math>' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + def test_xml_declaration_becomes_comment(self): + markup = '<?xml version="1.0" encoding="utf-8"?><html></html>' + soup = self.soup(markup) + self.assertTrue(isinstance(soup.contents[0], Comment)) + self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') + self.assertEqual("html", soup.contents[0].next_element.name) + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/libs3/bs4/tests/__init__.py b/libs3/bs4/tests/__init__.py new file mode 100644 index 000000000..142c8cc3f --- /dev/null +++ b/libs3/bs4/tests/__init__.py @@ -0,0 +1 @@ +"The beautifulsoup tests." 
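+# (Editorial note, not part of the original file: the suite can be run with
+# something like `python -m unittest discover -s libs3/bs4/tests`, assuming
+# the vendored libs3 directory is on sys.path.)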
diff --git a/libs3/bs4/tests/test_builder_registry.py b/libs3/bs4/tests/test_builder_registry.py new file mode 100644 index 000000000..90cad8293 --- /dev/null +++ b/libs3/bs4/tests/test_builder_registry.py @@ -0,0 +1,147 @@ +"""Tests of the builder registry.""" + +import unittest +import warnings + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + HTMLParserTreeBuilder, + TreeBuilderRegistry, +) + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError: + HTML5LIB_PRESENT = False + +try: + from bs4.builder import ( + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + ) + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + + if LXML_PRESENT: + self.assertEqual(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) + else: + self.assertEqual(registry.lookup('xml'), None) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) + + def test_named_library(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib'), + HTML5TreeBuilder) + + self.assertEqual(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + + def test_beautifulsoup_constructor_does_lookup(self): + + with warnings.catch_warnings(record=True) as w: + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEqual(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. 
+ self.assertEqual(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('foo'), builder) + self.assertEqual(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEqual(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEqual(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEqual(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/libs3/bs4/tests/test_docs.py b/libs3/bs4/tests/test_docs.py new file mode 100644 index 000000000..5b9f67709 --- /dev/null +++ b/libs3/bs4/tests/test_docs.py @@ -0,0 +1,36 @@ +"Test harness for doctests." 
+ +# pylint: disable-msg=E0611,W0142 + +__metaclass__ = type +__all__ = [ + 'additional_tests', + ] + +import atexit +import doctest +import os +#from pkg_resources import ( +# resource_filename, resource_exists, resource_listdir, cleanup_resources) +import unittest + +DOCTEST_FLAGS = ( + doctest.ELLIPSIS | + doctest.NORMALIZE_WHITESPACE | + doctest.REPORT_NDIFF) + + +# def additional_tests(): +# "Run the doc tests (README.txt and docs/*, if any exist)" +# doctest_files = [ +# os.path.abspath(resource_filename('bs4', 'README.txt'))] +# if resource_exists('bs4', 'docs'): +# for name in resource_listdir('bs4', 'docs'): +# if name.endswith('.txt'): +# doctest_files.append( +# os.path.abspath( +# resource_filename('bs4', 'docs/%s' % name))) +# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) +# atexit.register(cleanup_resources) +# return unittest.TestSuite(( +# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/libs3/bs4/tests/test_html5lib.py b/libs3/bs4/tests/test_html5lib.py new file mode 100644 index 000000000..96529b0b3 --- /dev/null +++ b/libs3/bs4/tests/test_html5lib.py @@ -0,0 +1,170 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import warnings + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError as e: + HTML5LIB_PRESENT = False +from bs4.element import SoupStrainer +from bs4.testing import ( + HTML5TreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing its tree builder.") +class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return HTML5TreeBuilder + + def test_soupstrainer(self): + # The html5lib tree builder does not support SoupStrainers. + strainer = SoupStrainer("b") + markup = "<p>A <b>bold</b> statement.</p>" + with warnings.catch_warnings(record=True) as w: + soup = self.soup(markup, parse_only=strainer) + self.assertEqual( + soup.decode(), self.document_for(markup)) + + self.assertTrue( + "the html5lib tree builder doesn't support parse_only" in + str(w[0].message)) + + def test_correctly_nested_tables(self): + """html5lib inserts <tbody> tags where other parsers don't.""" + markup = ('<table id="1">' + '<tr>' + "<td>Here's another table:" + '<table id="2">' + '<tr><td>foo</td></tr>' + '</table></td>') + + self.assertSoupEquals( + markup, + '<table id="1"><tbody><tr><td>Here\'s another table:' + '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>' + '</td></tr></tbody></table>') + + self.assertSoupEquals( + "<table><thead><tr><td>Foo</td></tr></thead>" + "<tbody><tr><td>Bar</td></tr></tbody>" + "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + + def test_xml_declaration_followed_by_doctype(self): + markup = '''<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html> +<html> + <head> + </head> + <body> + <p>foo</p> + </body> +</html>''' + soup = self.soup(markup) + # Verify that we can reach the <p> tag; this means the tree is connected. 
+ self.assertEqual(b"<p>foo</p>", soup.p.encode()) + + def test_reparented_markup(self): + markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' + soup = self.soup(markup) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + + def test_reparented_markup_ends_with_whitespace(self): + markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' + soup = self.soup(markup) + self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + def test_reparented_markup_containing_identical_whitespace_nodes(self): + """Verify that we keep the two whitespace nodes in this + document distinct when reparenting the adjacent <tbody> tags. + """ + markup = '<table> <tbody><tbody><ims></tbody> </table>' + soup = self.soup(markup) + space1, space2 = soup.find_all(string=' ') + tbody1, tbody2 = soup.find_all('tbody') + assert space1.next_element is tbody1 + assert tbody2.next_element is space2 + + def test_reparented_markup_containing_children(self): + markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>' + soup = self.soup(markup) + noscript = soup.noscript + self.assertEqual("target", noscript.next_element) + target = soup.find(string='target') + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string='aftermath')[-1] + + # The <noscript> tag was moved beneath a copy of the <a> tag, + # but the 'target' string within is still connected to the + # (second) 'aftermath' string. + self.assertEqual(final_aftermath, target.next_element) + self.assertEqual(target, final_aftermath.previous_element) + + def test_processing_instruction(self): + """Processing instructions become comments.""" + markup = b"""<?PITarget PIContent?>""" + soup = self.soup(markup) + assert str(soup).startswith("<!--?PITarget PIContent?-->") + + def test_cloned_multivalue_node(self): + markup = b"""<a class="my_class"><p></a>""" + soup = self.soup(markup) + a1, a2 = soup.find_all('a') + self.assertEqual(a1, a2) + assert a1 is not a2 + + def test_foster_parenting(self): + markup = b"""<table><td></tbody>A""" + soup = self.soup(markup) + self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode()) + + def test_extraction(self): + """ + Test that extraction does not destroy the tree. + + https://bugs.launchpad.net/beautifulsoup/+bug/1782928 + """ + + markup = """ +<html><head></head> +<style> +</style><script></script><body><p>hello</p></body></html> +""" + soup = self.soup(markup) + [s.extract() for s in soup('script')] + [s.extract() for s in soup('style')] + + self.assertEqual(len(soup.find_all("p")), 1) + + def test_empty_comment(self): + """ + Test that empty comment does not break structure. 
+
+ https://bugs.launchpad.net/beautifulsoup/+bug/1806598
+ """
+
+ markup = """
+<html>
+<body>
+<form>
+<!----><input type="text">
+</form>
+</body>
+</html>
+"""
+ soup = self.soup(markup)
+ inputs = []
+ for form in soup.find_all('form'):
+ inputs.extend(form.find_all('input'))
+ self.assertEqual(len(inputs), 1)
diff --git a/libs3/bs4/tests/test_htmlparser.py b/libs3/bs4/tests/test_htmlparser.py
new file mode 100644
index 000000000..790489aa1
--- /dev/null
+++ b/libs3/bs4/tests/test_htmlparser.py
@@ -0,0 +1,47 @@
+"""Tests to ensure that the html.parser tree builder generates good
+trees."""
+
+from pdb import set_trace
+import pickle
+from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
+from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder._htmlparser import BeautifulSoupHTMLParser
+
+class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
+
+ default_builder = HTMLParserTreeBuilder
+
+ def test_namespaced_system_doctype(self):
+ # html.parser can't handle namespaced doctypes, so skip this one.
+ pass
+
+ def test_namespaced_public_doctype(self):
+ # html.parser can't handle namespaced doctypes, so skip this one.
+ pass
+
+ def test_builder_is_pickled(self):
+ """Unlike most tree builders, HTMLParserTreeBuilder will
+ be restored after pickling.
+ """
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+
+ def test_redundant_empty_element_closing_tags(self):
+ self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
+ self.assertSoupEquals('</br></br></br>', "")
+
+ def test_empty_element(self):
+ # This verifies that any buffered data present when the parser
+ # finishes working is handled.
+ self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
+
+
+class TestHTMLParserSubclass(SoupTest):
+ def test_error(self):
+ """Verify that our HTMLParser subclass implements error() in a way
+ that doesn't cause a crash.
+ """ + parser = BeautifulSoupHTMLParser() + parser.error("don't crash") diff --git a/libs3/bs4/tests/test_lxml.py b/libs3/bs4/tests/test_lxml.py new file mode 100644 index 000000000..29da71149 --- /dev/null +++ b/libs3/bs4/tests/test_lxml.py @@ -0,0 +1,100 @@ +"""Tests to ensure that the lxml tree builder generates good trees.""" + +import re +import warnings + +try: + import lxml.etree + LXML_PRESENT = True + LXML_VERSION = lxml.etree.LXML_VERSION +except ImportError as e: + LXML_PRESENT = False + LXML_VERSION = (0,) + +if LXML_PRESENT: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, + ) +from bs4.element import Comment, Doctype, SoupStrainer +from bs4.testing import skipIf +from bs4.tests import test_htmlparser +from bs4.testing import ( + HTMLTreeBuilderSmokeTest, + XMLTreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its tree builder.") +class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilder + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") + self.assertSoupEquals( + "<p>foo�bar</p>", "<p>foobar</p>") + + def test_entities_in_foreign_document_encoding(self): + # We can't implement this case correctly because by the time we + # hear about markup like "“", it's been (incorrectly) converted into + # a string like u'\x93' + pass + + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. + + @skipIf( + not LXML_PRESENT or LXML_VERSION < (2,3,5,0), + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("<!DOCTYPE>") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_beautifulstonesoup_is_xml_parser(self): + # Make sure that the deprecated BSS class uses an xml builder + # if one is installed. + with warnings.catch_warnings(record=True) as w: + soup = BeautifulStoneSoup("<b />") + self.assertEqual("<b/>", str(soup.b)) + self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its XML tree builder.") +class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilderForXML + + def test_namespace_indexing(self): + # We should not track un-prefixed namespaces as we can only hold one + # and it will be recognized as the default namespace by soupsieve, + # which may be confusing in some situations. When no namespace is provided + # for a selector, the default namespace (if defined) is assumed. 
+ + soup = self.soup( + '<?xml version="1.1"?>\n' + '<root>' + '<tag xmlns="http://unprefixed-namespace.com">content</tag>' + '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>' + '</root>' + ) + self.assertEqual( + soup._namespaces, + {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'} + ) diff --git a/libs3/bs4/tests/test_soup.py b/libs3/bs4/tests/test_soup.py new file mode 100644 index 000000000..1eda9484b --- /dev/null +++ b/libs3/bs4/tests/test_soup.py @@ -0,0 +1,567 @@ +# -*- coding: utf-8 -*- +"""Tests of Beautiful Soup as a whole.""" + +from pdb import set_trace +import logging +import unittest +import sys +import tempfile + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, +) +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + SoupStrainer, + NamespacedAttribute, + ) +import bs4.dammit +from bs4.dammit import ( + EntitySubstitution, + UnicodeDammit, + EncodingDetector, +) +from bs4.testing import ( + default_builder, + SoupTest, + skipIf, +) +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True +except ImportError as e: + LXML_PRESENT = False + +PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) + +class TestConstructor(SoupTest): + + def test_short_unicode_input(self): + data = "<h1>éé</h1>" + soup = self.soup(data) + self.assertEqual("éé", soup.h1.string) + + def test_embedded_null(self): + data = "<h1>foo\0bar</h1>" + soup = self.soup(data) + self.assertEqual("foo\0bar", soup.h1.string) + + def test_exclude_encodings(self): + utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + + def test_custom_builder_class(self): + # Verify that you can pass in a custom Builder class and + # it'll be instantiated with the appropriate keyword arguments. + class Mock(object): + def __init__(self, **kwargs): + self.called_with = kwargs + self.is_xml = True + def initialize_soup(self, soup): + pass + def prepare_markup(self, *args, **kwargs): + return '' + + kwargs = dict( + var="value", + # This is a deprecated BS3-era keyword argument, which + # will be stripped out. + convertEntities=True, + ) + with warnings.catch_warnings(record=True): + soup = BeautifulSoup('', builder=Mock, **kwargs) + assert isinstance(soup.builder, Mock) + self.assertEqual(dict(var="value"), soup.builder.called_with) + + # You can also instantiate the TreeBuilder yourself. In this + # case, that specific object is used and any keyword arguments + # to the BeautifulSoup constructor are ignored. + builder = Mock(**kwargs) + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup( + '', builder=builder, ignored_value=True, + ) + msg = str(w[0].message) + assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.") + self.assertEqual(builder, soup.builder) + self.assertEqual(kwargs, builder.called_with) + + def test_cdata_list_attributes(self): + # Most attribute values are represented as scalars, but the + # HTML standard says that some attributes, like 'class' have + # space-separated lists as values. + markup = '<a id=" an id " class=" a class "></a>' + soup = self.soup(markup) + + # Note that the spaces are stripped for 'class' but not for 'id'. 
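+        # Illustrative sketch (editorial, not upstream code): 'class' is
+        # registered as a cdata-list attribute in the default builders, so
+        # it comes back as a list, while 'id' stays a plain string.
+        self.assertTrue(isinstance(soup.a['class'], list))
+        self.assertFalse(isinstance(soup.a['id'], list))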
+        a = soup.a
+        self.assertEqual(" an id ", a['id'])
+        self.assertEqual(["a", "class"], a['class'])
+
+        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
+        # you customize or disable this. As always, you can customize the TreeBuilder
+        # by passing in a keyword argument to the BeautifulSoup constructor.
+        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
+        self.assertEqual(" a class ", soup.a['class'])
+
+        # Here are two ways of saying that `id` is a multi-valued
+        # attribute in this context, but 'class' is not.
+        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
+            with warnings.catch_warnings(record=True) as w:
+                # This will create a warning about not explicitly
+                # specifying a parser, but we'll ignore it.
+                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
+            a = soup.a
+            self.assertEqual(["an", "id"], a['id'])
+            self.assertEqual(" a class ", a['class'])
+
+
+class TestWarnings(SoupTest):
+
+    def _assert_no_parser_specified(self, s, is_there=True):
+        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+        self.assertTrue(v)
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html.parser")
+        self.assertEqual([], w)
+
+    def test_parseOnlyThese_renamed_to_parse_only(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
+        msg = str(w[0].message)
+        self.assertTrue("parseOnlyThese" in msg)
+        self.assertTrue("parse_only" in msg)
+        self.assertEqual(b"<b></b>", soup.encode())
+
+    def test_fromEncoding_renamed_to_from_encoding(self):
+        with warnings.catch_warnings(record=True) as w:
+            utf8 = b"\xc3\xa9"
+            soup = self.soup(utf8, fromEncoding="utf8")
+        msg = str(w[0].message)
+        self.assertTrue("fromEncoding" in msg)
+        self.assertTrue("from_encoding" in msg)
+        self.assertEqual("utf8", soup.original_encoding)
+
+    def test_unrecognized_keyword_argument(self):
+        self.assertRaises(
+            TypeError, self.soup, "<a>", no_such_argument=True)
+
+class TestFileAndURLWarnings(SoupTest):
+
+    def test_disk_file_warning(self):
+        filehandle = tempfile.NamedTemporaryFile()
+        filename = filehandle.name
+        try:
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            msg = str(w[0].message)
+            self.assertTrue("looks like a filename" in msg)
+        finally:
+            filehandle.close()
+
+        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(filename)
+        self.assertEqual(0, len(w))
+
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be raised during
+        # execution.
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            soup = self.soup("http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup("http://www.crummyunicode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+
+class TestSelectiveParsing(SoupTest):
+
+    def test_parse_with_soupstrainer(self):
+        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
+        strainer = SoupStrainer("b")
+        soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
+
+
+class TestEntitySubstitution(unittest.TestCase):
+    """Standalone tests of the EntitySubstitution class."""
+    def setUp(self):
+        self.sub = EntitySubstitution
+
+    def test_simple_html_substitution(self):
+        # Unicode characters corresponding to named HTML entities
+        # are substituted, and no others.
+        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
+        self.assertEqual(self.sub.substitute_html(s),
+                         "foo&forall;\N{SNOWMAN}&otilde;bar")
+
+    def test_smart_quote_substitution(self):
+        # MS smart quotes are a common source of frustration, so we
+        # give them a special test.
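+        # Illustrative sketch (editorial, not upstream code): bytes
+        # \x91-\x94 are the Windows-1252 curly quotes; once decoded to
+        # their Unicode equivalents, substitute_html() renders them as
+        # named entities.
+        self.assertEqual("&lsquo;", self.sub.substitute_html("\u2018"))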
+        quotes = b"\x91\x92foo\x93\x94"
+        dammit = UnicodeDammit(quotes)
+        self.assertEqual(self.sub.substitute_html(dammit.markup),
+                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")
+
+    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
+        s = 'Welcome to "my bar"'
+        self.assertEqual(self.sub.substitute_xml(s, False), s)
+
+    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
+        self.assertEqual(self.sub.substitute_xml("Welcome", True),
+                         '"Welcome"')
+        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
+                         '"Bob\'s Bar"')
+
+    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
+        s = 'Welcome to "my bar"'
+        self.assertEqual(self.sub.substitute_xml(s, True),
+                         "'Welcome to \"my bar\"'")
+
+    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
+        s = 'Welcome to "Bob\'s Bar"'
+        self.assertEqual(
+            self.sub.substitute_xml(s, True),
+            '"Welcome to &quot;Bob\'s Bar&quot;"')
+
+    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
+        quoted = 'Welcome to "Bob\'s Bar"'
+        self.assertEqual(self.sub.substitute_xml(quoted), quoted)
+
+    def test_xml_quoting_handles_angle_brackets(self):
+        self.assertEqual(
+            self.sub.substitute_xml("foo<bar>"),
+            "foo&lt;bar&gt;")
+
+    def test_xml_quoting_handles_ampersands(self):
+        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
+
+    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml("&Aacute;T&T"),
+            "&amp;Aacute;T&amp;T")
+
+    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
+            "&Aacute;T&amp;T")
+
+    def test_quotes_not_html_substituted(self):
+        """There's no need to do this except inside attribute values."""
+        text = 'Bob\'s "bar"'
+        self.assertEqual(self.sub.substitute_html(text), text)
+
+
+class TestEncodingConversion(SoupTest):
+    # Test Beautiful Soup's ability to decode and encode from various
+    # encodings.
+
+    def setUp(self):
+        super(TestEncodingConversion, self).setUp()
+        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+        self.utf8_data = self.unicode_data.encode("utf-8")
+        # Just so you know what it looks like.
+        self.assertEqual(
+            self.utf8_data,
+            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
+
+    def test_ascii_in_unicode_out(self):
+        # ASCII input is converted to Unicode. The original_encoding
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            bs4.dammit.chardet_dammit = noop
+            ascii = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii)
+            unicode_output = soup_from_ascii.decode()
+            self.assertTrue(isinstance(unicode_output, str))
+            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
+
+    def test_unicode_in_unicode_out(self):
+        # Unicode input is left alone. The original_encoding attribute
+        # is not set.
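+        # Illustrative sketch (editorial, not upstream code): str input
+        # bypasses the UnicodeDammit decoding step entirely, so there is
+        # no encoding guess to record.
+        self.assertEqual(
+            None, self.soup("<b>caf\xe9</b>").original_encoding)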
+        soup_from_unicode = self.soup(self.unicode_data)
+        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
+        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_unicode.original_encoding, None)
+
+    def test_utf8_in_unicode_out(self):
+        # UTF-8 input is converted to Unicode. The original_encoding
+        # attribute is set.
+        soup_from_utf8 = self.soup(self.utf8_data)
+        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
+        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
+
+    def test_utf8_out(self):
+        # The internal data structures can be encoded as UTF-8.
+        soup_from_unicode = self.soup(self.unicode_data)
+        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
+
+    @skipIf(
+        PYTHON_3_PRE_3_2,
+        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
+    def test_attribute_name_containing_unicode_characters(self):
+        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
+        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
+
+class TestUnicodeDammit(unittest.TestCase):
+    """Standalone tests of UnicodeDammit."""
+
+    def test_unicode_input(self):
+        markup = "I'm already Unicode! \N{SNOWMAN}"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(dammit.unicode_markup, markup)
+
+    def test_smart_quotes_to_unicode(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(
+            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
+
+    def test_smart_quotes_to_xml_entities(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
+        self.assertEqual(
+            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
+
+    def test_smart_quotes_to_html_entities(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="html")
+        self.assertEqual(
+            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
+
+    def test_smart_quotes_to_ascii(self):
+        markup = b"<foo>\x91\x92\x93\x94</foo>"
+        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
+        self.assertEqual(
+            dammit.unicode_markup, """<foo>''""</foo>""")
+
+    def test_detect_utf8(self):
+        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
+        dammit = UnicodeDammit(utf8)
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
+
+
+    def test_convert_hebrew(self):
+        hebrew = b"\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
+        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
+
+    def test_dont_see_smart_quotes_where_there_are_none(self):
+        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
+        dammit = UnicodeDammit(utf_8)
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
+
+    def test_ignore_inappropriate_codecs(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_ignore_invalid_codecs(self):
+        utf8_data = "Räksmörgås".encode("utf-8")
+        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
+            dammit = UnicodeDammit(utf8_data, [bad_encoding])
+            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_exclude_encodings(self):
+        # This is UTF-8.
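+        # (Editorial aside, not upstream code: "Räksmörgås" encodes to a
+        # byte string that is also decodable as windows-1252, which is
+        # what makes the fallback guess below possible.)
+        self.assertTrue(b"R\xc3\xa4" in "Räksmörgås".encode("utf-8"))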
+ utf8_data = "Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. + dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): + detected = EncodingDetector( + b'<?xml version="1.0" encoding="UTF-\xdb" ?>') + encodings = list(detected.encodings) + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings + + def test_detect_html5_style_meta_tag(self): + + for data in ( + b'<html><meta charset="euc-jp" /></html>', + b"<html><meta charset='euc-jp' /></html>", + b"<html><meta charset=euc-jp /></html>", + b"<html><meta charset=euc-jp/></html>"): + dammit = UnicodeDammit(data, is_html=True) + self.assertEqual( + "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (ie. encoded with some other + # encoding). + # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually encode the document as UTF-8 + # and encode the incompatible characters as REPLACEMENT + # CHARACTER. + # + # If chardet is installed, it will detect that the document + # can be converted into ISO-8859-1 without errors. This happens + # to be the wrong encoding, but it is a consistent encoding, so the + # code we're testing here won't run. + # + # So we temporarily disable chardet if it's present. + doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> +<html><b>\330\250\330\252\330\261</b> +<i>\310\322\321\220\312\321\355\344</i></html>""" + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + bs4.dammit.chardet_dammit = noop + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue("\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc, "html.parser") + self.assertTrue(soup.contains_replacement_characters) + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet + + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' + dammit = UnicodeDammit(data) + self.assertEqual("<a>áé</a>", dammit.unicode_markup) + self.assertEqual("utf-16le", dammit.original_encoding) + + def test_detwingle(self): + # Here's a UTF8 document. + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") + + # Here's a Windows-1252 document. + windows_1252 = ( + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + + # Through some unholy alchemy, they've been stuck together. 
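+        # Illustrative sketch (editorial, not upstream code): a
+        # Windows-1252 smart quote is one high byte, while its UTF-8
+        # counterpart is a three-byte sequence. That asymmetry is what
+        # detwingle() exploits to tell the two apart.
+        self.assertEqual(
+            b"\x93", "\N{LEFT DOUBLE QUOTATION MARK}".encode("windows-1252"))
+        self.assertEqual(
+            b"\xe2\x80\x9c", "\N{LEFT DOUBLE QUOTATION MARK}".encode("utf8"))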
+
+        doc = utf8 + windows_1252 + utf8
+
+        # The document can't be turned into UTF-8:
+        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
+
+        # Unicode, Dammit thinks the whole document is Windows-1252,
+        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"
+
+        # But if we run it through UnicodeDammit.detwingle(), it's fixed:
+
+        fixed = UnicodeDammit.detwingle(doc)
+        self.assertEqual(
+            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+
+    def test_detwingle_ignores_multibyte_characters(self):
+        # Each of these characters has a UTF-8 representation ending
+        # in \x93. \x93 is a smart quote if interpreted as
+        # Windows-1252. But our code knows to skip over multibyte
+        # UTF-8 characters, so they'll survive the process unscathed.
+        for tricky_unicode_char in (
+            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+            ):
+            input = tricky_unicode_char.encode("utf8")
+            self.assertTrue(input.endswith(b'\x93'))
+            output = UnicodeDammit.detwingle(input)
+            self.assertEqual(output, input)
+
+class TestNamespacedAttribute(SoupTest):
+
+    def test_name_may_be_none(self):
+        a = NamespacedAttribute("xmlns", None)
+        self.assertEqual(a, "xmlns")
+
+    def test_attribute_is_equivalent_to_colon_separated_string(self):
+        a = NamespacedAttribute("a", "b")
+        self.assertEqual("a:b", a)
+
+    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
+        a = NamespacedAttribute("a", "b", "c")
+        b = NamespacedAttribute("a", "b", "c")
+        self.assertEqual(a, b)
+
+        # The actual namespace is not considered.
+        c = NamespacedAttribute("a", "b", None)
+        self.assertEqual(a, c)
+
+        # But name and prefix are important.
+        d = NamespacedAttribute("a", "z", "c")
+        self.assertNotEqual(a, d)
+
+        e = NamespacedAttribute("z", "b", "c")
+        self.assertNotEqual(a, e)
+
+
+class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
+
+    def test_charset_meta_attribute_value(self):
+        value = CharsetMetaAttributeValue("euc-jp")
+        self.assertEqual("euc-jp", value)
+        self.assertEqual("euc-jp", value.original_value)
+        self.assertEqual("utf8", value.encode("utf8"))
+
+
+    def test_content_meta_attribute_value(self):
+        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
+        self.assertEqual("text/html; charset=euc-jp", value)
+        self.assertEqual("text/html; charset=euc-jp", value.original_value)
+        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
diff --git a/libs3/bs4/tests/test_tree.py b/libs3/bs4/tests/test_tree.py
new file mode 100644
index 000000000..3b4beeb8f
--- /dev/null
+++ b/libs3/bs4/tests/test_tree.py
@@ -0,0 +1,2205 @@
+# -*- coding: utf-8 -*-
+"""Tests for Beautiful Soup's tree traversal methods.
+
+The tree traversal methods are the main advantage of using Beautiful
+Soup over just using a parser.
+
+Different parsers will build different Beautiful Soup trees given the
+same markup, but all Beautiful Soup trees can be traversed with the
+methods tested here.
+"""
+
+from pdb import set_trace
+import copy
+import pickle
+import re
+import warnings
+from bs4 import BeautifulSoup
+from bs4.builder import (
+    builder_registry,
+    HTMLParserTreeBuilder,
+)
+from bs4.element import (
+    PY3K,
+    CData,
+    Comment,
+    Declaration,
+    Doctype,
+    Formatter,
+    NavigableString,
+    SoupStrainer,
+    Tag,
+)
+from bs4.testing import (
+    SoupTest,
+    skipIf,
+)
+
+XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
+LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
+
+class TreeTest(SoupTest):
+
+    def assertSelects(self, tags, should_match):
+        """Make sure that the given tags have the correct text.
+
+        This is used in tests that define a bunch of tags, each
+        containing a single string, and then select certain strings by
+        some mechanism.
+        """
+        self.assertEqual([tag.string for tag in tags], should_match)
+
+    def assertSelectsIDs(self, tags, should_match):
+        """Make sure that the given tags have the correct IDs.
+
+        This is used in tests that define a bunch of tags, each
+        containing a single string, and then select certain strings by
+        some mechanism.
+        """
+        self.assertEqual([tag['id'] for tag in tags], should_match)
+
+
+class TestFind(TreeTest):
+    """Basic tests of the find() method.
+
+    find() just calls find_all() with limit=1, so it's not tested all
+    that thoroughly here.
+    """
+
+    def test_find_tag(self):
+        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
+        self.assertEqual(soup.find("b").string, "2")
+
+    def test_unicode_text_find(self):
+        soup = self.soup('<h1>Räksmörgås</h1>')
+        self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
+
+    def test_unicode_attribute_find(self):
+        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
+        str(soup)
+        self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
+
+
+    def test_find_everything(self):
+        """Test an optimization that finds all tags."""
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        self.assertEqual(2, len(soup.find_all()))
+
+    def test_find_everything_with_name(self):
+        """Test an optimization that finds all tags with a given name."""
+        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
+        self.assertEqual(2, len(soup.find_all('a')))
+
+class TestFindAll(TreeTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_text_nodes(self):
+        """You can search the tree for text nodes."""
+        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
+        # Exact match.
+        self.assertEqual(soup.find_all(string="bar"), ["bar"])
+        self.assertEqual(soup.find_all(text="bar"), ["bar"])
+        # Match any of a number of strings.
+        self.assertEqual(
+            soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
+        # Match a regular expression.
+        self.assertEqual(soup.find_all(text=re.compile('.*')),
+                         ["Foo", "bar", '\xbb'])
+        # Match anything.
+        self.assertEqual(soup.find_all(text=True),
+                         ["Foo", "bar", '\xbb'])
+
+    def test_find_all_limit(self):
+        """You can limit the number of items returned by find_all."""
+        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
+        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
+        self.assertSelects(soup.find_all('a', limit=1), ["1"])
+        self.assertSelects(
+            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
+
+        # A limit of 0 means no limit.
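+        # (Editorial sketch, not upstream code: find() is simply
+        # find_all(..., limit=1), returning the first hit.)
+        self.assertEqual(soup.find('a'), soup.find_all('a', limit=1)[0])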
+ self.assertSelects( + soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) + + def test_calling_a_tag_is_calling_findall(self): + soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>") + self.assertSelects(soup('a', limit=1), ["1"]) + self.assertSelects(soup.b(id="foo"), ["3"]) + + def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): + soup = self.soup("<a></a>") + # Create a self-referential list. + l = [] + l.append(l) + + # Without special code in _normalize_search_value, this would cause infinite + # recursion. + self.assertEqual([], soup.find_all(l)) + + def test_find_all_resultset(self): + """All find_all calls return a ResultSet""" + soup = self.soup("<a></a>") + result = soup.find_all("a") + self.assertTrue(hasattr(result, "source")) + + result = soup.find_all(True) + self.assertTrue(hasattr(result, "source")) + + result = soup.find_all(text="foo") + self.assertTrue(hasattr(result, "source")) + + +class TestFindAllBasicNamespaces(TreeTest): + + def test_find_by_namespaced_name(self): + soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') + self.assertEqual("4", soup.find("mathml:msqrt").string) + self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) + + +class TestFindAllByName(TreeTest): + """Test ways of finding tags by tag name.""" + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup("""<a>First tag.</a> + <b>Second tag.</b> + <c>Third <a>Nested tag.</a> tag.</c>""") + + def test_find_all_by_tag_name(self): + # Find all the <a> tags. + self.assertSelects( + self.tree.find_all('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_name_and_text(self): + self.assertSelects( + self.tree.find_all('a', text='First tag.'), ['First tag.']) + + self.assertSelects( + self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) + + self.assertSelects( + self.tree.find_all('a', text=re.compile("tag")), + ['First tag.', 'Nested tag.']) + + + def test_find_all_on_non_root_element(self): + # You can call find_all on any node, not just the root. + self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) + + def test_calling_element_invokes_find_all(self): + self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_strainer(self): + self.assertSelects( + self.tree.find_all(SoupStrainer('a')), + ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_names(self): + self.assertSelects( + self.tree.find_all(['a', 'b']), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_dict(self): + self.assertSelects( + self.tree.find_all({'a' : True, 'b' : True}), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_re(self): + self.assertSelects( + self.tree.find_all(re.compile('^[ab]$')), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_with_tags_matching_method(self): + # You can define an oracle method that determines whether + # a tag matches the search. 
+ def id_matches_name(tag): + return tag.name == tag.get('id') + + tree = self.soup("""<a id="a">Match 1.</a> + <a id="1">Does not match.</a> + <b id="b">Match 2.</a>""") + + self.assertSelects( + tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + + def test_find_with_multi_valued_attribute(self): + soup = self.soup( + "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>" + ) + r1 = soup.find('div', 'a d'); + r2 = soup.find('div', re.compile(r'a d')); + r3, r4 = soup.find_all('div', ['a b', 'a d']); + self.assertEqual('3', r1.string) + self.assertEqual('3', r2.string) + self.assertEqual('1', r3.string) + self.assertEqual('3', r4.string) + + +class TestFindAllByAttribute(TreeTest): + + def test_find_all_by_attribute_name(self): + # You can pass in keyword arguments to find_all to search by + # attribute. + tree = self.soup(""" + <a id="first">Matching a.</a> + <a id="second"> + Non-matching <b id="first">Matching b.</b>a. + </a>""") + self.assertSelects(tree.find_all(id='first'), + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): + peace = "םולש".encode("utf8") + data = '<a title="םולש"></a>'.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) + self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) + + def test_find_all_by_attribute_dict(self): + # You can pass in a dictionary as the argument 'attrs'. This + # lets you search for attributes like 'name' (a fixed argument + # to find_all) and 'class' (a reserved word in Python.) + tree = self.soup(""" + <a name="name1" class="class1">Name match.</a> + <a name="name2" class="class2">Class match.</a> + <a name="name3" class="class3">Non-match.</a> + <name1>A tag called 'name1'.</name1> + """) + + # This doesn't do what you want. + self.assertSelects(tree.find_all(name='name1'), + ["A tag called 'name1'."]) + # This does what you want. + self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), + ["Name match."]) + + self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), + ["Class match."]) + + def test_find_all_by_class(self): + tree = self.soup(""" + <a class="1">Class 1.</a> + <a class="2">Class 2.</a> + <b class="1">Class 1.</b> + <c class="3 4">Class 3 and 4.</c> + """) + + # Passing in the class_ keyword argument will search against + # the 'class' attribute. + self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) + self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) + + # Passing in a string to 'attrs' will also search the CSS class. + self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) + self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) + self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) + + def test_find_by_class_when_multiple_classes_present(self): + tree = self.soup("<gar class='foo bar'>Found it</gar>") + + f = tree.find_all("gar", class_=re.compile("o")) + self.assertSelects(f, ["Found it"]) + + f = tree.find_all("gar", class_=re.compile("a")) + self.assertSelects(f, ["Found it"]) + + # If the search fails to match the individual strings "foo" and "bar", + # it will be tried against the combined string "foo bar". 
+ f = tree.find_all("gar", class_=re.compile("o b")) + self.assertSelects(f, ["Found it"]) + + def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): + soup = self.soup("<a class='bar'>Found it</a>") + + self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) + + def big_attribute_value(value): + return len(value) > 3 + + self.assertSelects(soup.find_all("a", big_attribute_value), []) + + def small_attribute_value(value): + return len(value) <= 3 + + self.assertSelects( + soup.find_all("a", small_attribute_value), ["Found it"]) + + def test_find_all_with_string_for_attrs_finds_multiple_classes(self): + soup = self.soup('<a class="foo bar"></a><a class="foo"></a>') + a, a2 = soup.find_all("a") + self.assertEqual([a, a2], soup.find_all("a", "foo")) + self.assertEqual([a], soup.find_all("a", "bar")) + + # If you specify the class as a string that contains a + # space, only that specific value will be found. + self.assertEqual([a], soup.find_all("a", class_="foo bar")) + self.assertEqual([a], soup.find_all("a", "foo bar")) + self.assertEqual([], soup.find_all("a", "bar foo")) + + def test_find_all_by_attribute_soupstrainer(self): + tree = self.soup(""" + <a id="first">Match.</a> + <a id="second">Non-match.</a>""") + + strainer = SoupStrainer(attrs={'id' : 'first'}) + self.assertSelects(tree.find_all(strainer), ['Match.']) + + def test_find_all_with_missing_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that do not have that attribute set. + tree = self.soup("""<a id="1">ID present.</a> + <a>No ID present.</a> + <a id="">ID is empty.</a>""") + self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) + + def test_find_all_with_defined_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that have that attribute set to any value. + tree = self.soup("""<a id="1">ID present.</a> + <a>No ID present.</a> + <a id="">ID is empty.</a>""") + self.assertSelects( + tree.find_all(id=True), ["ID present.", "ID is empty."]) + + def test_find_all_with_numeric_attribute(self): + # If you search for a number, it's treated as a string. + tree = self.soup("""<a id=1>Unquoted attribute.</a> + <a id="1">Quoted attribute.</a>""") + + expected = ["Unquoted attribute.", "Quoted attribute."] + self.assertSelects(tree.find_all(id=1), expected) + self.assertSelects(tree.find_all(id="1"), expected) + + def test_find_all_with_list_attribute_values(self): + # You can pass a list of attribute values instead of just one, + # and you'll get tags that match any of the values. + tree = self.soup("""<a id="1">1</a> + <a id="2">2</a> + <a id="3">3</a> + <a>No ID.</a>""") + self.assertSelects(tree.find_all(id=["1", "3", "4"]), + ["1", "3"]) + + def test_find_all_with_regular_expression_attribute_value(self): + # You can pass a regular expression as an attribute value, and + # you'll get tags whose values for that attribute match the + # regular expression. 
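+        # (Editorial sketch, not upstream code: an anchored pattern that
+        # can only match the full attribute value succeeds the same way.)
+        self.assertSelects(
+            tree.find_all("gar", class_=re.compile("^foo bar$")), ["Found it"])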
+        tree = self.soup("""<a id="a">One a.</a>
+                            <a id="aa">Two as.</a>
+                            <a id="ab">Mixed as and bs.</a>
+                            <a id="b">One b.</a>
+                            <a>No ID.</a>""")
+
+        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
+                           ["One a.", "Two as."])
+
+    def test_find_by_name_and_containing_string(self):
+        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
+        a = soup.a
+
+        self.assertEqual([a], soup.find_all("a", text="foo"))
+        self.assertEqual([], soup.find_all("a", text="bar"))
+
+    def test_find_by_name_and_containing_string_when_string_is_buried(self):
+        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
+        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))
+
+    def test_find_by_attribute_and_containing_string(self):
+        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
+        a = soup.a
+
+        self.assertEqual([a], soup.find_all(id=2, text="foo"))
+        self.assertEqual([], soup.find_all(id=1, text="bar"))
+
+
+class TestSmooth(TreeTest):
+    """Test Tag.smooth."""
+
+    def test_smooth(self):
+        soup = self.soup("<div>a</div>")
+        div = soup.div
+        div.append("b")
+        div.append("c")
+        div.append(Comment("Comment 1"))
+        div.append(Comment("Comment 2"))
+        div.append("d")
+        builder = self.default_builder()
+        span = Tag(soup, builder, 'span')
+        span.append('1')
+        span.append('2')
+        div.append(span)
+
+        # At this point the tree has a bunch of adjacent
+        # NavigableStrings. This is normal, but it has no meaning in
+        # terms of HTML, so we may want to smooth things out for
+        # output.
+
+        # Since the <span> tag has two children, its .string is None.
+        self.assertEqual(None, div.span.string)
+
+        self.assertEqual(7, len(div.contents))
+        div.smooth()
+        self.assertEqual(5, len(div.contents))
+
+        # The three strings at the beginning of div.contents have been
+        # merged into one string.
+        #
+        self.assertEqual('abc', div.contents[0])
+
+        # The call is recursive -- the <span> tag was also smoothed.
+        self.assertEqual('12', div.span.string)
+
+        # The two comments have _not_ been merged, even though
+        # comments are strings. Merging comments would change the
+        # meaning of the HTML.
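+        # (Editorial sketch, not upstream code: the comments survive as
+        # distinct Comment nodes instead of being folded into the text.)
+        self.assertTrue(isinstance(div.contents[1], Comment))
+        self.assertTrue(isinstance(div.contents[2], Comment))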
+ self.assertEqual('Comment 1', div.contents[1]) + self.assertEqual('Comment 2', div.contents[2]) + + +class TestIndex(TreeTest): + """Test Tag.index""" + def test_index(self): + tree = self.soup("""<div> + <a>Identical</a> + <b>Not identical</b> + <a>Identical</a> + + <c><d>Identical with child</d></c> + <b>Also not identical</b> + <c><d>Identical with child</d></c> + </div>""") + div = tree.div + for i, element in enumerate(div.contents): + self.assertEqual(i, div.index(element)) + self.assertRaises(ValueError, tree.index, 1) + + +class TestParentOperations(TreeTest): + """Test navigation and searching through an element's parents.""" + + def setUp(self): + super(TestParentOperations, self).setUp() + self.tree = self.soup('''<ul id="empty"></ul> + <ul id="top"> + <ul id="middle"> + <ul id="bottom"> + <b>Start here</b> + </ul> + </ul>''') + self.start = self.tree.b + + + def test_parent(self): + self.assertEqual(self.start.parent['id'], 'bottom') + self.assertEqual(self.start.parent.parent['id'], 'middle') + self.assertEqual(self.start.parent.parent.parent['id'], 'top') + + def test_parent_of_top_tag_is_soup_object(self): + top_tag = self.tree.contents[0] + self.assertEqual(top_tag.parent, self.tree) + + def test_soup_object_has_no_parent(self): + self.assertEqual(None, self.tree.parent) + + def test_find_parents(self): + self.assertSelectsIDs( + self.start.find_parents('ul'), ['bottom', 'middle', 'top']) + self.assertSelectsIDs( + self.start.find_parents('ul', id="middle"), ['middle']) + + def test_find_parent(self): + self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') + self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') + + def test_parent_of_text_element(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.parent.name, 'b') + + def test_text_element_find_parent(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.find_parent('ul')['id'], 'bottom') + + def test_parent_generator(self): + parents = [parent['id'] for parent in self.start.parents + if parent is not None and 'id' in parent.attrs] + self.assertEqual(parents, ['bottom', 'middle', 'top']) + + +class ProximityTest(TreeTest): + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup( + '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>') + + +class TestNextOperations(ProximityTest): + + def setUp(self): + super(TestNextOperations, self).setUp() + self.start = self.tree.b + + def test_next(self): + self.assertEqual(self.start.next_element, "One") + self.assertEqual(self.start.next_element.next_element['id'], "2") + + def test_next_of_last_item_is_none(self): + last = self.tree.find(text="Three") + self.assertEqual(last.next_element, None) + + def test_next_of_root_is_none(self): + # The document root is outside the next/previous chain. 
+ self.assertEqual(self.tree.next_element, None) + + def test_find_all_next(self): + self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) + self.start.find_all_next(id=3) + self.assertSelects(self.start.find_all_next(id=3), ["Three"]) + + def test_find_next(self): + self.assertEqual(self.start.find_next('b')['id'], '2') + self.assertEqual(self.start.find_next(text="Three"), "Three") + + def test_find_next_for_text_element(self): + text = self.tree.find(text="One") + self.assertEqual(text.find_next("b").string, "Two") + self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) + + def test_next_generator(self): + start = self.tree.find(text="Two") + successors = [node for node in start.next_elements] + # There are two successors: the final <b> tag and its text contents. + tag, contents = successors + self.assertEqual(tag['id'], '3') + self.assertEqual(contents, "Three") + +class TestPreviousOperations(ProximityTest): + + def setUp(self): + super(TestPreviousOperations, self).setUp() + self.end = self.tree.find(text="Three") + + def test_previous(self): + self.assertEqual(self.end.previous_element['id'], "3") + self.assertEqual(self.end.previous_element.previous_element, "Two") + + def test_previous_of_first_item_is_none(self): + first = self.tree.find('html') + self.assertEqual(first.previous_element, None) + + def test_previous_of_root_is_none(self): + # The document root is outside the next/previous chain. + # XXX This is broken! + #self.assertEqual(self.tree.previous_element, None) + pass + + def test_find_all_previous(self): + # The <b> tag containing the "Three" node is the predecessor + # of the "Three" node itself, which is why "Three" shows up + # here. + self.assertSelects( + self.end.find_all_previous('b'), ["Three", "Two", "One"]) + self.assertSelects(self.end.find_all_previous(id=1), ["One"]) + + def test_find_previous(self): + self.assertEqual(self.end.find_previous('b')['id'], '3') + self.assertEqual(self.end.find_previous(text="One"), "One") + + def test_find_previous_for_text_element(self): + text = self.tree.find(text="Three") + self.assertEqual(text.find_previous("b").string, "Three") + self.assertSelects( + text.find_all_previous("b"), ["Three", "Two", "One"]) + + def test_previous_generator(self): + start = self.tree.find(text="One") + predecessors = [node for node in start.previous_elements] + + # There are four predecessors: the <b> tag containing "One" + # the <body> tag, the <head> tag, and the <html> tag. + b, body, head, html = predecessors + self.assertEqual(b['id'], '1') + self.assertEqual(body.name, "body") + self.assertEqual(head.name, "head") + self.assertEqual(html.name, "html") + + +class SiblingTest(TreeTest): + + def setUp(self): + super(SiblingTest, self).setUp() + markup = '''<html> + <span id="1"> + <span id="1.1"></span> + </span> + <span id="2"> + <span id="2.1"></span> + </span> + <span id="3"> + <span id="3.1"></span> + </span> + <span id="4"></span> + </html>''' + # All that whitespace looks good but makes the tests more + # difficult. Get rid of it. 
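+        # (Editorial sketch, not upstream code: whitespace between tags
+        # becomes NavigableString siblings, which would offset every
+        # next_sibling/previous_sibling hop in the tests below.)
+        spaced = BeautifulSoup("<i>1</i> <u>2</u>", "html.parser")
+        self.assertEqual(" ", spaced.i.next_sibling)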
+ markup = re.compile(r"\n\s*").sub("", markup) + self.tree = self.soup(markup) + + +class TestNextSibling(SiblingTest): + + def setUp(self): + super(TestNextSibling, self).setUp() + self.start = self.tree.find(id="1") + + def test_next_sibling_of_root_is_none(self): + self.assertEqual(self.tree.next_sibling, None) + + def test_next_sibling(self): + self.assertEqual(self.start.next_sibling['id'], '2') + self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') + + # Note the difference between next_sibling and next_element. + self.assertEqual(self.start.next_element['id'], '1.1') + + def test_next_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.next_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.next_sibling, None) + + last_span = self.tree.find(id="4") + self.assertEqual(last_span.next_sibling, None) + + def test_find_next_sibling(self): + self.assertEqual(self.start.find_next_sibling('span')['id'], '2') + + def test_next_siblings(self): + self.assertSelectsIDs(self.start.find_next_siblings("span"), + ['2', '3', '4']) + + self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) + + def test_next_sibling_for_text_element(self): + soup = self.soup("Foo<b>bar</b>baz") + start = soup.find(text="Foo") + self.assertEqual(start.next_sibling.name, 'b') + self.assertEqual(start.next_sibling.next_sibling, 'baz') + + self.assertSelects(start.find_next_siblings('b'), ['bar']) + self.assertEqual(start.find_next_sibling(text="baz"), "baz") + self.assertEqual(start.find_next_sibling(text="nonesuch"), None) + + +class TestPreviousSibling(SiblingTest): + + def setUp(self): + super(TestPreviousSibling, self).setUp() + self.end = self.tree.find(id="4") + + def test_previous_sibling_of_root_is_none(self): + self.assertEqual(self.tree.previous_sibling, None) + + def test_previous_sibling(self): + self.assertEqual(self.end.previous_sibling['id'], '3') + self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') + + # Note the difference between previous_sibling and previous_element. 
+        self.assertEqual(self.end.previous_element['id'], '3.1')
+
+    def test_previous_sibling_may_not_exist(self):
+        self.assertEqual(self.tree.html.previous_sibling, None)
+
+        nested_span = self.tree.find(id="1.1")
+        self.assertEqual(nested_span.previous_sibling, None)
+
+        first_span = self.tree.find(id="1")
+        self.assertEqual(first_span.previous_sibling, None)
+
+    def test_find_previous_sibling(self):
+        self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')
+
+    def test_previous_siblings(self):
+        self.assertSelectsIDs(self.end.find_previous_siblings("span"),
+                              ['3', '2', '1'])
+
+        self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])
+
+    def test_previous_sibling_for_text_element(self):
+        soup = self.soup("Foo<b>bar</b>baz")
+        start = soup.find(text="baz")
+        self.assertEqual(start.previous_sibling.name, 'b')
+        self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')
+
+        self.assertSelects(start.find_previous_siblings('b'), ['bar'])
+        self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
+        self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)
+
+
+class TestTagCreation(SoupTest):
+    """Test the ability to create new tags."""
+    def test_new_tag(self):
+        soup = self.soup("")
+        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
+        self.assertTrue(isinstance(new_tag, Tag))
+        self.assertEqual("foo", new_tag.name)
+        self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
+        self.assertEqual(None, new_tag.parent)
+
+    def test_tag_inherits_self_closing_rules_from_builder(self):
+        if XML_BUILDER_PRESENT:
+            xml_soup = BeautifulSoup("", "lxml-xml")
+            xml_br = xml_soup.new_tag("br")
+            xml_p = xml_soup.new_tag("p")
+
+            # Both the <br> and <p> tags are empty-element, just because
+            # they have no contents.
+            self.assertEqual(b"<br/>", xml_br.encode())
+            self.assertEqual(b"<p/>", xml_p.encode())
+
+        html_soup = BeautifulSoup("", "html.parser")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
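+        # (Editorial sketch, not upstream code: the builder's verdict is
+        # recorded on each new tag as Tag.can_be_empty_element.)
+        self.assertTrue(html_br.can_be_empty_element)
+        self.assertFalse(html_p.can_be_empty_element)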
+ self.assertEqual(b"<br/>", html_br.encode()) + self.assertEqual(b"<p></p>", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) + + def test_new_string_can_create_navigablestring_subclass(self): + soup = self.soup("") + s = soup.new_string("foo", Comment) + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, Comment)) + +class TestTreeModification(SoupTest): + + def test_attribute_modification(self): + soup = self.soup('<a id="1"></a>') + soup.a['id'] = 2 + self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>')) + del(soup.a['id']) + self.assertEqual(soup.decode(), self.document_for('<a></a>')) + soup.a['id2'] = 'foo' + self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) + + def test_new_tag_creation(self): + builder = builder_registry.lookup('html')() + soup = self.soup("<body></body>", builder=builder) + a = Tag(soup, builder, 'a') + ol = Tag(soup, builder, 'ol') + a['href'] = 'http://foo.com/' + soup.body.insert(0, a) + soup.body.insert(1, ol) + self.assertEqual( + soup.body.encode(), + b'<body><a href="http://foo.com/"></a><ol></ol></body>') + + def test_append_to_contents_moves_tag(self): + doc = """<p id="1">Don't leave me <b>here</b>.</p> + <p id="2">Don\'t leave!</p>""" + soup = self.soup(doc) + second_para = soup.find(id='2') + bold = soup.b + + # Move the <b> tag to the end of the second paragraph. + soup.find(id='2').append(soup.b) + + # The <b> tag is now a child of the second paragraph. + self.assertEqual(bold.parent, second_para) + + self.assertEqual( + soup.decode(), self.document_for( + '<p id="1">Don\'t leave me .</p>\n' + '<p id="2">Don\'t leave!<b>here</b></p>')) + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "<a></a><b><c></c></b>" + soup = self.soup(text) + a = soup.a + new_a = a.replace_with(soup.c) + self.assertEqual(a, new_a) + + def test_unwrap_returns_thing_that_was_replaced(self): + text = "<a><b></b><c></c></a>" + soup = self.soup(text) + a = soup.a + new_a = a.unwrap() + self.assertEqual(a, new_a) + + def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self): + soup = self.soup("<a><b>Foo</b></a><c>Bar</c>") + a = soup.a + a.extract() + self.assertEqual(None, a.parent) + self.assertRaises(ValueError, a.unwrap) + self.assertRaises(ValueError, a.replace_with, soup.c) + + def test_replace_tag_with_itself(self): + text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" + soup = self.soup(text) + c = soup.c + soup.c.replace_with(c) + self.assertEqual(soup.decode(), self.document_for(text)) + + def test_replace_tag_with_its_parent_raises_exception(self): + text = "<a><b></b></a>" + soup = self.soup(text) + self.assertRaises(ValueError, soup.b.replace_with, soup.a) + + def test_insert_tag_into_itself_raises_exception(self): + text = "<a><b></b></a>" + soup = self.soup(text) + self.assertRaises(ValueError, soup.a.insert, 0, soup.a) + + def test_insert_beautifulsoup_object_inserts_children(self): + """Inserting one BeautifulSoup object into another actually inserts all + of its children -- you'll never combine BeautifulSoup objects. 
+        """
+        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
+
+        text = "<p>p2</p><p>p3</p>"
+        to_insert = self.soup(text)
+        soup.insert(1, to_insert)
+
+        for i in soup.descendants:
+            assert not isinstance(i, BeautifulSoup)
+
+        p1, p2, p3, p4 = list(soup.children)
+        self.assertEqual("And now, a word:", p1.string)
+        self.assertEqual("p2", p2.string)
+        self.assertEqual("p3", p3.string)
+        self.assertEqual("And we're back.", p4.string)
+
+
+    def test_replace_with_maintains_next_element_throughout(self):
+        soup = self.soup('<p><a>one</a><b>three</b></p>')
+        a = soup.a
+        b = a.contents[0]
+        # Make it so the <a> tag has two text children.
+        a.insert(1, "two")
+
+        # Now replace each one with the empty string.
+        left, right = a.contents
+        left.replaceWith('')
+        right.replaceWith('')
+
+        # The <b> tag is still connected to the tree.
+        self.assertEqual("three", soup.b.string)
+
+    def test_replace_final_node(self):
+        soup = self.soup("<b>Argh!</b>")
+        soup.find(text="Argh!").replace_with("Hooray!")
+        new_text = soup.find(text="Hooray!")
+        b = soup.b
+        self.assertEqual(new_text.previous_element, b)
+        self.assertEqual(new_text.parent, b)
+        self.assertEqual(new_text.previous_element.next_element, new_text)
+        self.assertEqual(new_text.next_element, None)
+
+    def test_consecutive_text_nodes(self):
+        # A builder should never create two consecutive text nodes,
+        # but if you insert one next to another, Beautiful Soup will
+        # handle it correctly.
+        soup = self.soup("<a><b>Argh!</b><c></c></a>")
+        soup.b.insert(1, "Hooray!")
+
+        self.assertEqual(
+            soup.decode(), self.document_for(
+                "<a><b>Argh!Hooray!</b><c></c></a>"))
+
+        new_text = soup.find(text="Hooray!")
+        self.assertEqual(new_text.previous_element, "Argh!")
+        self.assertEqual(new_text.previous_element.next_element, new_text)
+
+        self.assertEqual(new_text.previous_sibling, "Argh!")
+        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
+
+        self.assertEqual(new_text.next_sibling, None)
+        self.assertEqual(new_text.next_element, soup.c)
+
+    def test_insert_string(self):
+        soup = self.soup("<a></a>")
+        soup.a.insert(0, "bar")
+        soup.a.insert(0, "foo")
+        # The strings were added to the tag.
+        self.assertEqual(["foo", "bar"], soup.a.contents)
+        # And they were converted to NavigableStrings.
+        self.assertEqual(soup.a.contents[0].next_element, "bar")
+
+    def test_insert_tag(self):
+        builder = self.default_builder()
+        soup = self.soup(
+            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
+        magic_tag = Tag(soup, builder, 'magictag')
+        magic_tag.insert(0, "the")
+        soup.a.insert(1, magic_tag)
+
+        self.assertEqual(
+            soup.decode(), self.document_for(
+                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
+
+        # Make sure all the relationships are hooked up correctly.
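+        # (Editorial note, not upstream text: insert() rewires two chains,
+        # the sibling chain and the document-order element chain; the
+        # checks below walk both, one hop at a time.)
+        self.assertEqual(soup.a, magic_tag.parent)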
+ b_tag = soup.b + self.assertEqual(b_tag.next_sibling, magic_tag) + self.assertEqual(magic_tag.previous_sibling, b_tag) + + find = b_tag.find(text="Find") + self.assertEqual(find.next_element, magic_tag) + self.assertEqual(magic_tag.previous_element, find) + + c_tag = soup.c + self.assertEqual(magic_tag.next_sibling, c_tag) + self.assertEqual(c_tag.previous_sibling, magic_tag) + + the = magic_tag.find(text="the") + self.assertEqual(the.parent, magic_tag) + self.assertEqual(the.next_element, c_tag) + self.assertEqual(c_tag.previous_element, the) + + def test_append_child_thats_already_at_the_end(self): + data = "<a><b></b></a>" + soup = self.soup(data) + soup.a.append(soup.b) + self.assertEqual(data, soup.decode()) + + def test_extend(self): + data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>" + soup = self.soup(data) + l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b] + soup.a.extend(l) + self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode()) + + def test_move_tag_to_beginning_of_parent(self): + data = "<a><b></b><c></c><d></d></a>" + soup = self.soup(data) + soup.a.insert(0, soup.d) + self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode()) + + def test_insert_works_on_empty_element_tag(self): + # This is a little strange, since most HTML parsers don't allow + # markup like this to come through. But in general, we don't + # know what the parser would or wouldn't have allowed, so + # I'm letting this succeed for now. + soup = self.soup("<br/>") + soup.br.insert(1, "Contents") + self.assertEqual(str(soup.br), "<br>Contents</br>") + + def test_insert_before(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_before("BAZ") + soup.a.insert_before("QUUX") + self.assertEqual( + soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>")) + + soup.a.insert_before(soup.b) + self.assertEqual( + soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + + # Can't insert an element before itself. + b = soup.b + self.assertRaises(ValueError, b.insert_before, b) + + # Can't insert before if an element has no parent. + b.extract() + self.assertRaises(ValueError, b.insert_before, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_before(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_before("BAZ", " ", "QUUX") + soup.a.insert_before("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>")) + + soup.a.insert_before(soup.b, "FOO") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX")) + + def test_insert_after(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_after("BAZ") + soup.a.insert_after("QUUX") + self.assertEqual( + soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ")) + soup.b.insert_after(soup.a) + self.assertEqual( + soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")) + + # Can't insert an element after itself. + b = soup.b + self.assertRaises(ValueError, b.insert_after, b) + + # Can't insert after if an element has no parent. 
+ b.extract() + self.assertRaises(ValueError, b.insert_after, "nope") + + # Can insert an identical element + soup = self.soup("<a>") + soup.a.insert_before(soup.new_tag("a")) + + def test_insert_multiple_after(self): + soup = self.soup("<a>foo</a><b>bar</b>") + soup.b.insert_after("BAZ", " ", "QUUX") + soup.a.insert_after("QUUX", " ", "BAZ") + self.assertEqual( + soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX")) + soup.b.insert_after(soup.a, "FOO ") + self.assertEqual( + soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX")) + + def test_insert_after_raises_exception_if_after_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_after, tag) + self.assertRaises(NotImplementedError, soup.insert_after, tag) + self.assertRaises(ValueError, tag.insert_after, tag) + + def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_before, tag) + self.assertRaises(NotImplementedError, soup.insert_before, tag) + self.assertRaises(ValueError, tag.insert_before, tag) + + def test_replace_with(self): + soup = self.soup( + "<p>There's <b>no</b> business like <b>show</b> business</p>") + no, show = soup.find_all('b') + show.replace_with(no) + self.assertEqual( + soup.decode(), + self.document_for( + "<p>There's business like <b>no</b> business</p>")) + + self.assertEqual(show.parent, None) + self.assertEqual(no.parent, soup.p) + self.assertEqual(no.next_element, "no") + self.assertEqual(no.next_sibling, " business") + + def test_replace_first_child(self): + data = "<a><b></b><c></c></a>" + soup = self.soup(data) + soup.b.replace_with(soup.c) + self.assertEqual("<a><c></c></a>", soup.decode()) + + def test_replace_last_child(self): + data = "<a><b></b><c></c></a>" + soup = self.soup(data) + soup.c.replace_with(soup.b) + self.assertEqual("<a><b></b></a>", soup.decode()) + + def test_nested_tag_replace_with(self): + soup = self.soup( + """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") + + # Replace the entire <b> tag and its contents ("reserve the + # right") with the <f> tag ("refuse"). + remove_tag = soup.b + move_tag = soup.f + remove_tag.replace_with(move_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "<a>We<f>refuse</f></a><e>to<g>service</g></e>")) + + # The <b> tag is now an orphan. + self.assertEqual(remove_tag.parent, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) + self.assertEqual(remove_tag.next_sibling, None) + self.assertEqual(remove_tag.previous_sibling, None) + + # The <f> tag is now connected to the <a> tag. + self.assertEqual(move_tag.parent, soup.a) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) + self.assertEqual(move_tag.next_sibling, None) + + # The gap where the <f> tag used to be has been mended, and + # the word "to" is now connected to the <g> tag. 
+ to_text = soup.find(text="to") + g_tag = soup.g + self.assertEqual(to_text.next_element, g_tag) + self.assertEqual(to_text.next_sibling, g_tag) + self.assertEqual(g_tag.previous_element, to_text) + self.assertEqual(g_tag.previous_sibling, to_text) + + def test_unwrap(self): + tree = self.soup(""" + <p>Unneeded <em>formatting</em> is unneeded</p> + """) + tree.em.unwrap() + self.assertEqual(tree.em, None) + self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + self.assertEqual(value.decode(), "<b>I wish I was bold.</b>") + self.assertEqual( + soup.decode(), self.document_for("<b>I wish I was bold.</b>")) + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("<b></b>I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual( + soup.decode(), self.document_for("<b>I wish I was bold.</b>")) + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("<b>I like being bold.</b>I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual(2, len(soup.b.contents)) + self.assertEqual( + soup.decode(), self.document_for( + "<b>I like being bold.I wish I was bold.</b>")) + + def test_extract(self): + soup = self.soup( + '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>') + + self.assertEqual(len(soup.body.contents), 3) + extracted = soup.find(id="nav").extract() + + self.assertEqual( + soup.decode(), "<html><body>Some content. More content.</body></html>") + self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>') + + # The extracted tag is now an orphan. + self.assertEqual(len(soup.body.contents), 2) + self.assertEqual(extracted.parent, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(text="Some content. ") + content_2 = soup.find(text=" More content.") + self.assertEqual(content_1.next_element, content_2) + self.assertEqual(content_1.next_sibling, content_2) + self.assertEqual(content_2.previous_element, content_1) + self.assertEqual(content_2.previous_sibling, content_1) + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("<a>foo</a><b>bar</b>") + foo_1 = soup.a.string + bar_1 = soup.b.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the <a> tag, and two + # in the <b> tag. Let's remove the first "foo" and the second + # "bar". 
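+        # extract() removes the exact node object it is called on, so
+        # equal-valued strings are distinguished by identity, not value.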
+        foo_1.extract()
+        bar_2.extract()
+        self.assertEqual(foo_2, soup.a.string)
+        self.assertEqual(bar_2, soup.b.string)
+
+    def test_extract_multiples_of_same_tag(self):
+        soup = self.soup("""
+<html>
+<head>
+<script>foo</script>
+</head>
+<body>
+ <script>bar</script>
+ <a></a>
+</body>
+<script>baz</script>
+</html>""")
+        [soup.script.extract() for i in soup.find_all("script")]
+        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
+
+
+    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
+        soup = self.soup(
+            '<html>\n'
+            '<body>hi</body>\n'
+            '</html>')
+        soup.find('body').extract()
+        self.assertEqual(None, soup.find('body'))
+
+
+    def test_clear(self):
+        """Tag.clear()"""
+        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
+        # clear using extract()
+        a = soup.a
+        soup.p.clear()
+        self.assertEqual(len(soup.p.contents), 0)
+        self.assertTrue(hasattr(a, "contents"))
+
+        # clear using decompose()
+        em = a.em
+        a.clear(decompose=True)
+        self.assertEqual(0, len(em.contents))
+
+    def test_string_set(self):
+        """Tag.string = 'string'"""
+        soup = self.soup("<a></a> <b><c></c></b>")
+        soup.a.string = "foo"
+        self.assertEqual(soup.a.contents, ["foo"])
+        soup.b.string = "bar"
+        self.assertEqual(soup.b.contents, ["bar"])
+
+    def test_string_set_does_not_affect_original_string(self):
+        soup = self.soup("<a><b>foo</b><c>bar</c>")
+        soup.b.string = soup.c.string
+        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
+
+    def test_set_string_preserves_class_of_string(self):
+        soup = self.soup("<a></a>")
+        cdata = CData("foo")
+        soup.a.string = cdata
+        self.assertTrue(isinstance(soup.a.string, CData))
+
+class TestElementObjects(SoupTest):
+    """Test various features of element objects."""
+
+    def test_len(self):
+        """The length of an element is its number of children."""
+        soup = self.soup("<top>1<b>2</b>3</top>")
+
+        # The BeautifulSoup object itself contains one element: the
+        # <top> tag.
+        self.assertEqual(len(soup.contents), 1)
+        self.assertEqual(len(soup), 1)
+
+        # The <top> tag contains three elements: the text node "1", the
+        # <b> tag, and the text node "3".
+        self.assertEqual(len(soup.top), 3)
+        self.assertEqual(len(soup.top.contents), 3)
+
+    def test_member_access_invokes_find(self):
+        """Accessing a Python member .foo invokes find('foo')"""
+        soup = self.soup('<b><i></i></b>')
+        self.assertEqual(soup.b, soup.find('b'))
+        self.assertEqual(soup.b.i, soup.find('b').find('i'))
+        self.assertEqual(soup.a, None)
+
+    def test_deprecated_member_access(self):
+        soup = self.soup('<b><i></i></b>')
+        with warnings.catch_warnings(record=True) as w:
+            tag = soup.bTag
+        self.assertEqual(soup.b, tag)
+        self.assertEqual(
+            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
+            str(w[0].message))
+
+    def test_has_attr(self):
+        """has_attr() checks for the presence of an attribute.
+
+        Please note: has_attr() is different from __in__. has_attr()
+        checks the tag's attributes and __in__ checks the tag's
+        children.
+        """
+        soup = self.soup("<foo attr='bar'>")
+        self.assertTrue(soup.foo.has_attr('attr'))
+        self.assertFalse(soup.foo.has_attr('attr2'))
+
+
+    def test_attributes_come_out_in_alphabetical_order(self):
+        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
+        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
+
+    def test_string(self):
+        # A tag that contains only a text node makes that node
+        # available as .string.
+        soup = self.soup("<b>foo</b>")
+        self.assertEqual(soup.b.string, 'foo')
+
+    def test_empty_tag_has_no_string(self):
+        # A tag with no children has no .string.
+        soup = self.soup("<b></b>")
+        self.assertEqual(soup.b.string, None)
+
+    def test_tag_with_multiple_children_has_no_string(self):
+        # A tag with multiple children has no .string.
+        soup = self.soup("<a>foo<b></b><b></b></b>")
+        self.assertEqual(soup.b.string, None)
+
+        soup = self.soup("<a>foo<b></b>bar</b>")
+        self.assertEqual(soup.b.string, None)
+
+        # Even if all the children are strings, due to trickery,
+        # it won't work--but this would be a good optimization.
+        soup = self.soup("<a>foo</b>")
+        soup.a.insert(1, "bar")
+        self.assertEqual(soup.a.string, None)
+
+    def test_tag_with_recursive_string_has_string(self):
+        # A tag with a single child which has a .string inherits that
+        # .string.
+        soup = self.soup("<a><b>foo</b></a>")
+        self.assertEqual(soup.a.string, "foo")
+        self.assertEqual(soup.string, "foo")
+
+    def test_lack_of_string(self):
+        """Only a tag containing a single text node has a .string."""
+        soup = self.soup("<b>f<i>e</i>o</b>")
+        self.assertFalse(soup.b.string)
+
+        soup = self.soup("<b></b>")
+        self.assertFalse(soup.b.string)
+
+    def test_all_text(self):
+        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
+        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
+        self.assertEqual(soup.a.text, "ar t ")
+        self.assertEqual(soup.a.get_text(strip=True), "art")
+        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
+        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
+
+    def test_get_text_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(soup.get_text(), "foobar")
+
+        self.assertEqual(
+            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
+        self.assertEqual(
+            soup.get_text(types=None), "fooIGNOREbar")
+
+    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+class TestCDAtaListAttributes(SoupTest):
+
+    """Testing cdata-list attributes like 'class'.
+    """
+    def test_single_value_becomes_list(self):
+        soup = self.soup("<a class='foo'>")
+        self.assertEqual(["foo"], soup.a['class'])
+
+    def test_multiple_values_becomes_list(self):
+        soup = self.soup("<a class='foo bar'>")
+        self.assertEqual(["foo", "bar"], soup.a['class'])
+
+    def test_multiple_values_separated_by_weird_whitespace(self):
+        soup = self.soup("<a class='foo\tbar\nbaz'>")
+        self.assertEqual(["foo", "bar", "baz"], soup.a['class'])
+
+    def test_attributes_joined_into_string_on_output(self):
+        soup = self.soup("<a class='foo\tbar'>")
+        self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
+
+    def test_get_attribute_list(self):
+        soup = self.soup("<a id='abc def'>")
+        self.assertEqual(['abc def'], soup.a.get_attribute_list('id'))
+
+    def test_accept_charset(self):
+        soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
+        self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
+
+    def test_cdata_attribute_applying_only_to_one_tag(self):
+        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
+        soup = self.soup(data)
+        # We saw in another test that accept-charset is a cdata-list
+        # attribute for the <form> tag. But it's not a cdata-list
+        # attribute for any other tag.
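+        # On <a> the value therefore stays a single string instead of
+        # being split into a list: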
+ self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) + + def test_string_has_immutable_name_property(self): + string = self.soup("s").string + self.assertEqual(None, string.name) + def t(): + string.name = 'foo' + self.assertRaises(AttributeError, t) + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." + + def setUp(self): + super(TestPersistence, self).setUp() + self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" +"http://www.w3.org/TR/REC-html40/transitional.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title>Beautiful Soup: We called him Tortoise because he taught us.</title> +<link rev="made" href="mailto:[email protected]"> +<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> +<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> +<meta name="author" content="Leonard Richardson"> +</head> +<body> +<a href="foo">foo</a> +<a href="foo"><b>bar</b></a> +</body> +</html>""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), self.tree.decode()) + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + self.assertEqual(copied.decode(), self.tree.decode()) + + def test_copy_preserves_encoding(self): + soup = BeautifulSoup(b'<p> </p>', 'html.parser') + encoding = soup.original_encoding + copy = soup.__copy__() + self.assertEqual("<p> </p>", str(copy)) + self.assertEqual(encoding, copy.original_encoding) + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = "<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "<b>Foo<a></a></b><b>Bar</b>" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertEqual(None, s2.parent) + self.assertEqual(None, s2.next_element) + self.assertNotEqual(None, s1.next_sibling) + self.assertEqual(None, s2.next_sibling) + self.assertEqual(None, s2.previous_element) + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "<b><!--Foo--></b>" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertTrue(isinstance(s2, Comment)) + + def test_copy_entire_soup(self): + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): + html = "<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + self.assertEqual(str(div), str(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. + self.assertFalse(div is div_copy) + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. 
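+        # Accordingly, every navigation pointer on the copy is None: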
+        self.assertEqual(None, div_copy.parent)
+        self.assertEqual(None, div_copy.previous_element)
+        self.assertEqual(None, div_copy.find(string='Bar').next_element)
+        self.assertNotEqual(None, div.find(string='Bar').next_element)
+
+class TestSubstitutions(SoupTest):
+
+    def test_default_formatter_is_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_html(self):
+        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        self.assertEqual(
+            decoded,
+            self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+    def test_formatter_html5(self):
+        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html5")
+        self.assertEqual(
+            decoded,
+            self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+    def test_formatter_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_null(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
+        self.assertEqual(decoded,
+                         self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+
+    def test_formatter_custom(self):
+        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter = lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        self.assertEqual(
+            decoded,
+            self.document_for("<b><FOO></b><b>BAR</b><br/>"))
+
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = '<a href="http://a.com?a=b&c=é">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+        self.assertEqual(expect_minimal, a.decode())
+        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        self.assertEqual(expect_html, a.decode(formatter="html"))
+
+        self.assertEqual(markup, a.decode(formatter=None))
+        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
+
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        self.assertEqual(
+            '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
+            soup.div.prettify())
+
+    def test_prettify_accepts_formatter_function(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
+        pretty = soup.prettify(formatter = lambda x: x.upper())
+        self.assertTrue("FOO" in pretty)
+
+    def test_prettify_outputs_unicode_by_default(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(str, type(soup.prettify()))
+
+    def test_prettify_can_encode_data(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(bytes, type(soup.prettify("utf-8")))
+
+    def test_html_entity_substitution_off_by_default(self):
+        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        self.assertEqual(encoded, markup.encode('utf-8'))
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset appears unchanged.
+        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        self.assertTrue(b"charset=utf-8" in utf_8)
+
+        euc_jp = soup.encode("euc_jp")
+        self.assertTrue(b"charset=euc_jp" in euc_jp)
+
+        shift_jis = soup.encode("shift-jis")
+        self.assertTrue(b"charset=shift-jis" in shift_jis)
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        self.assertTrue("charset=utf-16" in utf_16_u)
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                  'http-equiv="Content-type"/></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
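+        # With parse_only, the <meta> tag never enters the tree at all,
+        # so there is nothing for encoding substitution to rewrite.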
+        strainer = SoupStrainer('pre')
+        soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(soup.contents[0].name, 'pre')
+
+class TestEncoding(SoupTest):
+    """Test the ability to encode objects into strings."""
+
+    def test_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.string.encode("utf-8"),
+                         "\N{SNOWMAN}".encode("utf-8"))
+
+    def test_tag_containing_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            soup.b.encode("utf-8"), html.encode("utf-8"))
+
+    def test_encoding_substitutes_unrecognized_characters_by_default(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
+
+    def test_encoding_can_be_made_strict(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertRaises(
+            UnicodeEncodeError, soup.encode, "ascii", errors="strict")
+
+    def test_decode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
+
+    def test_encode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+                encoding="utf8"))
+
+    def test_deprecated_renderContents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+
+    def test_repr(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        if PY3K:
+            self.assertEqual(html, repr(soup))
+        else:
+            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+
+class TestFormatter(SoupTest):
+
+    def test_sort_attributes(self):
+        # Test the ability to override Formatter.attributes() to,
+        # e.g., disable the normal sorting of attributes.
+        class UnsortedFormatter(Formatter):
+            def attributes(self, tag):
+                self.called_with = tag
+                for k, v in sorted(tag.attrs.items()):
+                    if k == 'ignore':
+                        continue
+                    yield k, v
+
+        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
+        formatter = UnsortedFormatter()
+        decoded = soup.decode(formatter=formatter)
+
+        # attributes() was called on the <p> tag. It filtered out one
+        # attribute and sorted the other two.
+        self.assertEqual(formatter.called_with, soup.p)
+        self.assertEqual('<p aval="2" cval="1"></p>', decoded)
+
+
+class TestNavigableStringSubclasses(SoupTest):
+
+    def test_cdata(self):
+        # None of the current builders turn CDATA sections into CData
+        # objects, but you can create them manually.
+        soup = self.soup("")
+        cdata = CData("foo")
+        soup.insert(1, cdata)
+        self.assertEqual(str(soup), "<![CDATA[foo]]>")
+        self.assertEqual(soup.find(text="foo"), "foo")
+        self.assertEqual(soup.contents[0], "foo")
+
+    def test_cdata_is_never_formatted(self):
+        """Text inside a CData object is passed into the formatter.
+
+        But the return value is ignored.
+        """
+
+        self.count = 0
+        def increment(*args):
+            self.count += 1
+            return "BITTER FAILURE"
+
+        soup = self.soup("")
+        cdata = CData("<><><>")
+        soup.insert(1, cdata)
+        self.assertEqual(
+            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
+        self.assertEqual(1, self.count)
+
+    def test_doctype_ends_in_newline(self):
+        # Unlike other NavigableString subclasses, a DOCTYPE always ends
+        # in a newline.
+ doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + + def test_declaration(self): + d = Declaration("foo") + self.assertEqual("<?foo?>", d.output_ready()) + +class TestSoupSelector(TreeTest): + + HTML = """ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" +"http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<title>The title</title> +<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> +</head> +<body> +<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag> +<div id="main" class="fancy"> +<div id="inner"> +<h1 id="header1">An H1</h1> +<p>Some text</p> +<p class="onep" id="p1">Some more text</p> +<h2 id="header2">An H2</h2> +<p class="class1 class2 class3" id="pmulti">Another</p> +<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> +<h2 id="header3">Another H2</h2> +<a id="me" href="http://simonwillison.net/" rel="me">me</a> +<span class="s1"> +<a href="#" id="s1a1">span1a1</a> +<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> +<span class="span2"> +<a href="#" id="s2a1">span2a1</a> +</span> +<span class="span3"></span> +<custom-dashed-tag class="dashed" id="dash2"/> +<div data-tag="dashedvalue" id="data1"/> +</span> +</div> +<x id="xid"> +<z id="zida"/> +<z id="zidab"/> +<z id="zidac"/> +</x> +<y id="yid"> +<z id="zidb"/> +</y> +<p lang="en" id="lang-en">English</p> +<p lang="en-gb" id="lang-en-gb">English UK</p> +<p lang="en-us" id="lang-en-us">English US</p> +<p lang="fr" id="lang-fr">French</p> +</div> + +<div id="footer"> +</div> +""" + + def setUp(self): + self.soup = BeautifulSoup(self.HTML, 'html.parser') + + def assertSelects(self, selector, expected_ids, **kwargs): + el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] + el_ids.sort() + expected_ids.sort() + self.assertEqual(expected_ids, el_ids, + "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + ) + + assertSelect = assertSelects + + def assertSelectMultiple(self, *tests): + for selector, expected_ids in tests: + self.assertSelect(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') + self.assertEqual(els[0].contents, ['The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') + self.assertEqual(len(els), 4) + for div in els: + self.assertEqual(div.name, 'div') + + el = self.soup.select_one('div') + self.assertEqual('main', el['id']) + + def test_select_one_returns_none_if_no_match(self): + match = self.soup.select_one('nonexistenttag') + self.assertEqual(None, match) + + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assertSelects('div div', ['inner', 'data1']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) + + + def test_limit(self): + self.assertSelects('html div', ['main'], limit=1) + self.assertSelects('html body div', ['inner', 'main'], limit=2) + self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'], + limit=10) + + def test_tag_no_match(self): + self.assertEqual(len(self.soup.select('del')), 0) + + def test_invalid_tag(self): + self.assertRaises(SyntaxError, self.soup.select, 'tag%t') + + def test_select_dashed_tag_ids(self): + self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) + + def 
test_select_dashed_by_id(self): + dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') + self.assertEqual(dashed[0].name, 'custom-dashed-tag') + self.assertEqual(dashed[0]['id'], 'dash2') + + def test_dashed_tag_text(self): + self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.') + + def test_select_dashed_matches_find_all(self): + self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) + + def test_header_tags(self): + self.assertSelectMultiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'p') + self.assertEqual(els[0]['class'], ['onep']) + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + self.assertEqual(len(els), 0) + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assertSelects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + self.assertEqual(len(els), 0) + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + self.assertEqual(len(els), 3) + for el in els: + self.assertEqual(el.name, 'p') + self.assertEqual(els[1]['class'], ['onep']) + self.assertFalse(els[0].has_attr('class')) + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + self.assertEqual(len(self.soup.select(selector)), 0) + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assertSelects(selector, ['pmulti']) + + def test_multi_class_selection(self): + for selector in ('.class1.class3', '.class3.class2', + '.class1.class2.class3'): + self.assertSelects(selector, ['pmulti']) + + def test_child_selector(self): + self.assertSelects('.s1 > a', ['s1a1', 's1a2']) + self.assertSelects('.s1 > a span', ['s1a2s1']) + + def test_child_selector_id(self): + self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) + + def test_attribute_equals(self): + self.assertSelectMultiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + ('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assertSelectMultiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assertSelectMultiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', ['me', 'main']), + 
('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ('div[data-tag^="dashed"]', ['data1']) + ) + + def test_attribute_endswith(self): + self.assertSelectMultiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), + ('div[id$="1"]', ['data1']), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assertSelectMultiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), + ('div[id*="1"]', ['data1']), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ('div[data-tag*="edval"]', ['data1']) + ) + + def test_attribute_exact_or_hypen(self): + self.assertSelectMultiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assertSelectMultiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ('div[data-tag]', ['data1']) + ) + + def test_quoted_space_in_selector_name(self): + html = """<div style="display: wrong">nope</div> + <div style="display: right">yes</div> + """ + soup = BeautifulSoup(html, 'html.parser') + [chosen] = soup.select('div[style="display: right"]') + self.assertEqual("yes", chosen.string) + + def test_unsupported_pseudoclass(self): + self.assertRaises( + NotImplementedError, self.soup.select, "a:no-such-pseudoclass") + + self.assertRaises( + SyntaxError, self.soup.select, "a:nth-of-type(a)") + + def test_nth_of_type(self): + # Try to select first paragraph + els = self.soup.select('div#inner p:nth-of-type(1)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, 'Some text') + + # Try to select third paragraph + els = self.soup.select('div#inner p:nth-of-type(3)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, 'Another') + + # Try to select (non-existent!) fourth paragraph + els = self.soup.select('div#inner p:nth-of-type(4)') + self.assertEqual(len(els), 0) + + # Zero will select no tags. + els = self.soup.select('div p:nth-of-type(0)') + self.assertEqual(len(els), 0) + + def test_nth_of_type_direct_descendant(self): + els = self.soup.select('div#inner > p:nth-of-type(1)') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].string, 'Some text') + + def test_id_child_selector_nth_of_type(self): + self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. 
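+        # select() on a tag is scoped to that tag's descendants, so
+        # matches elsewhere in the document are not returned.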
+ inner = self.soup.find("div", id="main") + selected = inner.select("div") + # The <div id="inner"> tag was selected. The <div id="footer"> + # tag was not. + self.assertSelectsIDs(selected, ['inner', 'data1']) + + def test_overspecified_child_id(self): + self.assertSelects(".fancy #inner", ['inner']) + self.assertSelects(".normal #inner", []) + + def test_adjacent_sibling_selector(self): + self.assertSelects('#p1 + h2', ['header2']) + self.assertSelects('#p1 + h2 + p', ['pmulti']) + self.assertSelects('#p1 + #header2 + .class1', ['pmulti']) + self.assertEqual([], self.soup.select('#p1 + p')) + + def test_general_sibling_selector(self): + self.assertSelects('#p1 ~ h2', ['header2', 'header3']) + self.assertSelects('#p1 ~ #header2', ['header2']) + self.assertSelects('#p1 ~ h2 + a', ['me']) + self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me']) + self.assertEqual([], self.soup.select('#inner ~ h2')) + + def test_dangling_combinator(self): + self.assertRaises(SyntaxError, self.soup.select, 'h1 >') + + def test_sibling_combinator_wont_select_same_tag_twice(self): + self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) + + # Test the selector grouping operator (the comma) + def test_multiple_select(self): + self.assertSelects('x, y', ['xid', 'yid']) + + def test_multiple_select_with_no_space(self): + self.assertSelects('x,y', ['xid', 'yid']) + + def test_multiple_select_with_more_space(self): + self.assertSelects('x, y', ['xid', 'yid']) + + def test_multiple_select_duplicated(self): + self.assertSelects('x, x', ['xid']) + + def test_multiple_select_sibling(self): + self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr']) + + def test_multiple_select_tag_and_direct_descendant(self): + self.assertSelects('x, y > z', ['xid', 'zidb']) + + def test_multiple_select_direct_descendant_and_tags(self): + self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) + + def test_multiple_select_indirect_descendant(self): + self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) + + def test_invalid_multiple_select(self): + self.assertRaises(SyntaxError, self.soup.select, ',x, y') + self.assertRaises(SyntaxError, self.soup.select, 'x,,y') + + def test_multiple_select_attrs(self): + self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) + + def test_multiple_select_ids(self): + self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab']) + + def test_multiple_select_nested(self): + self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) + + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' + soup = BeautifulSoup(markup, 'html.parser') + selected = soup.select(".c1, .c2") + self.assertEqual(3, len(selected)) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. 
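+        # (Only membership is asserted below, since the ordering of the
+        # two result sets differs.)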
+ for element in soup.find_all(class_=['c1', 'c2']): + assert element in selected diff --git a/libs3/yaml/__init__.py b/libs3/yaml/__init__.py new file mode 100644 index 000000000..5df0bb5fd --- /dev/null +++ b/libs3/yaml/__init__.py @@ -0,0 +1,402 @@ + +from .error import * + +from .tokens import * +from .events import * +from .nodes import * + +from .loader import * +from .dumper import * + +__version__ = '5.1' +try: + from .cyaml import * + __with_libyaml__ = True +except ImportError: + __with_libyaml__ = False + +import io + +#------------------------------------------------------------------------------ +# Warnings control +#------------------------------------------------------------------------------ + +# 'Global' warnings state: +_warnings_enabled = { + 'YAMLLoadWarning': True, +} + +# Get or set global warnings' state +def warnings(settings=None): + if settings is None: + return _warnings_enabled + + if type(settings) is dict: + for key in settings: + if key in _warnings_enabled: + _warnings_enabled[key] = settings[key] + +# Warn when load() is called without Loader=... +class YAMLLoadWarning(RuntimeWarning): + pass + +def load_warning(method): + if _warnings_enabled['YAMLLoadWarning'] is False: + return + + import warnings + + message = ( + "calling yaml.%s() without Loader=... is deprecated, as the " + "default Loader is unsafe. Please read " + "https://msg.pyyaml.org/load for full details." + ) % method + + warnings.warn(message, YAMLLoadWarning, stacklevel=3) + +#------------------------------------------------------------------------------ +def scan(stream, Loader=Loader): + """ + Scan a YAML stream and produce scanning tokens. + """ + loader = Loader(stream) + try: + while loader.check_token(): + yield loader.get_token() + finally: + loader.dispose() + +def parse(stream, Loader=Loader): + """ + Parse a YAML stream and produce parsing events. + """ + loader = Loader(stream) + try: + while loader.check_event(): + yield loader.get_event() + finally: + loader.dispose() + +def compose(stream, Loader=Loader): + """ + Parse the first YAML document in a stream + and produce the corresponding representation tree. + """ + loader = Loader(stream) + try: + return loader.get_single_node() + finally: + loader.dispose() + +def compose_all(stream, Loader=Loader): + """ + Parse all YAML documents in a stream + and produce corresponding representation trees. + """ + loader = Loader(stream) + try: + while loader.check_node(): + yield loader.get_node() + finally: + loader.dispose() + +def load(stream, Loader=None): + """ + Parse the first YAML document in a stream + and produce the corresponding Python object. + """ + if Loader is None: + load_warning('load') + Loader = FullLoader + + loader = Loader(stream) + try: + return loader.get_single_data() + finally: + loader.dispose() + +def load_all(stream, Loader=None): + """ + Parse all YAML documents in a stream + and produce corresponding Python objects. + """ + if Loader is None: + load_warning('load_all') + Loader = FullLoader + + loader = Loader(stream) + try: + while loader.check_data(): + yield loader.get_data() + finally: + loader.dispose() + +def full_load(stream): + """ + Parse the first YAML document in a stream + and produce the corresponding Python object. + + Resolve all tags except those known to be + unsafe on untrusted input. + """ + return load(stream, FullLoader) + +def full_load_all(stream): + """ + Parse all YAML documents in a stream + and produce corresponding Python objects. 
+ + Resolve all tags except those known to be + unsafe on untrusted input. + """ + return load_all(stream, FullLoader) + +def safe_load(stream): + """ + Parse the first YAML document in a stream + and produce the corresponding Python object. + + Resolve only basic YAML tags. This is known + to be safe for untrusted input. + """ + return load(stream, SafeLoader) + +def safe_load_all(stream): + """ + Parse all YAML documents in a stream + and produce corresponding Python objects. + + Resolve only basic YAML tags. This is known + to be safe for untrusted input. + """ + return load_all(stream, SafeLoader) + +def unsafe_load(stream): + """ + Parse the first YAML document in a stream + and produce the corresponding Python object. + + Resolve all tags, even those known to be + unsafe on untrusted input. + """ + return load(stream, UnsafeLoader) + +def unsafe_load_all(stream): + """ + Parse all YAML documents in a stream + and produce corresponding Python objects. + + Resolve all tags, even those known to be + unsafe on untrusted input. + """ + return load_all(stream, UnsafeLoader) + +def emit(events, stream=None, Dumper=Dumper, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None): + """ + Emit YAML parsing events into a stream. + If stream is None, return the produced string instead. + """ + getvalue = None + if stream is None: + stream = io.StringIO() + getvalue = stream.getvalue + dumper = Dumper(stream, canonical=canonical, indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break) + try: + for event in events: + dumper.emit(event) + finally: + dumper.dispose() + if getvalue: + return getvalue() + +def serialize_all(nodes, stream=None, Dumper=Dumper, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None): + """ + Serialize a sequence of representation trees into a YAML stream. + If stream is None, return the produced string instead. + """ + getvalue = None + if stream is None: + if encoding is None: + stream = io.StringIO() + else: + stream = io.BytesIO() + getvalue = stream.getvalue + dumper = Dumper(stream, canonical=canonical, indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break, + encoding=encoding, version=version, tags=tags, + explicit_start=explicit_start, explicit_end=explicit_end) + try: + dumper.open() + for node in nodes: + dumper.serialize(node) + dumper.close() + finally: + dumper.dispose() + if getvalue: + return getvalue() + +def serialize(node, stream=None, Dumper=Dumper, **kwds): + """ + Serialize a representation tree into a YAML stream. + If stream is None, return the produced string instead. + """ + return serialize_all([node], stream, Dumper=Dumper, **kwds) + +def dump_all(documents, stream=None, Dumper=Dumper, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + """ + Serialize a sequence of Python objects into a YAML stream. + If stream is None, return the produced string instead. 
+ """ + getvalue = None + if stream is None: + if encoding is None: + stream = io.StringIO() + else: + stream = io.BytesIO() + getvalue = stream.getvalue + dumper = Dumper(stream, default_style=default_style, + default_flow_style=default_flow_style, + canonical=canonical, indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break, + encoding=encoding, version=version, tags=tags, + explicit_start=explicit_start, explicit_end=explicit_end, sort_keys=sort_keys) + try: + dumper.open() + for data in documents: + dumper.represent(data) + dumper.close() + finally: + dumper.dispose() + if getvalue: + return getvalue() + +def dump(data, stream=None, Dumper=Dumper, **kwds): + """ + Serialize a Python object into a YAML stream. + If stream is None, return the produced string instead. + """ + return dump_all([data], stream, Dumper=Dumper, **kwds) + +def safe_dump_all(documents, stream=None, **kwds): + """ + Serialize a sequence of Python objects into a YAML stream. + Produce only basic YAML tags. + If stream is None, return the produced string instead. + """ + return dump_all(documents, stream, Dumper=SafeDumper, **kwds) + +def safe_dump(data, stream=None, **kwds): + """ + Serialize a Python object into a YAML stream. + Produce only basic YAML tags. + If stream is None, return the produced string instead. + """ + return dump_all([data], stream, Dumper=SafeDumper, **kwds) + +def add_implicit_resolver(tag, regexp, first=None, + Loader=Loader, Dumper=Dumper): + """ + Add an implicit scalar detector. + If an implicit scalar value matches the given regexp, + the corresponding tag is assigned to the scalar. + first is a sequence of possible initial characters or None. + """ + Loader.add_implicit_resolver(tag, regexp, first) + Dumper.add_implicit_resolver(tag, regexp, first) + +def add_path_resolver(tag, path, kind=None, Loader=Loader, Dumper=Dumper): + """ + Add a path based resolver for the given tag. + A path is a list of keys that forms a path + to a node in the representation tree. + Keys can be string values, integers, or None. + """ + Loader.add_path_resolver(tag, path, kind) + Dumper.add_path_resolver(tag, path, kind) + +def add_constructor(tag, constructor, Loader=Loader): + """ + Add a constructor for the given tag. + Constructor is a function that accepts a Loader instance + and a node object and produces the corresponding Python object. + """ + Loader.add_constructor(tag, constructor) + +def add_multi_constructor(tag_prefix, multi_constructor, Loader=Loader): + """ + Add a multi-constructor for the given tag prefix. + Multi-constructor is called for a node if its tag starts with tag_prefix. + Multi-constructor accepts a Loader instance, a tag suffix, + and a node object and produces the corresponding Python object. + """ + Loader.add_multi_constructor(tag_prefix, multi_constructor) + +def add_representer(data_type, representer, Dumper=Dumper): + """ + Add a representer for the given type. + Representer is a function accepting a Dumper instance + and an instance of the given data type + and producing the corresponding representation node. + """ + Dumper.add_representer(data_type, representer) + +def add_multi_representer(data_type, multi_representer, Dumper=Dumper): + """ + Add a representer for the given type. + Multi-representer is a function accepting a Dumper instance + and an instance of the given data type or subtype + and producing the corresponding representation node. 
+ """ + Dumper.add_multi_representer(data_type, multi_representer) + +class YAMLObjectMetaclass(type): + """ + The metaclass for YAMLObject. + """ + def __init__(cls, name, bases, kwds): + super(YAMLObjectMetaclass, cls).__init__(name, bases, kwds) + if 'yaml_tag' in kwds and kwds['yaml_tag'] is not None: + cls.yaml_loader.add_constructor(cls.yaml_tag, cls.from_yaml) + cls.yaml_dumper.add_representer(cls, cls.to_yaml) + +class YAMLObject(metaclass=YAMLObjectMetaclass): + """ + An object that can dump itself to a YAML stream + and load itself from a YAML stream. + """ + + __slots__ = () # no direct instantiation, so allow immutable subclasses + + yaml_loader = Loader + yaml_dumper = Dumper + + yaml_tag = None + yaml_flow_style = None + + @classmethod + def from_yaml(cls, loader, node): + """ + Convert a representation node to a Python object. + """ + return loader.construct_yaml_object(node, cls) + + @classmethod + def to_yaml(cls, dumper, data): + """ + Convert a Python object to a representation node. + """ + return dumper.represent_yaml_object(cls.yaml_tag, data, cls, + flow_style=cls.yaml_flow_style) + diff --git a/libs3/yaml/composer.py b/libs3/yaml/composer.py new file mode 100644 index 000000000..6d15cb40e --- /dev/null +++ b/libs3/yaml/composer.py @@ -0,0 +1,139 @@ + +__all__ = ['Composer', 'ComposerError'] + +from .error import MarkedYAMLError +from .events import * +from .nodes import * + +class ComposerError(MarkedYAMLError): + pass + +class Composer: + + def __init__(self): + self.anchors = {} + + def check_node(self): + # Drop the STREAM-START event. + if self.check_event(StreamStartEvent): + self.get_event() + + # If there are more documents available? + return not self.check_event(StreamEndEvent) + + def get_node(self): + # Get the root node of the next document. + if not self.check_event(StreamEndEvent): + return self.compose_document() + + def get_single_node(self): + # Drop the STREAM-START event. + self.get_event() + + # Compose a document if the stream is not empty. + document = None + if not self.check_event(StreamEndEvent): + document = self.compose_document() + + # Ensure that the stream contains no more documents. + if not self.check_event(StreamEndEvent): + event = self.get_event() + raise ComposerError("expected a single document in the stream", + document.start_mark, "but found another document", + event.start_mark) + + # Drop the STREAM-END event. + self.get_event() + + return document + + def compose_document(self): + # Drop the DOCUMENT-START event. + self.get_event() + + # Compose the root node. + node = self.compose_node(None, None) + + # Drop the DOCUMENT-END event. 
+ self.get_event() + + self.anchors = {} + return node + + def compose_node(self, parent, index): + if self.check_event(AliasEvent): + event = self.get_event() + anchor = event.anchor + if anchor not in self.anchors: + raise ComposerError(None, None, "found undefined alias %r" + % anchor, event.start_mark) + return self.anchors[anchor] + event = self.peek_event() + anchor = event.anchor + if anchor is not None: + if anchor in self.anchors: + raise ComposerError("found duplicate anchor %r; first occurrence" + % anchor, self.anchors[anchor].start_mark, + "second occurrence", event.start_mark) + self.descend_resolver(parent, index) + if self.check_event(ScalarEvent): + node = self.compose_scalar_node(anchor) + elif self.check_event(SequenceStartEvent): + node = self.compose_sequence_node(anchor) + elif self.check_event(MappingStartEvent): + node = self.compose_mapping_node(anchor) + self.ascend_resolver() + return node + + def compose_scalar_node(self, anchor): + event = self.get_event() + tag = event.tag + if tag is None or tag == '!': + tag = self.resolve(ScalarNode, event.value, event.implicit) + node = ScalarNode(tag, event.value, + event.start_mark, event.end_mark, style=event.style) + if anchor is not None: + self.anchors[anchor] = node + return node + + def compose_sequence_node(self, anchor): + start_event = self.get_event() + tag = start_event.tag + if tag is None or tag == '!': + tag = self.resolve(SequenceNode, None, start_event.implicit) + node = SequenceNode(tag, [], + start_event.start_mark, None, + flow_style=start_event.flow_style) + if anchor is not None: + self.anchors[anchor] = node + index = 0 + while not self.check_event(SequenceEndEvent): + node.value.append(self.compose_node(node, index)) + index += 1 + end_event = self.get_event() + node.end_mark = end_event.end_mark + return node + + def compose_mapping_node(self, anchor): + start_event = self.get_event() + tag = start_event.tag + if tag is None or tag == '!': + tag = self.resolve(MappingNode, None, start_event.implicit) + node = MappingNode(tag, [], + start_event.start_mark, None, + flow_style=start_event.flow_style) + if anchor is not None: + self.anchors[anchor] = node + while not self.check_event(MappingEndEvent): + #key_event = self.peek_event() + item_key = self.compose_node(node, None) + #if item_key in node.value: + # raise ComposerError("while composing a mapping", start_event.start_mark, + # "found duplicate key", key_event.start_mark) + item_value = self.compose_node(node, item_key) + #node.value[item_key] = item_value + node.value.append((item_key, item_value)) + end_event = self.get_event() + node.end_mark = end_event.end_mark + return node + diff --git a/libs3/yaml/constructor.py b/libs3/yaml/constructor.py new file mode 100644 index 000000000..34fc1ae92 --- /dev/null +++ b/libs3/yaml/constructor.py @@ -0,0 +1,720 @@ + +__all__ = [ + 'BaseConstructor', + 'SafeConstructor', + 'FullConstructor', + 'UnsafeConstructor', + 'Constructor', + 'ConstructorError' +] + +from .error import * +from .nodes import * + +import collections.abc, datetime, base64, binascii, re, sys, types + +class ConstructorError(MarkedYAMLError): + pass + +class BaseConstructor: + + yaml_constructors = {} + yaml_multi_constructors = {} + + def __init__(self): + self.constructed_objects = {} + self.recursive_objects = {} + self.state_generators = [] + self.deep_construct = False + + def check_data(self): + # If there are more documents available? 
+ return self.check_node() + + def get_data(self): + # Construct and return the next document. + if self.check_node(): + return self.construct_document(self.get_node()) + + def get_single_data(self): + # Ensure that the stream contains a single document and construct it. + node = self.get_single_node() + if node is not None: + return self.construct_document(node) + return None + + def construct_document(self, node): + data = self.construct_object(node) + while self.state_generators: + state_generators = self.state_generators + self.state_generators = [] + for generator in state_generators: + for dummy in generator: + pass + self.constructed_objects = {} + self.recursive_objects = {} + self.deep_construct = False + return data + + def construct_object(self, node, deep=False): + if node in self.constructed_objects: + return self.constructed_objects[node] + if deep: + old_deep = self.deep_construct + self.deep_construct = True + if node in self.recursive_objects: + raise ConstructorError(None, None, + "found unconstructable recursive node", node.start_mark) + self.recursive_objects[node] = None + constructor = None + tag_suffix = None + if node.tag in self.yaml_constructors: + constructor = self.yaml_constructors[node.tag] + else: + for tag_prefix in self.yaml_multi_constructors: + if node.tag.startswith(tag_prefix): + tag_suffix = node.tag[len(tag_prefix):] + constructor = self.yaml_multi_constructors[tag_prefix] + break + else: + if None in self.yaml_multi_constructors: + tag_suffix = node.tag + constructor = self.yaml_multi_constructors[None] + elif None in self.yaml_constructors: + constructor = self.yaml_constructors[None] + elif isinstance(node, ScalarNode): + constructor = self.__class__.construct_scalar + elif isinstance(node, SequenceNode): + constructor = self.__class__.construct_sequence + elif isinstance(node, MappingNode): + constructor = self.__class__.construct_mapping + if tag_suffix is None: + data = constructor(self, node) + else: + data = constructor(self, tag_suffix, node) + if isinstance(data, types.GeneratorType): + generator = data + data = next(generator) + if self.deep_construct: + for dummy in generator: + pass + else: + self.state_generators.append(generator) + self.constructed_objects[node] = data + del self.recursive_objects[node] + if deep: + self.deep_construct = old_deep + return data + + def construct_scalar(self, node): + if not isinstance(node, ScalarNode): + raise ConstructorError(None, None, + "expected a scalar node, but found %s" % node.id, + node.start_mark) + return node.value + + def construct_sequence(self, node, deep=False): + if not isinstance(node, SequenceNode): + raise ConstructorError(None, None, + "expected a sequence node, but found %s" % node.id, + node.start_mark) + return [self.construct_object(child, deep=deep) + for child in node.value] + + def construct_mapping(self, node, deep=False): + if not isinstance(node, MappingNode): + raise ConstructorError(None, None, + "expected a mapping node, but found %s" % node.id, + node.start_mark) + mapping = {} + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + if not isinstance(key, collections.abc.Hashable): + raise ConstructorError("while constructing a mapping", node.start_mark, + "found unhashable key", key_node.start_mark) + value = self.construct_object(value_node, deep=deep) + mapping[key] = value + return mapping + + def construct_pairs(self, node, deep=False): + if not isinstance(node, MappingNode): + raise ConstructorError(None, None, + "expected 
a mapping node, but found %s" % node.id, + node.start_mark) + pairs = [] + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + value = self.construct_object(value_node, deep=deep) + pairs.append((key, value)) + return pairs + + @classmethod + def add_constructor(cls, tag, constructor): + if not 'yaml_constructors' in cls.__dict__: + cls.yaml_constructors = cls.yaml_constructors.copy() + cls.yaml_constructors[tag] = constructor + + @classmethod + def add_multi_constructor(cls, tag_prefix, multi_constructor): + if not 'yaml_multi_constructors' in cls.__dict__: + cls.yaml_multi_constructors = cls.yaml_multi_constructors.copy() + cls.yaml_multi_constructors[tag_prefix] = multi_constructor + +class SafeConstructor(BaseConstructor): + + def construct_scalar(self, node): + if isinstance(node, MappingNode): + for key_node, value_node in node.value: + if key_node.tag == 'tag:yaml.org,2002:value': + return self.construct_scalar(value_node) + return super().construct_scalar(node) + + def flatten_mapping(self, node): + merge = [] + index = 0 + while index < len(node.value): + key_node, value_node = node.value[index] + if key_node.tag == 'tag:yaml.org,2002:merge': + del node.value[index] + if isinstance(value_node, MappingNode): + self.flatten_mapping(value_node) + merge.extend(value_node.value) + elif isinstance(value_node, SequenceNode): + submerge = [] + for subnode in value_node.value: + if not isinstance(subnode, MappingNode): + raise ConstructorError("while constructing a mapping", + node.start_mark, + "expected a mapping for merging, but found %s" + % subnode.id, subnode.start_mark) + self.flatten_mapping(subnode) + submerge.append(subnode.value) + submerge.reverse() + for value in submerge: + merge.extend(value) + else: + raise ConstructorError("while constructing a mapping", node.start_mark, + "expected a mapping or list of mappings for merging, but found %s" + % value_node.id, value_node.start_mark) + elif key_node.tag == 'tag:yaml.org,2002:value': + key_node.tag = 'tag:yaml.org,2002:str' + index += 1 + else: + index += 1 + if merge: + node.value = merge + node.value + + def construct_mapping(self, node, deep=False): + if isinstance(node, MappingNode): + self.flatten_mapping(node) + return super().construct_mapping(node, deep=deep) + + def construct_yaml_null(self, node): + self.construct_scalar(node) + return None + + bool_values = { + 'yes': True, + 'no': False, + 'true': True, + 'false': False, + 'on': True, + 'off': False, + } + + def construct_yaml_bool(self, node): + value = self.construct_scalar(node) + return self.bool_values[value.lower()] + + def construct_yaml_int(self, node): + value = self.construct_scalar(node) + value = value.replace('_', '') + sign = +1 + if value[0] == '-': + sign = -1 + if value[0] in '+-': + value = value[1:] + if value == '0': + return 0 + elif value.startswith('0b'): + return sign*int(value[2:], 2) + elif value.startswith('0x'): + return sign*int(value[2:], 16) + elif value[0] == '0': + return sign*int(value, 8) + elif ':' in value: + digits = [int(part) for part in value.split(':')] + digits.reverse() + base = 1 + value = 0 + for digit in digits: + value += digit*base + base *= 60 + return sign*value + else: + return sign*int(value) + + inf_value = 1e300 + while inf_value != inf_value*inf_value: + inf_value *= inf_value + nan_value = -inf_value/inf_value # Trying to make a quiet NaN (like C99). 
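+    # Worked example for the sexagesimal branch of construct_yaml_int
+    # above: "1:30:00" splits into [1, 30, 0]; reversed and accumulated
+    # with base 60 it yields 0*1 + 30*60 + 1*3600 == 5400. The loop
+    # above squares 1e300 until it overflows to float infinity, and
+    # -inf/inf then produces a quiet NaN.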
+ + def construct_yaml_float(self, node): + value = self.construct_scalar(node) + value = value.replace('_', '').lower() + sign = +1 + if value[0] == '-': + sign = -1 + if value[0] in '+-': + value = value[1:] + if value == '.inf': + return sign*self.inf_value + elif value == '.nan': + return self.nan_value + elif ':' in value: + digits = [float(part) for part in value.split(':')] + digits.reverse() + base = 1 + value = 0.0 + for digit in digits: + value += digit*base + base *= 60 + return sign*value + else: + return sign*float(value) + + def construct_yaml_binary(self, node): + try: + value = self.construct_scalar(node).encode('ascii') + except UnicodeEncodeError as exc: + raise ConstructorError(None, None, + "failed to convert base64 data into ascii: %s" % exc, + node.start_mark) + try: + if hasattr(base64, 'decodebytes'): + return base64.decodebytes(value) + else: + return base64.decodestring(value) + except binascii.Error as exc: + raise ConstructorError(None, None, + "failed to decode base64 data: %s" % exc, node.start_mark) + + timestamp_regexp = re.compile( + r'''^(?P<year>[0-9][0-9][0-9][0-9]) + -(?P<month>[0-9][0-9]?) + -(?P<day>[0-9][0-9]?) + (?:(?:[Tt]|[ \t]+) + (?P<hour>[0-9][0-9]?) + :(?P<minute>[0-9][0-9]) + :(?P<second>[0-9][0-9]) + (?:\.(?P<fraction>[0-9]*))? + (?:[ \t]*(?P<tz>Z|(?P<tz_sign>[-+])(?P<tz_hour>[0-9][0-9]?) + (?::(?P<tz_minute>[0-9][0-9]))?))?)?$''', re.X) + + def construct_yaml_timestamp(self, node): + value = self.construct_scalar(node) + match = self.timestamp_regexp.match(node.value) + values = match.groupdict() + year = int(values['year']) + month = int(values['month']) + day = int(values['day']) + if not values['hour']: + return datetime.date(year, month, day) + hour = int(values['hour']) + minute = int(values['minute']) + second = int(values['second']) + fraction = 0 + if values['fraction']: + fraction = values['fraction'][:6] + while len(fraction) < 6: + fraction += '0' + fraction = int(fraction) + delta = None + if values['tz_sign']: + tz_hour = int(values['tz_hour']) + tz_minute = int(values['tz_minute'] or 0) + delta = datetime.timedelta(hours=tz_hour, minutes=tz_minute) + if values['tz_sign'] == '-': + delta = -delta + data = datetime.datetime(year, month, day, hour, minute, second, fraction) + if delta: + data -= delta + return data + + def construct_yaml_omap(self, node): + # Note: we do not check for duplicate keys, because it's too + # CPU-expensive. + omap = [] + yield omap + if not isinstance(node, SequenceNode): + raise ConstructorError("while constructing an ordered map", node.start_mark, + "expected a sequence, but found %s" % node.id, node.start_mark) + for subnode in node.value: + if not isinstance(subnode, MappingNode): + raise ConstructorError("while constructing an ordered map", node.start_mark, + "expected a mapping of length 1, but found %s" % subnode.id, + subnode.start_mark) + if len(subnode.value) != 1: + raise ConstructorError("while constructing an ordered map", node.start_mark, + "expected a single mapping item, but found %d items" % len(subnode.value), + subnode.start_mark) + key_node, value_node = subnode.value[0] + key = self.construct_object(key_node) + value = self.construct_object(value_node) + omap.append((key, value)) + + def construct_yaml_pairs(self, node): + # Note: the same code as `construct_yaml_omap`. 
+ pairs = [] + yield pairs + if not isinstance(node, SequenceNode): + raise ConstructorError("while constructing pairs", node.start_mark, + "expected a sequence, but found %s" % node.id, node.start_mark) + for subnode in node.value: + if not isinstance(subnode, MappingNode): + raise ConstructorError("while constructing pairs", node.start_mark, + "expected a mapping of length 1, but found %s" % subnode.id, + subnode.start_mark) + if len(subnode.value) != 1: + raise ConstructorError("while constructing pairs", node.start_mark, + "expected a single mapping item, but found %d items" % len(subnode.value), + subnode.start_mark) + key_node, value_node = subnode.value[0] + key = self.construct_object(key_node) + value = self.construct_object(value_node) + pairs.append((key, value)) + + def construct_yaml_set(self, node): + data = set() + yield data + value = self.construct_mapping(node) + data.update(value) + + def construct_yaml_str(self, node): + return self.construct_scalar(node) + + def construct_yaml_seq(self, node): + data = [] + yield data + data.extend(self.construct_sequence(node)) + + def construct_yaml_map(self, node): + data = {} + yield data + value = self.construct_mapping(node) + data.update(value) + + def construct_yaml_object(self, node, cls): + data = cls.__new__(cls) + yield data + if hasattr(data, '__setstate__'): + state = self.construct_mapping(node, deep=True) + data.__setstate__(state) + else: + state = self.construct_mapping(node) + data.__dict__.update(state) + + def construct_undefined(self, node): + raise ConstructorError(None, None, + "could not determine a constructor for the tag %r" % node.tag, + node.start_mark) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:null', + SafeConstructor.construct_yaml_null) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:bool', + SafeConstructor.construct_yaml_bool) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:int', + SafeConstructor.construct_yaml_int) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:float', + SafeConstructor.construct_yaml_float) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:binary', + SafeConstructor.construct_yaml_binary) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:timestamp', + SafeConstructor.construct_yaml_timestamp) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:omap', + SafeConstructor.construct_yaml_omap) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:pairs', + SafeConstructor.construct_yaml_pairs) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:set', + SafeConstructor.construct_yaml_set) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:str', + SafeConstructor.construct_yaml_str) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:seq', + SafeConstructor.construct_yaml_seq) + +SafeConstructor.add_constructor( + 'tag:yaml.org,2002:map', + SafeConstructor.construct_yaml_map) + +SafeConstructor.add_constructor(None, + SafeConstructor.construct_undefined) + +class FullConstructor(SafeConstructor): + + def construct_python_str(self, node): + return self.construct_scalar(node) + + def construct_python_unicode(self, node): + return self.construct_scalar(node) + + def construct_python_bytes(self, node): + try: + value = self.construct_scalar(node).encode('ascii') + except UnicodeEncodeError as exc: + raise ConstructorError(None, None, + "failed to convert base64 data into ascii: %s" % exc, + node.start_mark) + try: + if hasattr(base64, 'decodebytes'): + return base64.decodebytes(value) + else: + return 
base64.decodestring(value)
+ except binascii.Error as exc:
+ raise ConstructorError(None, None,
+ "failed to decode base64 data: %s" % exc, node.start_mark)
+
+ def construct_python_long(self, node):
+ return self.construct_yaml_int(node)
+
+ def construct_python_complex(self, node):
+ return complex(self.construct_scalar(node))
+
+ def construct_python_tuple(self, node):
+ return tuple(self.construct_sequence(node))
+
+ def find_python_module(self, name, mark, unsafe=False):
+ if not name:
+ raise ConstructorError("while constructing a Python module", mark,
+ "expected non-empty name appended to the tag", mark)
+ if unsafe:
+ try:
+ __import__(name)
+ except ImportError as exc:
+ raise ConstructorError("while constructing a Python module", mark,
+ "cannot find module %r (%s)" % (name, exc), mark)
+ if not name in sys.modules:
+ raise ConstructorError("while constructing a Python module", mark,
+ "module %r is not imported" % name, mark)
+ return sys.modules[name]
+
+ def find_python_name(self, name, mark, unsafe=False):
+ if not name:
+ raise ConstructorError("while constructing a Python object", mark,
+ "expected non-empty name appended to the tag", mark)
+ if '.' in name:
+ module_name, object_name = name.rsplit('.', 1)
+ else:
+ module_name = 'builtins'
+ object_name = name
+ if unsafe:
+ try:
+ __import__(module_name)
+ except ImportError as exc:
+ raise ConstructorError("while constructing a Python object", mark,
+ "cannot find module %r (%s)" % (module_name, exc), mark)
+ if not module_name in sys.modules:
+ raise ConstructorError("while constructing a Python object", mark,
+ "module %r is not imported" % module_name, mark)
+ module = sys.modules[module_name]
+ if not hasattr(module, object_name):
+ raise ConstructorError("while constructing a Python object", mark,
+ "cannot find %r in the module %r"
+ % (object_name, module.__name__), mark)
+ return getattr(module, object_name)
+
+ def construct_python_name(self, suffix, node):
+ value = self.construct_scalar(node)
+ if value:
+ raise ConstructorError("while constructing a Python name", node.start_mark,
+ "expected the empty value, but found %r" % value, node.start_mark)
+ return self.find_python_name(suffix, node.start_mark)
+
+ def construct_python_module(self, suffix, node):
+ value = self.construct_scalar(node)
+ if value:
+ raise ConstructorError("while constructing a Python module", node.start_mark,
+ "expected the empty value, but found %r" % value, node.start_mark)
+ return self.find_python_module(suffix, node.start_mark)
+
+ def make_python_instance(self, suffix, node,
+ args=None, kwds=None, newobj=False, unsafe=False):
+ if not args:
+ args = []
+ if not kwds:
+ kwds = {}
+ cls = self.find_python_name(suffix, node.start_mark)
+ if not (unsafe or isinstance(cls, type)):
+ raise ConstructorError("while constructing a Python instance", node.start_mark,
+ "expected a class, but found %r" % type(cls),
+ node.start_mark)
+ if newobj and isinstance(cls, type):
+ return cls.__new__(cls, *args, **kwds)
+ else:
+ return cls(*args, **kwds)
+
+ def set_python_instance_state(self, instance, state):
+ if hasattr(instance, '__setstate__'):
+ instance.__setstate__(state)
+ else:
+ slotstate = {}
+ if isinstance(state, tuple) and len(state) == 2:
+ state, slotstate = state
+ if hasattr(instance, '__dict__'):
+ instance.__dict__.update(state)
+ elif state:
+ slotstate.update(state)
+ for key, value in slotstate.items():
+ setattr(instance, key, value)
+
+ def construct_python_object(self, suffix, node):
+ # Format:
+ # 
!!python/object:module.name { ... state ... } + instance = self.make_python_instance(suffix, node, newobj=True) + yield instance + deep = hasattr(instance, '__setstate__') + state = self.construct_mapping(node, deep=deep) + self.set_python_instance_state(instance, state) + + def construct_python_object_apply(self, suffix, node, newobj=False): + # Format: + # !!python/object/apply # (or !!python/object/new) + # args: [ ... arguments ... ] + # kwds: { ... keywords ... } + # state: ... state ... + # listitems: [ ... listitems ... ] + # dictitems: { ... dictitems ... } + # or short format: + # !!python/object/apply [ ... arguments ... ] + # The difference between !!python/object/apply and !!python/object/new + # is how an object is created, check make_python_instance for details. + if isinstance(node, SequenceNode): + args = self.construct_sequence(node, deep=True) + kwds = {} + state = {} + listitems = [] + dictitems = {} + else: + value = self.construct_mapping(node, deep=True) + args = value.get('args', []) + kwds = value.get('kwds', {}) + state = value.get('state', {}) + listitems = value.get('listitems', []) + dictitems = value.get('dictitems', {}) + instance = self.make_python_instance(suffix, node, args, kwds, newobj) + if state: + self.set_python_instance_state(instance, state) + if listitems: + instance.extend(listitems) + if dictitems: + for key in dictitems: + instance[key] = dictitems[key] + return instance + + def construct_python_object_new(self, suffix, node): + return self.construct_python_object_apply(suffix, node, newobj=True) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/none', + FullConstructor.construct_yaml_null) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/bool', + FullConstructor.construct_yaml_bool) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/str', + FullConstructor.construct_python_str) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/unicode', + FullConstructor.construct_python_unicode) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/bytes', + FullConstructor.construct_python_bytes) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/int', + FullConstructor.construct_yaml_int) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/long', + FullConstructor.construct_python_long) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/float', + FullConstructor.construct_yaml_float) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/complex', + FullConstructor.construct_python_complex) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/list', + FullConstructor.construct_yaml_seq) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/tuple', + FullConstructor.construct_python_tuple) + +FullConstructor.add_constructor( + 'tag:yaml.org,2002:python/dict', + FullConstructor.construct_yaml_map) + +FullConstructor.add_multi_constructor( + 'tag:yaml.org,2002:python/name:', + FullConstructor.construct_python_name) + +FullConstructor.add_multi_constructor( + 'tag:yaml.org,2002:python/module:', + FullConstructor.construct_python_module) + +FullConstructor.add_multi_constructor( + 'tag:yaml.org,2002:python/object:', + FullConstructor.construct_python_object) + +FullConstructor.add_multi_constructor( + 'tag:yaml.org,2002:python/object/apply:', + FullConstructor.construct_python_object_apply) + +FullConstructor.add_multi_constructor( + 'tag:yaml.org,2002:python/object/new:', + FullConstructor.construct_python_object_new) + +class 
UnsafeConstructor(FullConstructor): + + def find_python_module(self, name, mark): + return super(UnsafeConstructor, self).find_python_module(name, mark, unsafe=True) + + def find_python_name(self, name, mark): + return super(UnsafeConstructor, self).find_python_name(name, mark, unsafe=True) + + def make_python_instance(self, suffix, node, args=None, kwds=None, newobj=False): + return super(UnsafeConstructor, self).make_python_instance( + suffix, node, args, kwds, newobj, unsafe=True) + +# Constructor is same as UnsafeConstructor. Need to leave this in place in case +# people have extended it directly. +class Constructor(UnsafeConstructor): + pass diff --git a/libs3/yaml/cyaml.py b/libs3/yaml/cyaml.py new file mode 100644 index 000000000..1e606c74b --- /dev/null +++ b/libs3/yaml/cyaml.py @@ -0,0 +1,101 @@ + +__all__ = [ + 'CBaseLoader', 'CSafeLoader', 'CFullLoader', 'CUnsafeLoader', 'CLoader', + 'CBaseDumper', 'CSafeDumper', 'CDumper' +] + +from _yaml import CParser, CEmitter + +from .constructor import * + +from .serializer import * +from .representer import * + +from .resolver import * + +class CBaseLoader(CParser, BaseConstructor, BaseResolver): + + def __init__(self, stream): + CParser.__init__(self, stream) + BaseConstructor.__init__(self) + BaseResolver.__init__(self) + +class CSafeLoader(CParser, SafeConstructor, Resolver): + + def __init__(self, stream): + CParser.__init__(self, stream) + SafeConstructor.__init__(self) + Resolver.__init__(self) + +class CFullLoader(CParser, FullConstructor, Resolver): + + def __init__(self, stream): + CParser.__init__(self, stream) + FullConstructor.__init__(self) + Resolver.__init__(self) + +class CUnsafeLoader(CParser, UnsafeConstructor, Resolver): + + def __init__(self, stream): + CParser.__init__(self, stream) + UnsafeConstructor.__init__(self) + Resolver.__init__(self) + +class CLoader(CParser, Constructor, Resolver): + + def __init__(self, stream): + CParser.__init__(self, stream) + Constructor.__init__(self) + Resolver.__init__(self) + +class CBaseDumper(CEmitter, BaseRepresenter, BaseResolver): + + def __init__(self, stream, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + CEmitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, encoding=encoding, + allow_unicode=allow_unicode, line_break=line_break, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + Representer.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + +class CSafeDumper(CEmitter, SafeRepresenter, Resolver): + + def __init__(self, stream, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + CEmitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, encoding=encoding, + allow_unicode=allow_unicode, line_break=line_break, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + SafeRepresenter.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + +class CDumper(CEmitter, Serializer, Representer, Resolver): + + def __init__(self, stream, + 
default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + CEmitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, encoding=encoding, + allow_unicode=allow_unicode, line_break=line_break, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + Representer.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + diff --git a/libs3/yaml/dumper.py b/libs3/yaml/dumper.py new file mode 100644 index 000000000..6aadba551 --- /dev/null +++ b/libs3/yaml/dumper.py @@ -0,0 +1,62 @@ + +__all__ = ['BaseDumper', 'SafeDumper', 'Dumper'] + +from .emitter import * +from .serializer import * +from .representer import * +from .resolver import * + +class BaseDumper(Emitter, Serializer, BaseRepresenter, BaseResolver): + + def __init__(self, stream, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + Emitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break) + Serializer.__init__(self, encoding=encoding, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + Representer.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + +class SafeDumper(Emitter, Serializer, SafeRepresenter, Resolver): + + def __init__(self, stream, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + Emitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break) + Serializer.__init__(self, encoding=encoding, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + SafeRepresenter.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + +class Dumper(Emitter, Serializer, Representer, Resolver): + + def __init__(self, stream, + default_style=None, default_flow_style=False, + canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None, + encoding=None, explicit_start=None, explicit_end=None, + version=None, tags=None, sort_keys=True): + Emitter.__init__(self, stream, canonical=canonical, + indent=indent, width=width, + allow_unicode=allow_unicode, line_break=line_break) + Serializer.__init__(self, encoding=encoding, + explicit_start=explicit_start, explicit_end=explicit_end, + version=version, tags=tags) + Representer.__init__(self, default_style=default_style, + default_flow_style=default_flow_style, sort_keys=sort_keys) + Resolver.__init__(self) + diff --git a/libs3/yaml/emitter.py b/libs3/yaml/emitter.py new file mode 100644 index 000000000..a664d0111 --- /dev/null +++ b/libs3/yaml/emitter.py @@ -0,0 +1,1137 @@ + +# Emitter expects events obeying the following grammar: +# stream ::= STREAM-START document* STREAM-END +# document ::= DOCUMENT-START node DOCUMENT-END +# node ::= SCALAR | 
sequence | mapping +# sequence ::= SEQUENCE-START node* SEQUENCE-END +# mapping ::= MAPPING-START (node node)* MAPPING-END + +__all__ = ['Emitter', 'EmitterError'] + +from .error import YAMLError +from .events import * + +class EmitterError(YAMLError): + pass + +class ScalarAnalysis: + def __init__(self, scalar, empty, multiline, + allow_flow_plain, allow_block_plain, + allow_single_quoted, allow_double_quoted, + allow_block): + self.scalar = scalar + self.empty = empty + self.multiline = multiline + self.allow_flow_plain = allow_flow_plain + self.allow_block_plain = allow_block_plain + self.allow_single_quoted = allow_single_quoted + self.allow_double_quoted = allow_double_quoted + self.allow_block = allow_block + +class Emitter: + + DEFAULT_TAG_PREFIXES = { + '!' : '!', + 'tag:yaml.org,2002:' : '!!', + } + + def __init__(self, stream, canonical=None, indent=None, width=None, + allow_unicode=None, line_break=None): + + # The stream should have the methods `write` and possibly `flush`. + self.stream = stream + + # Encoding can be overridden by STREAM-START. + self.encoding = None + + # Emitter is a state machine with a stack of states to handle nested + # structures. + self.states = [] + self.state = self.expect_stream_start + + # Current event and the event queue. + self.events = [] + self.event = None + + # The current indentation level and the stack of previous indents. + self.indents = [] + self.indent = None + + # Flow level. + self.flow_level = 0 + + # Contexts. + self.root_context = False + self.sequence_context = False + self.mapping_context = False + self.simple_key_context = False + + # Characteristics of the last emitted character: + # - current position. + # - is it a whitespace? + # - is it an indention character + # (indentation space, '-', '?', or ':')? + self.line = 0 + self.column = 0 + self.whitespace = True + self.indention = True + + # Whether the document requires an explicit document indicator + self.open_ended = False + + # Formatting details. + self.canonical = canonical + self.allow_unicode = allow_unicode + self.best_indent = 2 + if indent and 1 < indent < 10: + self.best_indent = indent + self.best_width = 80 + if width and width > self.best_indent*2: + self.best_width = width + self.best_line_break = '\n' + if line_break in ['\r', '\n', '\r\n']: + self.best_line_break = line_break + + # Tag prefixes. + self.tag_prefixes = None + + # Prepared anchor and tag. + self.prepared_anchor = None + self.prepared_tag = None + + # Scalar analysis and style. + self.analysis = None + self.style = None + + def dispose(self): + # Reset the state attributes (to clear self-references) + self.states = [] + self.state = None + + def emit(self, event): + self.events.append(event) + while not self.need_more_events(): + self.event = self.events.pop(0) + self.state() + self.event = None + + # In some cases, we wait for a few next events before emitting. 
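
The emit() loop above drains the queue only when need_more_events (defined just below) says there is enough lookahead. Driving it end to end is easiest through the package-level emit() helper; a hedged sketch, assuming the vendored package exposes the stock PyYAML entry points and event classes:

    import yaml  # assuming stock PyYAML package-level helpers are exposed

    events = [
        yaml.StreamStartEvent(),
        yaml.DocumentStartEvent(explicit=True),
        yaml.MappingStartEvent(anchor=None, tag=None, implicit=True),
        yaml.ScalarEvent(None, None, (True, True), 'answer'),
        yaml.ScalarEvent(None, None, (True, True), '42'),
        yaml.MappingEndEvent(),
        yaml.DocumentEndEvent(),
        yaml.StreamEndEvent(),
    ]
    print(yaml.emit(events))
    # ---
    # answer: 42
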
+ + def need_more_events(self): + if not self.events: + return True + event = self.events[0] + if isinstance(event, DocumentStartEvent): + return self.need_events(1) + elif isinstance(event, SequenceStartEvent): + return self.need_events(2) + elif isinstance(event, MappingStartEvent): + return self.need_events(3) + else: + return False + + def need_events(self, count): + level = 0 + for event in self.events[1:]: + if isinstance(event, (DocumentStartEvent, CollectionStartEvent)): + level += 1 + elif isinstance(event, (DocumentEndEvent, CollectionEndEvent)): + level -= 1 + elif isinstance(event, StreamEndEvent): + level = -1 + if level < 0: + return False + return (len(self.events) < count+1) + + def increase_indent(self, flow=False, indentless=False): + self.indents.append(self.indent) + if self.indent is None: + if flow: + self.indent = self.best_indent + else: + self.indent = 0 + elif not indentless: + self.indent += self.best_indent + + # States. + + # Stream handlers. + + def expect_stream_start(self): + if isinstance(self.event, StreamStartEvent): + if self.event.encoding and not hasattr(self.stream, 'encoding'): + self.encoding = self.event.encoding + self.write_stream_start() + self.state = self.expect_first_document_start + else: + raise EmitterError("expected StreamStartEvent, but got %s" + % self.event) + + def expect_nothing(self): + raise EmitterError("expected nothing, but got %s" % self.event) + + # Document handlers. + + def expect_first_document_start(self): + return self.expect_document_start(first=True) + + def expect_document_start(self, first=False): + if isinstance(self.event, DocumentStartEvent): + if (self.event.version or self.event.tags) and self.open_ended: + self.write_indicator('...', True) + self.write_indent() + if self.event.version: + version_text = self.prepare_version(self.event.version) + self.write_version_directive(version_text) + self.tag_prefixes = self.DEFAULT_TAG_PREFIXES.copy() + if self.event.tags: + handles = sorted(self.event.tags.keys()) + for handle in handles: + prefix = self.event.tags[handle] + self.tag_prefixes[prefix] = handle + handle_text = self.prepare_tag_handle(handle) + prefix_text = self.prepare_tag_prefix(prefix) + self.write_tag_directive(handle_text, prefix_text) + implicit = (first and not self.event.explicit and not self.canonical + and not self.event.version and not self.event.tags + and not self.check_empty_document()) + if not implicit: + self.write_indent() + self.write_indicator('---', True) + if self.canonical: + self.write_indent() + self.state = self.expect_document_root + elif isinstance(self.event, StreamEndEvent): + if self.open_ended: + self.write_indicator('...', True) + self.write_indent() + self.write_stream_end() + self.state = self.expect_nothing + else: + raise EmitterError("expected DocumentStartEvent, but got %s" + % self.event) + + def expect_document_end(self): + if isinstance(self.event, DocumentEndEvent): + self.write_indent() + if self.event.explicit: + self.write_indicator('...', True) + self.write_indent() + self.flush_stream() + self.state = self.expect_document_start + else: + raise EmitterError("expected DocumentEndEvent, but got %s" + % self.event) + + def expect_document_root(self): + self.states.append(self.expect_document_end) + self.expect_node(root=True) + + # Node handlers. 
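
Those document handlers decide when the '---' and '...' indicators are needed: a single implicit document gets neither, while multi-document streams or explicit_start force the markers. Before the node handlers that follow, a small sketch (assuming the usual dump_all helper is exposed):

    import yaml

    # Two documents in one stream; explicit_start makes
    # expect_document_start() emit a '---' before each document.
    print(yaml.dump_all([{'a': 1}, {'b': 2}], explicit_start=True))
    # ---
    # a: 1
    # ---
    # b: 2
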
+ + def expect_node(self, root=False, sequence=False, mapping=False, + simple_key=False): + self.root_context = root + self.sequence_context = sequence + self.mapping_context = mapping + self.simple_key_context = simple_key + if isinstance(self.event, AliasEvent): + self.expect_alias() + elif isinstance(self.event, (ScalarEvent, CollectionStartEvent)): + self.process_anchor('&') + self.process_tag() + if isinstance(self.event, ScalarEvent): + self.expect_scalar() + elif isinstance(self.event, SequenceStartEvent): + if self.flow_level or self.canonical or self.event.flow_style \ + or self.check_empty_sequence(): + self.expect_flow_sequence() + else: + self.expect_block_sequence() + elif isinstance(self.event, MappingStartEvent): + if self.flow_level or self.canonical or self.event.flow_style \ + or self.check_empty_mapping(): + self.expect_flow_mapping() + else: + self.expect_block_mapping() + else: + raise EmitterError("expected NodeEvent, but got %s" % self.event) + + def expect_alias(self): + if self.event.anchor is None: + raise EmitterError("anchor is not specified for alias") + self.process_anchor('*') + self.state = self.states.pop() + + def expect_scalar(self): + self.increase_indent(flow=True) + self.process_scalar() + self.indent = self.indents.pop() + self.state = self.states.pop() + + # Flow sequence handlers. + + def expect_flow_sequence(self): + self.write_indicator('[', True, whitespace=True) + self.flow_level += 1 + self.increase_indent(flow=True) + self.state = self.expect_first_flow_sequence_item + + def expect_first_flow_sequence_item(self): + if isinstance(self.event, SequenceEndEvent): + self.indent = self.indents.pop() + self.flow_level -= 1 + self.write_indicator(']', False) + self.state = self.states.pop() + else: + if self.canonical or self.column > self.best_width: + self.write_indent() + self.states.append(self.expect_flow_sequence_item) + self.expect_node(sequence=True) + + def expect_flow_sequence_item(self): + if isinstance(self.event, SequenceEndEvent): + self.indent = self.indents.pop() + self.flow_level -= 1 + if self.canonical: + self.write_indicator(',', False) + self.write_indent() + self.write_indicator(']', False) + self.state = self.states.pop() + else: + self.write_indicator(',', False) + if self.canonical or self.column > self.best_width: + self.write_indent() + self.states.append(self.expect_flow_sequence_item) + self.expect_node(sequence=True) + + # Flow mapping handlers. 
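
expect_node above takes the flow paths whenever the emitter is already at flow_level, canonical mode is on, or the event itself carries flow_style — which is what default_flow_style toggles at the API surface. A sketch (assuming the stock dump helper), ahead of the flow mapping handlers below:

    import yaml

    data = {'point': {'x': 1, 'y': 2}, 'tags': ['a', 'b']}
    # The representer stamps flow_style on every collection event, so
    # expect_node() takes the flow sequence/mapping branches:
    print(yaml.dump(data, default_flow_style=True))
    # {point: {x: 1, y: 2}, tags: [a, b]}
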
+ + def expect_flow_mapping(self): + self.write_indicator('{', True, whitespace=True) + self.flow_level += 1 + self.increase_indent(flow=True) + self.state = self.expect_first_flow_mapping_key + + def expect_first_flow_mapping_key(self): + if isinstance(self.event, MappingEndEvent): + self.indent = self.indents.pop() + self.flow_level -= 1 + self.write_indicator('}', False) + self.state = self.states.pop() + else: + if self.canonical or self.column > self.best_width: + self.write_indent() + if not self.canonical and self.check_simple_key(): + self.states.append(self.expect_flow_mapping_simple_value) + self.expect_node(mapping=True, simple_key=True) + else: + self.write_indicator('?', True) + self.states.append(self.expect_flow_mapping_value) + self.expect_node(mapping=True) + + def expect_flow_mapping_key(self): + if isinstance(self.event, MappingEndEvent): + self.indent = self.indents.pop() + self.flow_level -= 1 + if self.canonical: + self.write_indicator(',', False) + self.write_indent() + self.write_indicator('}', False) + self.state = self.states.pop() + else: + self.write_indicator(',', False) + if self.canonical or self.column > self.best_width: + self.write_indent() + if not self.canonical and self.check_simple_key(): + self.states.append(self.expect_flow_mapping_simple_value) + self.expect_node(mapping=True, simple_key=True) + else: + self.write_indicator('?', True) + self.states.append(self.expect_flow_mapping_value) + self.expect_node(mapping=True) + + def expect_flow_mapping_simple_value(self): + self.write_indicator(':', False) + self.states.append(self.expect_flow_mapping_key) + self.expect_node(mapping=True) + + def expect_flow_mapping_value(self): + if self.canonical or self.column > self.best_width: + self.write_indent() + self.write_indicator(':', True) + self.states.append(self.expect_flow_mapping_key) + self.expect_node(mapping=True) + + # Block sequence handlers. + + def expect_block_sequence(self): + indentless = (self.mapping_context and not self.indention) + self.increase_indent(flow=False, indentless=indentless) + self.state = self.expect_first_block_sequence_item + + def expect_first_block_sequence_item(self): + return self.expect_block_sequence_item(first=True) + + def expect_block_sequence_item(self, first=False): + if not first and isinstance(self.event, SequenceEndEvent): + self.indent = self.indents.pop() + self.state = self.states.pop() + else: + self.write_indent() + self.write_indicator('-', True, indention=True) + self.states.append(self.expect_block_sequence_item) + self.expect_node(sequence=True) + + # Block mapping handlers. 
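
The block handlers indent through increase_indent(), which adds best_indent per nesting level; the __init__ above accepts an indent override only when 1 < indent < 10. Ahead of the block mapping handlers, a quick sketch of how that surfaces (assuming the stock dump helper):

    import yaml

    data = {'servers': [{'host': 'alpha'}, {'host': 'beta'}]}
    # indent=4 becomes Emitter.best_indent; each nested block level
    # shifts right by four columns instead of the default two.
    print(yaml.dump(data, default_flow_style=False, indent=4))
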
+ + def expect_block_mapping(self): + self.increase_indent(flow=False) + self.state = self.expect_first_block_mapping_key + + def expect_first_block_mapping_key(self): + return self.expect_block_mapping_key(first=True) + + def expect_block_mapping_key(self, first=False): + if not first and isinstance(self.event, MappingEndEvent): + self.indent = self.indents.pop() + self.state = self.states.pop() + else: + self.write_indent() + if self.check_simple_key(): + self.states.append(self.expect_block_mapping_simple_value) + self.expect_node(mapping=True, simple_key=True) + else: + self.write_indicator('?', True, indention=True) + self.states.append(self.expect_block_mapping_value) + self.expect_node(mapping=True) + + def expect_block_mapping_simple_value(self): + self.write_indicator(':', False) + self.states.append(self.expect_block_mapping_key) + self.expect_node(mapping=True) + + def expect_block_mapping_value(self): + self.write_indent() + self.write_indicator(':', True, indention=True) + self.states.append(self.expect_block_mapping_key) + self.expect_node(mapping=True) + + # Checkers. + + def check_empty_sequence(self): + return (isinstance(self.event, SequenceStartEvent) and self.events + and isinstance(self.events[0], SequenceEndEvent)) + + def check_empty_mapping(self): + return (isinstance(self.event, MappingStartEvent) and self.events + and isinstance(self.events[0], MappingEndEvent)) + + def check_empty_document(self): + if not isinstance(self.event, DocumentStartEvent) or not self.events: + return False + event = self.events[0] + return (isinstance(event, ScalarEvent) and event.anchor is None + and event.tag is None and event.implicit and event.value == '') + + def check_simple_key(self): + length = 0 + if isinstance(self.event, NodeEvent) and self.event.anchor is not None: + if self.prepared_anchor is None: + self.prepared_anchor = self.prepare_anchor(self.event.anchor) + length += len(self.prepared_anchor) + if isinstance(self.event, (ScalarEvent, CollectionStartEvent)) \ + and self.event.tag is not None: + if self.prepared_tag is None: + self.prepared_tag = self.prepare_tag(self.event.tag) + length += len(self.prepared_tag) + if isinstance(self.event, ScalarEvent): + if self.analysis is None: + self.analysis = self.analyze_scalar(self.event.value) + length += len(self.analysis.scalar) + return (length < 128 and (isinstance(self.event, AliasEvent) + or (isinstance(self.event, ScalarEvent) + and not self.analysis.empty and not self.analysis.multiline) + or self.check_empty_sequence() or self.check_empty_mapping())) + + # Anchor, Tag, and Scalar processors. + + def process_anchor(self, indicator): + if self.event.anchor is None: + self.prepared_anchor = None + return + if self.prepared_anchor is None: + self.prepared_anchor = self.prepare_anchor(self.event.anchor) + if self.prepared_anchor: + self.write_indicator(indicator+self.prepared_anchor, True) + self.prepared_anchor = None + + def process_tag(self): + tag = self.event.tag + if isinstance(self.event, ScalarEvent): + if self.style is None: + self.style = self.choose_scalar_style() + if ((not self.canonical or tag is None) and + ((self.style == '' and self.event.implicit[0]) + or (self.style != '' and self.event.implicit[1]))): + self.prepared_tag = None + return + if self.event.implicit[0] and tag is None: + tag = '!' 
+ self.prepared_tag = None
+ else:
+ if (not self.canonical or tag is None) and self.event.implicit:
+ self.prepared_tag = None
+ return
+ if tag is None:
+ raise EmitterError("tag is not specified")
+ if self.prepared_tag is None:
+ self.prepared_tag = self.prepare_tag(tag)
+ if self.prepared_tag:
+ self.write_indicator(self.prepared_tag, True)
+ self.prepared_tag = None
+
+ def choose_scalar_style(self):
+ if self.analysis is None:
+ self.analysis = self.analyze_scalar(self.event.value)
+ if self.event.style == '"' or self.canonical:
+ return '"'
+ if not self.event.style and self.event.implicit[0]:
+ if (not (self.simple_key_context and
+ (self.analysis.empty or self.analysis.multiline))
+ and (self.flow_level and self.analysis.allow_flow_plain
+ or (not self.flow_level and self.analysis.allow_block_plain))):
+ return ''
+ if self.event.style and self.event.style in '|>':
+ if (not self.flow_level and not self.simple_key_context
+ and self.analysis.allow_block):
+ return self.event.style
+ if not self.event.style or self.event.style == '\'':
+ if (self.analysis.allow_single_quoted and
+ not (self.simple_key_context and self.analysis.multiline)):
+ return '\''
+ return '"'
+
+ def process_scalar(self):
+ if self.analysis is None:
+ self.analysis = self.analyze_scalar(self.event.value)
+ if self.style is None:
+ self.style = self.choose_scalar_style()
+ split = (not self.simple_key_context)
+ #if self.analysis.multiline and split \
+ # and (not self.style or self.style in '\'\"'):
+ # self.write_indent()
+ if self.style == '"':
+ self.write_double_quoted(self.analysis.scalar, split)
+ elif self.style == '\'':
+ self.write_single_quoted(self.analysis.scalar, split)
+ elif self.style == '>':
+ self.write_folded(self.analysis.scalar)
+ elif self.style == '|':
+ self.write_literal(self.analysis.scalar)
+ else:
+ self.write_plain(self.analysis.scalar, split)
+ self.analysis = None
+ self.style = None
+
+ # Analyzers.
+
+ def prepare_version(self, version):
+ major, minor = version
+ if major != 1:
+ raise EmitterError("unsupported YAML version: %d.%d" % (major, minor))
+ return '%d.%d' % (major, minor)
+
+ def prepare_tag_handle(self, handle):
+ if not handle:
+ raise EmitterError("tag handle must not be empty")
+ if handle[0] != '!' or handle[-1] != '!':
+ raise EmitterError("tag handle must start and end with '!': %r" % handle)
+ for ch in handle[1:-1]:
+ if not ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
+ or ch in '-_'):
+ raise EmitterError("invalid character %r in the tag handle: %r"
+ % (ch, handle))
+ return handle
+
+ def prepare_tag_prefix(self, prefix):
+ if not prefix:
+ raise EmitterError("tag prefix must not be empty")
+ chunks = []
+ start = end = 0
+ if prefix[0] == '!':
+ end = 1
+ while end < len(prefix):
+ ch = prefix[end]
+ if '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
+ or ch in '-;/?!:@&=+$,_.~*\'()[]':
+ end += 1
+ else:
+ if start < end:
+ chunks.append(prefix[start:end])
+ start = end = end+1
+ data = ch.encode('utf-8')
+ for ch in data:
+ chunks.append('%%%02X' % ch)
+ if start < end:
+ chunks.append(prefix[start:end])
+ return ''.join(chunks)
+
+ def prepare_tag(self, tag):
+ if not tag:
+ raise EmitterError("tag must not be empty")
+ if tag == '!':
+ return tag
+ handle = None
+ suffix = tag
+ prefixes = sorted(self.tag_prefixes.keys())
+ for prefix in prefixes:
+ if tag.startswith(prefix) \
+ and (prefix == '!' 
or len(prefix) < len(tag)): + handle = self.tag_prefixes[prefix] + suffix = tag[len(prefix):] + chunks = [] + start = end = 0 + while end < len(suffix): + ch = suffix[end] + if '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ + or ch in '-;/?:@&=+$,_.~*\'()[]' \ + or (ch == '!' and handle != '!'): + end += 1 + else: + if start < end: + chunks.append(suffix[start:end]) + start = end = end+1 + data = ch.encode('utf-8') + for ch in data: + chunks.append('%%%02X' % ch) + if start < end: + chunks.append(suffix[start:end]) + suffix_text = ''.join(chunks) + if handle: + return '%s%s' % (handle, suffix_text) + else: + return '!<%s>' % suffix_text + + def prepare_anchor(self, anchor): + if not anchor: + raise EmitterError("anchor must not be empty") + for ch in anchor: + if not ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ + or ch in '-_'): + raise EmitterError("invalid character %r in the anchor: %r" + % (ch, anchor)) + return anchor + + def analyze_scalar(self, scalar): + + # Empty scalar is a special case. + if not scalar: + return ScalarAnalysis(scalar=scalar, empty=True, multiline=False, + allow_flow_plain=False, allow_block_plain=True, + allow_single_quoted=True, allow_double_quoted=True, + allow_block=False) + + # Indicators and special characters. + block_indicators = False + flow_indicators = False + line_breaks = False + special_characters = False + + # Important whitespace combinations. + leading_space = False + leading_break = False + trailing_space = False + trailing_break = False + break_space = False + space_break = False + + # Check document indicators. + if scalar.startswith('---') or scalar.startswith('...'): + block_indicators = True + flow_indicators = True + + # First character or preceded by a whitespace. + preceded_by_whitespace = True + + # Last character or followed by a whitespace. + followed_by_whitespace = (len(scalar) == 1 or + scalar[1] in '\0 \t\r\n\x85\u2028\u2029') + + # The previous character is a space. + previous_space = False + + # The previous character is a break. + previous_break = False + + index = 0 + while index < len(scalar): + ch = scalar[index] + + # Check for indicators. + if index == 0: + # Leading indicators are special characters. + if ch in '#,[]{}&*!|>\'\"%@`': + flow_indicators = True + block_indicators = True + if ch in '?:': + flow_indicators = True + if followed_by_whitespace: + block_indicators = True + if ch == '-' and followed_by_whitespace: + flow_indicators = True + block_indicators = True + else: + # Some indicators cannot appear within a scalar as well. + if ch in ',?[]{}': + flow_indicators = True + if ch == ':': + flow_indicators = True + if followed_by_whitespace: + block_indicators = True + if ch == '#' and preceded_by_whitespace: + flow_indicators = True + block_indicators = True + + # Check for line breaks, special, and unicode characters. + if ch in '\n\x85\u2028\u2029': + line_breaks = True + if not (ch == '\n' or '\x20' <= ch <= '\x7E'): + if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF' + or '\uE000' <= ch <= '\uFFFD' + or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF': + unicode_characters = True + if not self.allow_unicode: + special_characters = True + else: + special_characters = True + + # Detect important whitespace combinations. 
+ if ch == ' ': + if index == 0: + leading_space = True + if index == len(scalar)-1: + trailing_space = True + if previous_break: + break_space = True + previous_space = True + previous_break = False + elif ch in '\n\x85\u2028\u2029': + if index == 0: + leading_break = True + if index == len(scalar)-1: + trailing_break = True + if previous_space: + space_break = True + previous_space = False + previous_break = True + else: + previous_space = False + previous_break = False + + # Prepare for the next character. + index += 1 + preceded_by_whitespace = (ch in '\0 \t\r\n\x85\u2028\u2029') + followed_by_whitespace = (index+1 >= len(scalar) or + scalar[index+1] in '\0 \t\r\n\x85\u2028\u2029') + + # Let's decide what styles are allowed. + allow_flow_plain = True + allow_block_plain = True + allow_single_quoted = True + allow_double_quoted = True + allow_block = True + + # Leading and trailing whitespaces are bad for plain scalars. + if (leading_space or leading_break + or trailing_space or trailing_break): + allow_flow_plain = allow_block_plain = False + + # We do not permit trailing spaces for block scalars. + if trailing_space: + allow_block = False + + # Spaces at the beginning of a new line are only acceptable for block + # scalars. + if break_space: + allow_flow_plain = allow_block_plain = allow_single_quoted = False + + # Spaces followed by breaks, as well as special character are only + # allowed for double quoted scalars. + if space_break or special_characters: + allow_flow_plain = allow_block_plain = \ + allow_single_quoted = allow_block = False + + # Although the plain scalar writer supports breaks, we never emit + # multiline plain scalars. + if line_breaks: + allow_flow_plain = allow_block_plain = False + + # Flow indicators are forbidden for flow plain scalars. + if flow_indicators: + allow_flow_plain = False + + # Block indicators are forbidden for block plain scalars. + if block_indicators: + allow_block_plain = False + + return ScalarAnalysis(scalar=scalar, + empty=False, multiline=line_breaks, + allow_flow_plain=allow_flow_plain, + allow_block_plain=allow_block_plain, + allow_single_quoted=allow_single_quoted, + allow_double_quoted=allow_double_quoted, + allow_block=allow_block) + + # Writers. + + def flush_stream(self): + if hasattr(self.stream, 'flush'): + self.stream.flush() + + def write_stream_start(self): + # Write BOM if needed. 
+ if self.encoding and self.encoding.startswith('utf-16'): + self.stream.write('\uFEFF'.encode(self.encoding)) + + def write_stream_end(self): + self.flush_stream() + + def write_indicator(self, indicator, need_whitespace, + whitespace=False, indention=False): + if self.whitespace or not need_whitespace: + data = indicator + else: + data = ' '+indicator + self.whitespace = whitespace + self.indention = self.indention and indention + self.column += len(data) + self.open_ended = False + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + + def write_indent(self): + indent = self.indent or 0 + if not self.indention or self.column > indent \ + or (self.column == indent and not self.whitespace): + self.write_line_break() + if self.column < indent: + self.whitespace = True + data = ' '*(indent-self.column) + self.column = indent + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + + def write_line_break(self, data=None): + if data is None: + data = self.best_line_break + self.whitespace = True + self.indention = True + self.line += 1 + self.column = 0 + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + + def write_version_directive(self, version_text): + data = '%%YAML %s' % version_text + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + self.write_line_break() + + def write_tag_directive(self, handle_text, prefix_text): + data = '%%TAG %s %s' % (handle_text, prefix_text) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + self.write_line_break() + + # Scalar streams. + + def write_single_quoted(self, text, split=True): + self.write_indicator('\'', True) + spaces = False + breaks = False + start = end = 0 + while end <= len(text): + ch = None + if end < len(text): + ch = text[end] + if spaces: + if ch is None or ch != ' ': + if start+1 == end and self.column > self.best_width and split \ + and start != 0 and end != len(text): + self.write_indent() + else: + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + elif breaks: + if ch is None or ch not in '\n\x85\u2028\u2029': + if text[start] == '\n': + self.write_line_break() + for br in text[start:end]: + if br == '\n': + self.write_line_break() + else: + self.write_line_break(br) + self.write_indent() + start = end + else: + if ch is None or ch in ' \n\x85\u2028\u2029' or ch == '\'': + if start < end: + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + if ch == '\'': + data = '\'\'' + self.column += 2 + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + 1 + if ch is not None: + spaces = (ch == ' ') + breaks = (ch in '\n\x85\u2028\u2029') + end += 1 + self.write_indicator('\'', False) + + ESCAPE_REPLACEMENTS = { + '\0': '0', + '\x07': 'a', + '\x08': 'b', + '\x09': 't', + '\x0A': 'n', + '\x0B': 'v', + '\x0C': 'f', + '\x0D': 'r', + '\x1B': 'e', + '\"': '\"', + '\\': '\\', + '\x85': 'N', + '\xA0': '_', + '\u2028': 'L', + '\u2029': 'P', + } + + def write_double_quoted(self, text, split=True): + self.write_indicator('"', True) + start = end = 0 + while end <= len(text): + ch = None + if end < len(text): + ch = text[end] + if ch is None or ch in '"\\\x85\u2028\u2029\uFEFF' \ + or not ('\x20' <= ch <= '\x7E' + or (self.allow_unicode + and ('\xA0' <= ch <= '\uD7FF' + 
or '\uE000' <= ch <= '\uFFFD'))): + if start < end: + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + if ch is not None: + if ch in self.ESCAPE_REPLACEMENTS: + data = '\\'+self.ESCAPE_REPLACEMENTS[ch] + elif ch <= '\xFF': + data = '\\x%02X' % ord(ch) + elif ch <= '\uFFFF': + data = '\\u%04X' % ord(ch) + else: + data = '\\U%08X' % ord(ch) + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end+1 + if 0 < end < len(text)-1 and (ch == ' ' or start >= end) \ + and self.column+(end-start) > self.best_width and split: + data = text[start:end]+'\\' + if start < end: + start = end + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + self.write_indent() + self.whitespace = False + self.indention = False + if text[start] == ' ': + data = '\\' + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + end += 1 + self.write_indicator('"', False) + + def determine_block_hints(self, text): + hints = '' + if text: + if text[0] in ' \n\x85\u2028\u2029': + hints += str(self.best_indent) + if text[-1] not in '\n\x85\u2028\u2029': + hints += '-' + elif len(text) == 1 or text[-2] in '\n\x85\u2028\u2029': + hints += '+' + return hints + + def write_folded(self, text): + hints = self.determine_block_hints(text) + self.write_indicator('>'+hints, True) + if hints[-1:] == '+': + self.open_ended = True + self.write_line_break() + leading_space = True + spaces = False + breaks = True + start = end = 0 + while end <= len(text): + ch = None + if end < len(text): + ch = text[end] + if breaks: + if ch is None or ch not in '\n\x85\u2028\u2029': + if not leading_space and ch is not None and ch != ' ' \ + and text[start] == '\n': + self.write_line_break() + leading_space = (ch == ' ') + for br in text[start:end]: + if br == '\n': + self.write_line_break() + else: + self.write_line_break(br) + if ch is not None: + self.write_indent() + start = end + elif spaces: + if ch != ' ': + if start+1 == end and self.column > self.best_width: + self.write_indent() + else: + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + else: + if ch is None or ch in ' \n\x85\u2028\u2029': + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + if ch is None: + self.write_line_break() + start = end + if ch is not None: + breaks = (ch in '\n\x85\u2028\u2029') + spaces = (ch == ' ') + end += 1 + + def write_literal(self, text): + hints = self.determine_block_hints(text) + self.write_indicator('|'+hints, True) + if hints[-1:] == '+': + self.open_ended = True + self.write_line_break() + breaks = True + start = end = 0 + while end <= len(text): + ch = None + if end < len(text): + ch = text[end] + if breaks: + if ch is None or ch not in '\n\x85\u2028\u2029': + for br in text[start:end]: + if br == '\n': + self.write_line_break() + else: + self.write_line_break(br) + if ch is not None: + self.write_indent() + start = end + else: + if ch is None or ch in '\n\x85\u2028\u2029': + data = text[start:end] + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + if ch is None: + self.write_line_break() + start = end + if ch is not None: + breaks = (ch in '\n\x85\u2028\u2029') + end += 1 + 
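
write_folded and write_literal above are reached only when choose_scalar_style accepts a '>' or '|' request, and determine_block_hints prepends the indentation/chomping indicators. A hedged sketch (assuming the stock dump helper):

    import yaml

    # default_style='|' asks for literal block scalars; the key still
    # falls back to double quoting, since a block scalar cannot be a
    # simple key.
    print(yaml.dump({'text': 'line one\nline two\n'}, default_style='|'))
    # "text": |
    #   line one
    #   line two
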
+ def write_plain(self, text, split=True): + if self.root_context: + self.open_ended = True + if not text: + return + if not self.whitespace: + data = ' ' + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + self.whitespace = False + self.indention = False + spaces = False + breaks = False + start = end = 0 + while end <= len(text): + ch = None + if end < len(text): + ch = text[end] + if spaces: + if ch != ' ': + if start+1 == end and self.column > self.best_width and split: + self.write_indent() + self.whitespace = False + self.indention = False + else: + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + elif breaks: + if ch not in '\n\x85\u2028\u2029': + if text[start] == '\n': + self.write_line_break() + for br in text[start:end]: + if br == '\n': + self.write_line_break() + else: + self.write_line_break(br) + self.write_indent() + self.whitespace = False + self.indention = False + start = end + else: + if ch is None or ch in ' \n\x85\u2028\u2029': + data = text[start:end] + self.column += len(data) + if self.encoding: + data = data.encode(self.encoding) + self.stream.write(data) + start = end + if ch is not None: + spaces = (ch == ' ') + breaks = (ch in '\n\x85\u2028\u2029') + end += 1 diff --git a/libs3/yaml/error.py b/libs3/yaml/error.py new file mode 100644 index 000000000..b796b4dc5 --- /dev/null +++ b/libs3/yaml/error.py @@ -0,0 +1,75 @@ + +__all__ = ['Mark', 'YAMLError', 'MarkedYAMLError'] + +class Mark: + + def __init__(self, name, index, line, column, buffer, pointer): + self.name = name + self.index = index + self.line = line + self.column = column + self.buffer = buffer + self.pointer = pointer + + def get_snippet(self, indent=4, max_length=75): + if self.buffer is None: + return None + head = '' + start = self.pointer + while start > 0 and self.buffer[start-1] not in '\0\r\n\x85\u2028\u2029': + start -= 1 + if self.pointer-start > max_length/2-1: + head = ' ... ' + start += 5 + break + tail = '' + end = self.pointer + while end < len(self.buffer) and self.buffer[end] not in '\0\r\n\x85\u2028\u2029': + end += 1 + if end-self.pointer > max_length/2-1: + tail = ' ... 
' + end -= 5 + break + snippet = self.buffer[start:end] + return ' '*indent + head + snippet + tail + '\n' \ + + ' '*(indent+self.pointer-start+len(head)) + '^' + + def __str__(self): + snippet = self.get_snippet() + where = " in \"%s\", line %d, column %d" \ + % (self.name, self.line+1, self.column+1) + if snippet is not None: + where += ":\n"+snippet + return where + +class YAMLError(Exception): + pass + +class MarkedYAMLError(YAMLError): + + def __init__(self, context=None, context_mark=None, + problem=None, problem_mark=None, note=None): + self.context = context + self.context_mark = context_mark + self.problem = problem + self.problem_mark = problem_mark + self.note = note + + def __str__(self): + lines = [] + if self.context is not None: + lines.append(self.context) + if self.context_mark is not None \ + and (self.problem is None or self.problem_mark is None + or self.context_mark.name != self.problem_mark.name + or self.context_mark.line != self.problem_mark.line + or self.context_mark.column != self.problem_mark.column): + lines.append(str(self.context_mark)) + if self.problem is not None: + lines.append(self.problem) + if self.problem_mark is not None: + lines.append(str(self.problem_mark)) + if self.note is not None: + lines.append(self.note) + return '\n'.join(lines) + diff --git a/libs3/yaml/events.py b/libs3/yaml/events.py new file mode 100644 index 000000000..f79ad389c --- /dev/null +++ b/libs3/yaml/events.py @@ -0,0 +1,86 @@ + +# Abstract classes. + +class Event(object): + def __init__(self, start_mark=None, end_mark=None): + self.start_mark = start_mark + self.end_mark = end_mark + def __repr__(self): + attributes = [key for key in ['anchor', 'tag', 'implicit', 'value'] + if hasattr(self, key)] + arguments = ', '.join(['%s=%r' % (key, getattr(self, key)) + for key in attributes]) + return '%s(%s)' % (self.__class__.__name__, arguments) + +class NodeEvent(Event): + def __init__(self, anchor, start_mark=None, end_mark=None): + self.anchor = anchor + self.start_mark = start_mark + self.end_mark = end_mark + +class CollectionStartEvent(NodeEvent): + def __init__(self, anchor, tag, implicit, start_mark=None, end_mark=None, + flow_style=None): + self.anchor = anchor + self.tag = tag + self.implicit = implicit + self.start_mark = start_mark + self.end_mark = end_mark + self.flow_style = flow_style + +class CollectionEndEvent(Event): + pass + +# Implementations. 
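
The concrete event classes below are exactly what the parser yields and what the Emitter consumes; iterating them is the quickest way to see the stream grammar in action (assuming the usual parse helper is exposed):

    import yaml

    for event in yaml.parse("- a\n- b\n"):
        print(event)
    # StreamStartEvent(), DocumentStartEvent(), SequenceStartEvent(...),
    # ScalarEvent(... value='a'), ScalarEvent(... value='b'),
    # SequenceEndEvent(), DocumentEndEvent(), StreamEndEvent()
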
+
+class StreamStartEvent(Event):
+ def __init__(self, start_mark=None, end_mark=None, encoding=None):
+ self.start_mark = start_mark
+ self.end_mark = end_mark
+ self.encoding = encoding
+
+class StreamEndEvent(Event):
+ pass
+
+class DocumentStartEvent(Event):
+ def __init__(self, start_mark=None, end_mark=None,
+ explicit=None, version=None, tags=None):
+ self.start_mark = start_mark
+ self.end_mark = end_mark
+ self.explicit = explicit
+ self.version = version
+ self.tags = tags
+
+class DocumentEndEvent(Event):
+ def __init__(self, start_mark=None, end_mark=None,
+ explicit=None):
+ self.start_mark = start_mark
+ self.end_mark = end_mark
+ self.explicit = explicit
+
+class AliasEvent(NodeEvent):
+ pass
+
+class ScalarEvent(NodeEvent):
+ def __init__(self, anchor, tag, implicit, value,
+ start_mark=None, end_mark=None, style=None):
+ self.anchor = anchor
+ self.tag = tag
+ self.implicit = implicit
+ self.value = value
+ self.start_mark = start_mark
+ self.end_mark = end_mark
+ self.style = style
+
+class SequenceStartEvent(CollectionStartEvent):
+ pass
+
+class SequenceEndEvent(CollectionEndEvent):
+ pass
+
+class MappingStartEvent(CollectionStartEvent):
+ pass
+
+class MappingEndEvent(CollectionEndEvent):
+ pass
+
diff --git a/libs3/yaml/loader.py b/libs3/yaml/loader.py
new file mode 100644
index 000000000..414cb2c15
--- /dev/null
+++ b/libs3/yaml/loader.py
@@ -0,0 +1,63 @@
+
+__all__ = ['BaseLoader', 'FullLoader', 'SafeLoader', 'Loader', 'UnsafeLoader']
+
+from .reader import *
+from .scanner import *
+from .parser import *
+from .composer import *
+from .constructor import *
+from .resolver import *
+
+class BaseLoader(Reader, Scanner, Parser, Composer, BaseConstructor, BaseResolver):
+
+ def __init__(self, stream):
+ Reader.__init__(self, stream)
+ Scanner.__init__(self)
+ Parser.__init__(self)
+ Composer.__init__(self)
+ BaseConstructor.__init__(self)
+ BaseResolver.__init__(self)
+
+class FullLoader(Reader, Scanner, Parser, Composer, FullConstructor, Resolver):
+
+ def __init__(self, stream):
+ Reader.__init__(self, stream)
+ Scanner.__init__(self)
+ Parser.__init__(self)
+ Composer.__init__(self)
+ FullConstructor.__init__(self)
+ Resolver.__init__(self)
+
+class SafeLoader(Reader, Scanner, Parser, Composer, SafeConstructor, Resolver):
+
+ def __init__(self, stream):
+ Reader.__init__(self, stream)
+ Scanner.__init__(self)
+ Parser.__init__(self)
+ Composer.__init__(self)
+ SafeConstructor.__init__(self)
+ Resolver.__init__(self)
+
+class Loader(Reader, Scanner, Parser, Composer, Constructor, Resolver):
+
+ def __init__(self, stream):
+ Reader.__init__(self, stream)
+ Scanner.__init__(self)
+ Parser.__init__(self)
+ Composer.__init__(self)
+ Constructor.__init__(self)
+ Resolver.__init__(self)
+
+# UnsafeLoader is the same as Loader (which is and was always unsafe on
+# untrusted input). Use of either Loader or UnsafeLoader should be rare, since
+# FullLoader should be able to load almost all YAML safely. Loader is left intact
+# to ensure backwards compatibility. 
+class UnsafeLoader(Reader, Scanner, Parser, Composer, Constructor, Resolver): + + def __init__(self, stream): + Reader.__init__(self, stream) + Scanner.__init__(self) + Parser.__init__(self) + Composer.__init__(self) + Constructor.__init__(self) + Resolver.__init__(self) diff --git a/libs3/yaml/nodes.py b/libs3/yaml/nodes.py new file mode 100644 index 000000000..c4f070c41 --- /dev/null +++ b/libs3/yaml/nodes.py @@ -0,0 +1,49 @@ + +class Node(object): + def __init__(self, tag, value, start_mark, end_mark): + self.tag = tag + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + def __repr__(self): + value = self.value + #if isinstance(value, list): + # if len(value) == 0: + # value = '<empty>' + # elif len(value) == 1: + # value = '<1 item>' + # else: + # value = '<%d items>' % len(value) + #else: + # if len(value) > 75: + # value = repr(value[:70]+u' ... ') + # else: + # value = repr(value) + value = repr(value) + return '%s(tag=%r, value=%s)' % (self.__class__.__name__, self.tag, value) + +class ScalarNode(Node): + id = 'scalar' + def __init__(self, tag, value, + start_mark=None, end_mark=None, style=None): + self.tag = tag + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + self.style = style + +class CollectionNode(Node): + def __init__(self, tag, value, + start_mark=None, end_mark=None, flow_style=None): + self.tag = tag + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + self.flow_style = flow_style + +class SequenceNode(CollectionNode): + id = 'sequence' + +class MappingNode(CollectionNode): + id = 'mapping' + diff --git a/libs3/yaml/parser.py b/libs3/yaml/parser.py new file mode 100644 index 000000000..13a5995d2 --- /dev/null +++ b/libs3/yaml/parser.py @@ -0,0 +1,589 @@ + +# The following YAML grammar is LL(1) and is parsed by a recursive descent +# parser. +# +# stream ::= STREAM-START implicit_document? explicit_document* STREAM-END +# implicit_document ::= block_node DOCUMENT-END* +# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END* +# block_node_or_indentless_sequence ::= +# ALIAS +# | properties (block_content | indentless_block_sequence)? +# | block_content +# | indentless_block_sequence +# block_node ::= ALIAS +# | properties block_content? +# | block_content +# flow_node ::= ALIAS +# | properties flow_content? +# | flow_content +# properties ::= TAG ANCHOR? | ANCHOR TAG? +# block_content ::= block_collection | flow_collection | SCALAR +# flow_content ::= flow_collection | SCALAR +# block_collection ::= block_sequence | block_mapping +# flow_collection ::= flow_sequence | flow_mapping +# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END +# indentless_sequence ::= (BLOCK-ENTRY block_node?)+ +# block_mapping ::= BLOCK-MAPPING_START +# ((KEY block_node_or_indentless_sequence?)? +# (VALUE block_node_or_indentless_sequence?)?)* +# BLOCK-END +# flow_sequence ::= FLOW-SEQUENCE-START +# (flow_sequence_entry FLOW-ENTRY)* +# flow_sequence_entry? +# FLOW-SEQUENCE-END +# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)? +# flow_mapping ::= FLOW-MAPPING-START +# (flow_mapping_entry FLOW-ENTRY)* +# flow_mapping_entry? +# FLOW-MAPPING-END +# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)? 
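+#
+# Editorial illustration (not part of the upstream notes): the document
+# "key: [1, 2]" derives as a block_mapping whose KEY is a plain SCALAR and
+# whose VALUE reduces through block_content and flow_collection to the
+# flow_sequence "[1, 2]".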
+# +# FIRST sets: +# +# stream: { STREAM-START } +# explicit_document: { DIRECTIVE DOCUMENT-START } +# implicit_document: FIRST(block_node) +# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START } +# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START } +# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR } +# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR } +# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START } +# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START } +# block_sequence: { BLOCK-SEQUENCE-START } +# block_mapping: { BLOCK-MAPPING-START } +# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY } +# indentless_sequence: { ENTRY } +# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START } +# flow_sequence: { FLOW-SEQUENCE-START } +# flow_mapping: { FLOW-MAPPING-START } +# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY } +# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY } + +__all__ = ['Parser', 'ParserError'] + +from .error import MarkedYAMLError +from .tokens import * +from .events import * +from .scanner import * + +class ParserError(MarkedYAMLError): + pass + +class Parser: + # Since writing a recursive-descendant parser is a straightforward task, we + # do not give many comments here. + + DEFAULT_TAGS = { + '!': '!', + '!!': 'tag:yaml.org,2002:', + } + + def __init__(self): + self.current_event = None + self.yaml_version = None + self.tag_handles = {} + self.states = [] + self.marks = [] + self.state = self.parse_stream_start + + def dispose(self): + # Reset the state attributes (to clear self-references) + self.states = [] + self.state = None + + def check_event(self, *choices): + # Check the type of the next event. + if self.current_event is None: + if self.state: + self.current_event = self.state() + if self.current_event is not None: + if not choices: + return True + for choice in choices: + if isinstance(self.current_event, choice): + return True + return False + + def peek_event(self): + # Get the next event. + if self.current_event is None: + if self.state: + self.current_event = self.state() + return self.current_event + + def get_event(self): + # Get the next event and proceed further. + if self.current_event is None: + if self.state: + self.current_event = self.state() + value = self.current_event + self.current_event = None + return value + + # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END + # implicit_document ::= block_node DOCUMENT-END* + # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END* + + def parse_stream_start(self): + + # Parse the stream start. + token = self.get_token() + event = StreamStartEvent(token.start_mark, token.end_mark, + encoding=token.encoding) + + # Prepare the next state. + self.state = self.parse_implicit_document_start + + return event + + def parse_implicit_document_start(self): + + # Parse an implicit document. + if not self.check_token(DirectiveToken, DocumentStartToken, + StreamEndToken): + self.tag_handles = self.DEFAULT_TAGS + token = self.peek_token() + start_mark = end_mark = token.start_mark + event = DocumentStartEvent(start_mark, end_mark, + explicit=False) + + # Prepare the next state. 
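+ # Editorial note: `states` acts as the parser's return stack; pushing
+ # parse_document_end here means that once the root block node has been
+ # parsed, control resumes at the document-end handler.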
+ self.states.append(self.parse_document_end) + self.state = self.parse_block_node + + return event + + else: + return self.parse_document_start() + + def parse_document_start(self): + + # Parse any extra document end indicators. + while self.check_token(DocumentEndToken): + self.get_token() + + # Parse an explicit document. + if not self.check_token(StreamEndToken): + token = self.peek_token() + start_mark = token.start_mark + version, tags = self.process_directives() + if not self.check_token(DocumentStartToken): + raise ParserError(None, None, + "expected '<document start>', but found %r" + % self.peek_token().id, + self.peek_token().start_mark) + token = self.get_token() + end_mark = token.end_mark + event = DocumentStartEvent(start_mark, end_mark, + explicit=True, version=version, tags=tags) + self.states.append(self.parse_document_end) + self.state = self.parse_document_content + else: + # Parse the end of the stream. + token = self.get_token() + event = StreamEndEvent(token.start_mark, token.end_mark) + assert not self.states + assert not self.marks + self.state = None + return event + + def parse_document_end(self): + + # Parse the document end. + token = self.peek_token() + start_mark = end_mark = token.start_mark + explicit = False + if self.check_token(DocumentEndToken): + token = self.get_token() + end_mark = token.end_mark + explicit = True + event = DocumentEndEvent(start_mark, end_mark, + explicit=explicit) + + # Prepare the next state. + self.state = self.parse_document_start + + return event + + def parse_document_content(self): + if self.check_token(DirectiveToken, + DocumentStartToken, DocumentEndToken, StreamEndToken): + event = self.process_empty_scalar(self.peek_token().start_mark) + self.state = self.states.pop() + return event + else: + return self.parse_block_node() + + def process_directives(self): + self.yaml_version = None + self.tag_handles = {} + while self.check_token(DirectiveToken): + token = self.get_token() + if token.name == 'YAML': + if self.yaml_version is not None: + raise ParserError(None, None, + "found duplicate YAML directive", token.start_mark) + major, minor = token.value + if major != 1: + raise ParserError(None, None, + "found incompatible YAML document (version 1.* is required)", + token.start_mark) + self.yaml_version = token.value + elif token.name == 'TAG': + handle, prefix = token.value + if handle in self.tag_handles: + raise ParserError(None, None, + "duplicate tag handle %r" % handle, + token.start_mark) + self.tag_handles[handle] = prefix + if self.tag_handles: + value = self.yaml_version, self.tag_handles.copy() + else: + value = self.yaml_version, None + for key in self.DEFAULT_TAGS: + if key not in self.tag_handles: + self.tag_handles[key] = self.DEFAULT_TAGS[key] + return value + + # block_node_or_indentless_sequence ::= ALIAS + # | properties (block_content | indentless_block_sequence)? + # | block_content + # | indentless_block_sequence + # block_node ::= ALIAS + # | properties block_content? + # | block_content + # flow_node ::= ALIAS + # | properties flow_content? + # | flow_content + # properties ::= TAG ANCHOR? | ANCHOR TAG? 
+ # block_content ::= block_collection | flow_collection | SCALAR + # flow_content ::= flow_collection | SCALAR + # block_collection ::= block_sequence | block_mapping + # flow_collection ::= flow_sequence | flow_mapping + + def parse_block_node(self): + return self.parse_node(block=True) + + def parse_flow_node(self): + return self.parse_node() + + def parse_block_node_or_indentless_sequence(self): + return self.parse_node(block=True, indentless_sequence=True) + + def parse_node(self, block=False, indentless_sequence=False): + if self.check_token(AliasToken): + token = self.get_token() + event = AliasEvent(token.value, token.start_mark, token.end_mark) + self.state = self.states.pop() + else: + anchor = None + tag = None + start_mark = end_mark = tag_mark = None + if self.check_token(AnchorToken): + token = self.get_token() + start_mark = token.start_mark + end_mark = token.end_mark + anchor = token.value + if self.check_token(TagToken): + token = self.get_token() + tag_mark = token.start_mark + end_mark = token.end_mark + tag = token.value + elif self.check_token(TagToken): + token = self.get_token() + start_mark = tag_mark = token.start_mark + end_mark = token.end_mark + tag = token.value + if self.check_token(AnchorToken): + token = self.get_token() + end_mark = token.end_mark + anchor = token.value + if tag is not None: + handle, suffix = tag + if handle is not None: + if handle not in self.tag_handles: + raise ParserError("while parsing a node", start_mark, + "found undefined tag handle %r" % handle, + tag_mark) + tag = self.tag_handles[handle]+suffix + else: + tag = suffix + #if tag == '!': + # raise ParserError("while parsing a node", start_mark, + # "found non-specific tag '!'", tag_mark, + # "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.") + if start_mark is None: + start_mark = end_mark = self.peek_token().start_mark + event = None + implicit = (tag is None or tag == '!') + if indentless_sequence and self.check_token(BlockEntryToken): + end_mark = self.peek_token().end_mark + event = SequenceStartEvent(anchor, tag, implicit, + start_mark, end_mark) + self.state = self.parse_indentless_sequence_entry + else: + if self.check_token(ScalarToken): + token = self.get_token() + end_mark = token.end_mark + if (token.plain and tag is None) or tag == '!': + implicit = (True, False) + elif tag is None: + implicit = (False, True) + else: + implicit = (False, False) + event = ScalarEvent(anchor, tag, implicit, token.value, + start_mark, end_mark, style=token.style) + self.state = self.states.pop() + elif self.check_token(FlowSequenceStartToken): + end_mark = self.peek_token().end_mark + event = SequenceStartEvent(anchor, tag, implicit, + start_mark, end_mark, flow_style=True) + self.state = self.parse_flow_sequence_first_entry + elif self.check_token(FlowMappingStartToken): + end_mark = self.peek_token().end_mark + event = MappingStartEvent(anchor, tag, implicit, + start_mark, end_mark, flow_style=True) + self.state = self.parse_flow_mapping_first_key + elif block and self.check_token(BlockSequenceStartToken): + end_mark = self.peek_token().start_mark + event = SequenceStartEvent(anchor, tag, implicit, + start_mark, end_mark, flow_style=False) + self.state = self.parse_block_sequence_first_entry + elif block and self.check_token(BlockMappingStartToken): + end_mark = self.peek_token().start_mark + event = MappingStartEvent(anchor, tag, implicit, + start_mark, end_mark, flow_style=False) + self.state = self.parse_block_mapping_first_key + elif anchor is not 
None or tag is not None: + # Empty scalars are allowed even if a tag or an anchor is + # specified. + event = ScalarEvent(anchor, tag, (implicit, False), '', + start_mark, end_mark) + self.state = self.states.pop() + else: + if block: + node = 'block' + else: + node = 'flow' + token = self.peek_token() + raise ParserError("while parsing a %s node" % node, start_mark, + "expected the node content, but found %r" % token.id, + token.start_mark) + return event + + # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END + + def parse_block_sequence_first_entry(self): + token = self.get_token() + self.marks.append(token.start_mark) + return self.parse_block_sequence_entry() + + def parse_block_sequence_entry(self): + if self.check_token(BlockEntryToken): + token = self.get_token() + if not self.check_token(BlockEntryToken, BlockEndToken): + self.states.append(self.parse_block_sequence_entry) + return self.parse_block_node() + else: + self.state = self.parse_block_sequence_entry + return self.process_empty_scalar(token.end_mark) + if not self.check_token(BlockEndToken): + token = self.peek_token() + raise ParserError("while parsing a block collection", self.marks[-1], + "expected <block end>, but found %r" % token.id, token.start_mark) + token = self.get_token() + event = SequenceEndEvent(token.start_mark, token.end_mark) + self.state = self.states.pop() + self.marks.pop() + return event + + # indentless_sequence ::= (BLOCK-ENTRY block_node?)+ + + def parse_indentless_sequence_entry(self): + if self.check_token(BlockEntryToken): + token = self.get_token() + if not self.check_token(BlockEntryToken, + KeyToken, ValueToken, BlockEndToken): + self.states.append(self.parse_indentless_sequence_entry) + return self.parse_block_node() + else: + self.state = self.parse_indentless_sequence_entry + return self.process_empty_scalar(token.end_mark) + token = self.peek_token() + event = SequenceEndEvent(token.start_mark, token.start_mark) + self.state = self.states.pop() + return event + + # block_mapping ::= BLOCK-MAPPING_START + # ((KEY block_node_or_indentless_sequence?)? 
+ # (VALUE block_node_or_indentless_sequence?)?)*
+ # BLOCK-END
+
+ def parse_block_mapping_first_key(self):
+ token = self.get_token()
+ self.marks.append(token.start_mark)
+ return self.parse_block_mapping_key()
+
+ def parse_block_mapping_key(self):
+ if self.check_token(KeyToken):
+ token = self.get_token()
+ if not self.check_token(KeyToken, ValueToken, BlockEndToken):
+ self.states.append(self.parse_block_mapping_value)
+ return self.parse_block_node_or_indentless_sequence()
+ else:
+ self.state = self.parse_block_mapping_value
+ return self.process_empty_scalar(token.end_mark)
+ if not self.check_token(BlockEndToken):
+ token = self.peek_token()
+ raise ParserError("while parsing a block mapping", self.marks[-1],
+ "expected <block end>, but found %r" % token.id, token.start_mark)
+ token = self.get_token()
+ event = MappingEndEvent(token.start_mark, token.end_mark)
+ self.state = self.states.pop()
+ self.marks.pop()
+ return event
+
+ def parse_block_mapping_value(self):
+ if self.check_token(ValueToken):
+ token = self.get_token()
+ if not self.check_token(KeyToken, ValueToken, BlockEndToken):
+ self.states.append(self.parse_block_mapping_key)
+ return self.parse_block_node_or_indentless_sequence()
+ else:
+ self.state = self.parse_block_mapping_key
+ return self.process_empty_scalar(token.end_mark)
+ else:
+ self.state = self.parse_block_mapping_key
+ token = self.peek_token()
+ return self.process_empty_scalar(token.start_mark)
+
+ # flow_sequence ::= FLOW-SEQUENCE-START
+ # (flow_sequence_entry FLOW-ENTRY)*
+ # flow_sequence_entry?
+ # FLOW-SEQUENCE-END
+ # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+ #
+ # Note that while production rules for both flow_sequence_entry and
+ # flow_mapping_entry are equal, their interpretations are different.
+ # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
+ # generates an inline mapping (set syntax).
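+ #
+ # Editorial illustration: in the flow sequence "[a: 1, b]" the entry
+ # "a: 1" is parsed as a one-pair nested mapping, so the loaded value
+ # is [{'a': 1}, 'b'].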
+ + def parse_flow_sequence_first_entry(self): + token = self.get_token() + self.marks.append(token.start_mark) + return self.parse_flow_sequence_entry(first=True) + + def parse_flow_sequence_entry(self, first=False): + if not self.check_token(FlowSequenceEndToken): + if not first: + if self.check_token(FlowEntryToken): + self.get_token() + else: + token = self.peek_token() + raise ParserError("while parsing a flow sequence", self.marks[-1], + "expected ',' or ']', but got %r" % token.id, token.start_mark) + + if self.check_token(KeyToken): + token = self.peek_token() + event = MappingStartEvent(None, None, True, + token.start_mark, token.end_mark, + flow_style=True) + self.state = self.parse_flow_sequence_entry_mapping_key + return event + elif not self.check_token(FlowSequenceEndToken): + self.states.append(self.parse_flow_sequence_entry) + return self.parse_flow_node() + token = self.get_token() + event = SequenceEndEvent(token.start_mark, token.end_mark) + self.state = self.states.pop() + self.marks.pop() + return event + + def parse_flow_sequence_entry_mapping_key(self): + token = self.get_token() + if not self.check_token(ValueToken, + FlowEntryToken, FlowSequenceEndToken): + self.states.append(self.parse_flow_sequence_entry_mapping_value) + return self.parse_flow_node() + else: + self.state = self.parse_flow_sequence_entry_mapping_value + return self.process_empty_scalar(token.end_mark) + + def parse_flow_sequence_entry_mapping_value(self): + if self.check_token(ValueToken): + token = self.get_token() + if not self.check_token(FlowEntryToken, FlowSequenceEndToken): + self.states.append(self.parse_flow_sequence_entry_mapping_end) + return self.parse_flow_node() + else: + self.state = self.parse_flow_sequence_entry_mapping_end + return self.process_empty_scalar(token.end_mark) + else: + self.state = self.parse_flow_sequence_entry_mapping_end + token = self.peek_token() + return self.process_empty_scalar(token.start_mark) + + def parse_flow_sequence_entry_mapping_end(self): + self.state = self.parse_flow_sequence_entry + token = self.peek_token() + return MappingEndEvent(token.start_mark, token.start_mark) + + # flow_mapping ::= FLOW-MAPPING-START + # (flow_mapping_entry FLOW-ENTRY)* + # flow_mapping_entry? + # FLOW-MAPPING-END + # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)? 
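+ #
+ # Editorial illustration: in "{a: 1, b}" the entry 'b' carries no ':',
+ # so parse_flow_mapping_empty_value below supplies an empty scalar and
+ # the document loads as {'a': 1, 'b': None}.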
+
+ def parse_flow_mapping_first_key(self):
+ token = self.get_token()
+ self.marks.append(token.start_mark)
+ return self.parse_flow_mapping_key(first=True)
+
+ def parse_flow_mapping_key(self, first=False):
+ if not self.check_token(FlowMappingEndToken):
+ if not first:
+ if self.check_token(FlowEntryToken):
+ self.get_token()
+ else:
+ token = self.peek_token()
+ raise ParserError("while parsing a flow mapping", self.marks[-1],
+ "expected ',' or '}', but got %r" % token.id, token.start_mark)
+ if self.check_token(KeyToken):
+ token = self.get_token()
+ if not self.check_token(ValueToken,
+ FlowEntryToken, FlowMappingEndToken):
+ self.states.append(self.parse_flow_mapping_value)
+ return self.parse_flow_node()
+ else:
+ self.state = self.parse_flow_mapping_value
+ return self.process_empty_scalar(token.end_mark)
+ elif not self.check_token(FlowMappingEndToken):
+ self.states.append(self.parse_flow_mapping_empty_value)
+ return self.parse_flow_node()
+ token = self.get_token()
+ event = MappingEndEvent(token.start_mark, token.end_mark)
+ self.state = self.states.pop()
+ self.marks.pop()
+ return event
+
+ def parse_flow_mapping_value(self):
+ if self.check_token(ValueToken):
+ token = self.get_token()
+ if not self.check_token(FlowEntryToken, FlowMappingEndToken):
+ self.states.append(self.parse_flow_mapping_key)
+ return self.parse_flow_node()
+ else:
+ self.state = self.parse_flow_mapping_key
+ return self.process_empty_scalar(token.end_mark)
+ else:
+ self.state = self.parse_flow_mapping_key
+ token = self.peek_token()
+ return self.process_empty_scalar(token.start_mark)
+
+ def parse_flow_mapping_empty_value(self):
+ self.state = self.parse_flow_mapping_key
+ return self.process_empty_scalar(self.peek_token().start_mark)
+
+ def process_empty_scalar(self, mark):
+ return ScalarEvent(None, None, (True, False), '', mark, mark)
+
diff --git a/libs3/yaml/reader.py b/libs3/yaml/reader.py
new file mode 100644
index 000000000..774b0219b
--- /dev/null
+++ b/libs3/yaml/reader.py
@@ -0,0 +1,185 @@
+# This module contains abstractions for the input stream. You don't have to
+# look further; there is no pretty code here.
+#
+# We define two classes here.
+#
+# Mark(source, line, column)
+# It's just a record and its only use is producing nice error messages.
+# Parser does not use it for any other purposes.
+#
+# Reader(source, data)
+# Reader determines the encoding of `data` and converts it to unicode.
+# Reader provides the following methods and attributes:
+# reader.peek(length=1) - return the next `length` characters
+# reader.forward(length=1) - move the current position `length` characters forward.
+# reader.index - the number of the current character.
+# reader.line, reader.column - the line and the column of the current character.
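+#
+# Editorial sketch (not part of the upstream module) of how the scanner
+# typically drives a Reader; the constructor accepts str, bytes or a
+# file-like object:
+#
+# r = Reader("a: 1")
+# while r.peek() != '\0': # peek() never consumes input
+# r.forward() # forward() is the only way to advance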
+ +__all__ = ['Reader', 'ReaderError'] + +from .error import YAMLError, Mark + +import codecs, re + +class ReaderError(YAMLError): + + def __init__(self, name, position, character, encoding, reason): + self.name = name + self.character = character + self.position = position + self.encoding = encoding + self.reason = reason + + def __str__(self): + if isinstance(self.character, bytes): + return "'%s' codec can't decode byte #x%02x: %s\n" \ + " in \"%s\", position %d" \ + % (self.encoding, ord(self.character), self.reason, + self.name, self.position) + else: + return "unacceptable character #x%04x: %s\n" \ + " in \"%s\", position %d" \ + % (self.character, self.reason, + self.name, self.position) + +class Reader(object): + # Reader: + # - determines the data encoding and converts it to a unicode string, + # - checks if characters are in allowed range, + # - adds '\0' to the end. + + # Reader accepts + # - a `bytes` object, + # - a `str` object, + # - a file-like object with its `read` method returning `str`, + # - a file-like object with its `read` method returning `unicode`. + + # Yeah, it's ugly and slow. + + def __init__(self, stream): + self.name = None + self.stream = None + self.stream_pointer = 0 + self.eof = True + self.buffer = '' + self.pointer = 0 + self.raw_buffer = None + self.raw_decode = None + self.encoding = None + self.index = 0 + self.line = 0 + self.column = 0 + if isinstance(stream, str): + self.name = "<unicode string>" + self.check_printable(stream) + self.buffer = stream+'\0' + elif isinstance(stream, bytes): + self.name = "<byte string>" + self.raw_buffer = stream + self.determine_encoding() + else: + self.stream = stream + self.name = getattr(stream, 'name', "<file>") + self.eof = False + self.raw_buffer = None + self.determine_encoding() + + def peek(self, index=0): + try: + return self.buffer[self.pointer+index] + except IndexError: + self.update(index+1) + return self.buffer[self.pointer+index] + + def prefix(self, length=1): + if self.pointer+length >= len(self.buffer): + self.update(length) + return self.buffer[self.pointer:self.pointer+length] + + def forward(self, length=1): + if self.pointer+length+1 >= len(self.buffer): + self.update(length+1) + while length: + ch = self.buffer[self.pointer] + self.pointer += 1 + self.index += 1 + if ch in '\n\x85\u2028\u2029' \ + or (ch == '\r' and self.buffer[self.pointer] != '\n'): + self.line += 1 + self.column = 0 + elif ch != '\uFEFF': + self.column += 1 + length -= 1 + + def get_mark(self): + if self.stream is None: + return Mark(self.name, self.index, self.line, self.column, + self.buffer, self.pointer) + else: + return Mark(self.name, self.index, self.line, self.column, + None, None) + + def determine_encoding(self): + while not self.eof and (self.raw_buffer is None or len(self.raw_buffer) < 2): + self.update_raw() + if isinstance(self.raw_buffer, bytes): + if self.raw_buffer.startswith(codecs.BOM_UTF16_LE): + self.raw_decode = codecs.utf_16_le_decode + self.encoding = 'utf-16-le' + elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): + self.raw_decode = codecs.utf_16_be_decode + self.encoding = 'utf-16-be' + else: + self.raw_decode = codecs.utf_8_decode + self.encoding = 'utf-8' + self.update(1) + + NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]') + def check_printable(self, data): + match = self.NON_PRINTABLE.search(data) + if match: + character = match.group() + position = self.index+(len(self.buffer)-self.pointer)+match.start() + raise 
ReaderError(self.name, position, ord(character), + 'unicode', "special characters are not allowed") + + def update(self, length): + if self.raw_buffer is None: + return + self.buffer = self.buffer[self.pointer:] + self.pointer = 0 + while len(self.buffer) < length: + if not self.eof: + self.update_raw() + if self.raw_decode is not None: + try: + data, converted = self.raw_decode(self.raw_buffer, + 'strict', self.eof) + except UnicodeDecodeError as exc: + character = self.raw_buffer[exc.start] + if self.stream is not None: + position = self.stream_pointer-len(self.raw_buffer)+exc.start + else: + position = exc.start + raise ReaderError(self.name, position, character, + exc.encoding, exc.reason) + else: + data = self.raw_buffer + converted = len(data) + self.check_printable(data) + self.buffer += data + self.raw_buffer = self.raw_buffer[converted:] + if self.eof: + self.buffer += '\0' + self.raw_buffer = None + break + + def update_raw(self, size=4096): + data = self.stream.read(size) + if self.raw_buffer is None: + self.raw_buffer = data + else: + self.raw_buffer += data + self.stream_pointer += len(data) + if not data: + self.eof = True diff --git a/libs3/yaml/representer.py b/libs3/yaml/representer.py new file mode 100644 index 000000000..dd144017b --- /dev/null +++ b/libs3/yaml/representer.py @@ -0,0 +1,389 @@ + +__all__ = ['BaseRepresenter', 'SafeRepresenter', 'Representer', + 'RepresenterError'] + +from .error import * +from .nodes import * + +import datetime, sys, copyreg, types, base64, collections + +class RepresenterError(YAMLError): + pass + +class BaseRepresenter: + + yaml_representers = {} + yaml_multi_representers = {} + + def __init__(self, default_style=None, default_flow_style=False, sort_keys=True): + self.default_style = default_style + self.sort_keys = sort_keys + self.default_flow_style = default_flow_style + self.represented_objects = {} + self.object_keeper = [] + self.alias_key = None + + def represent(self, data): + node = self.represent_data(data) + self.serialize(node) + self.represented_objects = {} + self.object_keeper = [] + self.alias_key = None + + def represent_data(self, data): + if self.ignore_aliases(data): + self.alias_key = None + else: + self.alias_key = id(data) + if self.alias_key is not None: + if self.alias_key in self.represented_objects: + node = self.represented_objects[self.alias_key] + #if node is None: + # raise RepresenterError("recursive objects are not allowed: %r" % data) + return node + #self.represented_objects[alias_key] = None + self.object_keeper.append(data) + data_types = type(data).__mro__ + if data_types[0] in self.yaml_representers: + node = self.yaml_representers[data_types[0]](self, data) + else: + for data_type in data_types: + if data_type in self.yaml_multi_representers: + node = self.yaml_multi_representers[data_type](self, data) + break + else: + if None in self.yaml_multi_representers: + node = self.yaml_multi_representers[None](self, data) + elif None in self.yaml_representers: + node = self.yaml_representers[None](self, data) + else: + node = ScalarNode(None, str(data)) + #if alias_key is not None: + # self.represented_objects[alias_key] = node + return node + + @classmethod + def add_representer(cls, data_type, representer): + if not 'yaml_representers' in cls.__dict__: + cls.yaml_representers = cls.yaml_representers.copy() + cls.yaml_representers[data_type] = representer + + @classmethod + def add_multi_representer(cls, data_type, representer): + if not 'yaml_multi_representers' in cls.__dict__: + 
cls.yaml_multi_representers = cls.yaml_multi_representers.copy() + cls.yaml_multi_representers[data_type] = representer + + def represent_scalar(self, tag, value, style=None): + if style is None: + style = self.default_style + node = ScalarNode(tag, value, style=style) + if self.alias_key is not None: + self.represented_objects[self.alias_key] = node + return node + + def represent_sequence(self, tag, sequence, flow_style=None): + value = [] + node = SequenceNode(tag, value, flow_style=flow_style) + if self.alias_key is not None: + self.represented_objects[self.alias_key] = node + best_style = True + for item in sequence: + node_item = self.represent_data(item) + if not (isinstance(node_item, ScalarNode) and not node_item.style): + best_style = False + value.append(node_item) + if flow_style is None: + if self.default_flow_style is not None: + node.flow_style = self.default_flow_style + else: + node.flow_style = best_style + return node + + def represent_mapping(self, tag, mapping, flow_style=None): + value = [] + node = MappingNode(tag, value, flow_style=flow_style) + if self.alias_key is not None: + self.represented_objects[self.alias_key] = node + best_style = True + if hasattr(mapping, 'items'): + mapping = list(mapping.items()) + if self.sort_keys: + try: + mapping = sorted(mapping) + except TypeError: + pass + for item_key, item_value in mapping: + node_key = self.represent_data(item_key) + node_value = self.represent_data(item_value) + if not (isinstance(node_key, ScalarNode) and not node_key.style): + best_style = False + if not (isinstance(node_value, ScalarNode) and not node_value.style): + best_style = False + value.append((node_key, node_value)) + if flow_style is None: + if self.default_flow_style is not None: + node.flow_style = self.default_flow_style + else: + node.flow_style = best_style + return node + + def ignore_aliases(self, data): + return False + +class SafeRepresenter(BaseRepresenter): + + def ignore_aliases(self, data): + if data is None: + return True + if isinstance(data, tuple) and data == (): + return True + if isinstance(data, (str, bytes, bool, int, float)): + return True + + def represent_none(self, data): + return self.represent_scalar('tag:yaml.org,2002:null', 'null') + + def represent_str(self, data): + return self.represent_scalar('tag:yaml.org,2002:str', data) + + def represent_binary(self, data): + if hasattr(base64, 'encodebytes'): + data = base64.encodebytes(data).decode('ascii') + else: + data = base64.encodestring(data).decode('ascii') + return self.represent_scalar('tag:yaml.org,2002:binary', data, style='|') + + def represent_bool(self, data): + if data: + value = 'true' + else: + value = 'false' + return self.represent_scalar('tag:yaml.org,2002:bool', value) + + def represent_int(self, data): + return self.represent_scalar('tag:yaml.org,2002:int', str(data)) + + inf_value = 1e300 + while repr(inf_value) != repr(inf_value*inf_value): + inf_value *= inf_value + + def represent_float(self, data): + if data != data or (data == 0.0 and data == 1.0): + value = '.nan' + elif data == self.inf_value: + value = '.inf' + elif data == -self.inf_value: + value = '-.inf' + else: + value = repr(data).lower() + # Note that in some cases `repr(data)` represents a float number + # without the decimal parts. For instance: + # >>> repr(1e17) + # '1e17' + # Unfortunately, this is not a valid float representation according + # to the definition of the `!!float` tag. We fix this by adding + # '.0' before the 'e' symbol. + if '.' 
not in value and 'e' in value: + value = value.replace('e', '.0e', 1) + return self.represent_scalar('tag:yaml.org,2002:float', value) + + def represent_list(self, data): + #pairs = (len(data) > 0 and isinstance(data, list)) + #if pairs: + # for item in data: + # if not isinstance(item, tuple) or len(item) != 2: + # pairs = False + # break + #if not pairs: + return self.represent_sequence('tag:yaml.org,2002:seq', data) + #value = [] + #for item_key, item_value in data: + # value.append(self.represent_mapping(u'tag:yaml.org,2002:map', + # [(item_key, item_value)])) + #return SequenceNode(u'tag:yaml.org,2002:pairs', value) + + def represent_dict(self, data): + return self.represent_mapping('tag:yaml.org,2002:map', data) + + def represent_set(self, data): + value = {} + for key in data: + value[key] = None + return self.represent_mapping('tag:yaml.org,2002:set', value) + + def represent_date(self, data): + value = data.isoformat() + return self.represent_scalar('tag:yaml.org,2002:timestamp', value) + + def represent_datetime(self, data): + value = data.isoformat(' ') + return self.represent_scalar('tag:yaml.org,2002:timestamp', value) + + def represent_yaml_object(self, tag, data, cls, flow_style=None): + if hasattr(data, '__getstate__'): + state = data.__getstate__() + else: + state = data.__dict__.copy() + return self.represent_mapping(tag, state, flow_style=flow_style) + + def represent_undefined(self, data): + raise RepresenterError("cannot represent an object", data) + +SafeRepresenter.add_representer(type(None), + SafeRepresenter.represent_none) + +SafeRepresenter.add_representer(str, + SafeRepresenter.represent_str) + +SafeRepresenter.add_representer(bytes, + SafeRepresenter.represent_binary) + +SafeRepresenter.add_representer(bool, + SafeRepresenter.represent_bool) + +SafeRepresenter.add_representer(int, + SafeRepresenter.represent_int) + +SafeRepresenter.add_representer(float, + SafeRepresenter.represent_float) + +SafeRepresenter.add_representer(list, + SafeRepresenter.represent_list) + +SafeRepresenter.add_representer(tuple, + SafeRepresenter.represent_list) + +SafeRepresenter.add_representer(dict, + SafeRepresenter.represent_dict) + +SafeRepresenter.add_representer(set, + SafeRepresenter.represent_set) + +SafeRepresenter.add_representer(datetime.date, + SafeRepresenter.represent_date) + +SafeRepresenter.add_representer(datetime.datetime, + SafeRepresenter.represent_datetime) + +SafeRepresenter.add_representer(None, + SafeRepresenter.represent_undefined) + +class Representer(SafeRepresenter): + + def represent_complex(self, data): + if data.imag == 0.0: + data = '%r' % data.real + elif data.real == 0.0: + data = '%rj' % data.imag + elif data.imag > 0: + data = '%r+%rj' % (data.real, data.imag) + else: + data = '%r%rj' % (data.real, data.imag) + return self.represent_scalar('tag:yaml.org,2002:python/complex', data) + + def represent_tuple(self, data): + return self.represent_sequence('tag:yaml.org,2002:python/tuple', data) + + def represent_name(self, data): + name = '%s.%s' % (data.__module__, data.__name__) + return self.represent_scalar('tag:yaml.org,2002:python/name:'+name, '') + + def represent_module(self, data): + return self.represent_scalar( + 'tag:yaml.org,2002:python/module:'+data.__name__, '') + + def represent_object(self, data): + # We use __reduce__ API to save the data. 
data.__reduce__ returns
+ # a tuple of length 2-5:
+ # (function, args, state, listitems, dictitems)
+
+ # For reconstructing, we call function(*args), then set its state,
+ # listitems, and dictitems if they are not None.
+
+ # A special case is when function.__name__ == '__newobj__'. In this
+ # case we create the object with args[0].__new__(*args).
+
+ # Another special case is when __reduce__ returns a string - we don't
+ # support it.
+
+ # We produce a !!python/object, !!python/object/new or
+ # !!python/object/apply node.
+
+ cls = type(data)
+ if cls in copyreg.dispatch_table:
+ reduce = copyreg.dispatch_table[cls](data)
+ elif hasattr(data, '__reduce_ex__'):
+ reduce = data.__reduce_ex__(2)
+ elif hasattr(data, '__reduce__'):
+ reduce = data.__reduce__()
+ else:
+ raise RepresenterError("cannot represent an object", data)
+ reduce = (list(reduce)+[None]*5)[:5]
+ function, args, state, listitems, dictitems = reduce
+ args = list(args)
+ if state is None:
+ state = {}
+ if listitems is not None:
+ listitems = list(listitems)
+ if dictitems is not None:
+ dictitems = dict(dictitems)
+ if function.__name__ == '__newobj__':
+ function = args[0]
+ args = args[1:]
+ tag = 'tag:yaml.org,2002:python/object/new:'
+ newobj = True
+ else:
+ tag = 'tag:yaml.org,2002:python/object/apply:'
+ newobj = False
+ function_name = '%s.%s' % (function.__module__, function.__name__)
+ if not args and not listitems and not dictitems \
+ and isinstance(state, dict) and newobj:
+ return self.represent_mapping(
+ 'tag:yaml.org,2002:python/object:'+function_name, state)
+ if not listitems and not dictitems \
+ and isinstance(state, dict) and not state:
+ return self.represent_sequence(tag+function_name, args)
+ value = {}
+ if args:
+ value['args'] = args
+ if state or not isinstance(state, dict):
+ value['state'] = state
+ if listitems:
+ value['listitems'] = listitems
+ if dictitems:
+ value['dictitems'] = dictitems
+ return self.represent_mapping(tag+function_name, value)
+
+ def represent_ordered_dict(self, data):
+ # Provide uniform representation across different Python versions.
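+ # Editorial note: the mapping is emitted as
+ # !!python/object/apply:collections.OrderedDict [[[key, value], ...]]
+ # rather than as a plain !!map, so insertion order survives a
+ # dump/load round trip.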
+ data_type = type(data) + tag = 'tag:yaml.org,2002:python/object/apply:%s.%s' \ + % (data_type.__module__, data_type.__name__) + items = [[key, value] for key, value in data.items()] + return self.represent_sequence(tag, [items]) + +Representer.add_representer(complex, + Representer.represent_complex) + +Representer.add_representer(tuple, + Representer.represent_tuple) + +Representer.add_representer(type, + Representer.represent_name) + +Representer.add_representer(collections.OrderedDict, + Representer.represent_ordered_dict) + +Representer.add_representer(types.FunctionType, + Representer.represent_name) + +Representer.add_representer(types.BuiltinFunctionType, + Representer.represent_name) + +Representer.add_representer(types.ModuleType, + Representer.represent_module) + +Representer.add_multi_representer(object, + Representer.represent_object) + diff --git a/libs3/yaml/resolver.py b/libs3/yaml/resolver.py new file mode 100644 index 000000000..02b82e73e --- /dev/null +++ b/libs3/yaml/resolver.py @@ -0,0 +1,227 @@ + +__all__ = ['BaseResolver', 'Resolver'] + +from .error import * +from .nodes import * + +import re + +class ResolverError(YAMLError): + pass + +class BaseResolver: + + DEFAULT_SCALAR_TAG = 'tag:yaml.org,2002:str' + DEFAULT_SEQUENCE_TAG = 'tag:yaml.org,2002:seq' + DEFAULT_MAPPING_TAG = 'tag:yaml.org,2002:map' + + yaml_implicit_resolvers = {} + yaml_path_resolvers = {} + + def __init__(self): + self.resolver_exact_paths = [] + self.resolver_prefix_paths = [] + + @classmethod + def add_implicit_resolver(cls, tag, regexp, first): + if not 'yaml_implicit_resolvers' in cls.__dict__: + implicit_resolvers = {} + for key in cls.yaml_implicit_resolvers: + implicit_resolvers[key] = cls.yaml_implicit_resolvers[key][:] + cls.yaml_implicit_resolvers = implicit_resolvers + if first is None: + first = [None] + for ch in first: + cls.yaml_implicit_resolvers.setdefault(ch, []).append((tag, regexp)) + + @classmethod + def add_path_resolver(cls, tag, path, kind=None): + # Note: `add_path_resolver` is experimental. The API could be changed. + # `new_path` is a pattern that is matched against the path from the + # root to the node that is being considered. `node_path` elements are + # tuples `(node_check, index_check)`. `node_check` is a node class: + # `ScalarNode`, `SequenceNode`, `MappingNode` or `None`. `None` + # matches any kind of a node. `index_check` could be `None`, a boolean + # value, a string value, or a number. `None` and `False` match against + # any _value_ of sequence and mapping nodes. `True` matches against + # any _key_ of a mapping node. A string `index_check` matches against + # a mapping value that corresponds to a scalar key which content is + # equal to the `index_check` value. An integer `index_check` matches + # against a sequence value with the index equal to `index_check`. 
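+ # Editorial example (hypothetical tag): add_path_resolver('!cfg', [], dict)
+ # registers an empty path, so a document whose root node is a mapping
+ # resolves to '!cfg' instead of tag:yaml.org,2002:map.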
+ if not 'yaml_path_resolvers' in cls.__dict__: + cls.yaml_path_resolvers = cls.yaml_path_resolvers.copy() + new_path = [] + for element in path: + if isinstance(element, (list, tuple)): + if len(element) == 2: + node_check, index_check = element + elif len(element) == 1: + node_check = element[0] + index_check = True + else: + raise ResolverError("Invalid path element: %s" % element) + else: + node_check = None + index_check = element + if node_check is str: + node_check = ScalarNode + elif node_check is list: + node_check = SequenceNode + elif node_check is dict: + node_check = MappingNode + elif node_check not in [ScalarNode, SequenceNode, MappingNode] \ + and not isinstance(node_check, str) \ + and node_check is not None: + raise ResolverError("Invalid node checker: %s" % node_check) + if not isinstance(index_check, (str, int)) \ + and index_check is not None: + raise ResolverError("Invalid index checker: %s" % index_check) + new_path.append((node_check, index_check)) + if kind is str: + kind = ScalarNode + elif kind is list: + kind = SequenceNode + elif kind is dict: + kind = MappingNode + elif kind not in [ScalarNode, SequenceNode, MappingNode] \ + and kind is not None: + raise ResolverError("Invalid node kind: %s" % kind) + cls.yaml_path_resolvers[tuple(new_path), kind] = tag + + def descend_resolver(self, current_node, current_index): + if not self.yaml_path_resolvers: + return + exact_paths = {} + prefix_paths = [] + if current_node: + depth = len(self.resolver_prefix_paths) + for path, kind in self.resolver_prefix_paths[-1]: + if self.check_resolver_prefix(depth, path, kind, + current_node, current_index): + if len(path) > depth: + prefix_paths.append((path, kind)) + else: + exact_paths[kind] = self.yaml_path_resolvers[path, kind] + else: + for path, kind in self.yaml_path_resolvers: + if not path: + exact_paths[kind] = self.yaml_path_resolvers[path, kind] + else: + prefix_paths.append((path, kind)) + self.resolver_exact_paths.append(exact_paths) + self.resolver_prefix_paths.append(prefix_paths) + + def ascend_resolver(self): + if not self.yaml_path_resolvers: + return + self.resolver_exact_paths.pop() + self.resolver_prefix_paths.pop() + + def check_resolver_prefix(self, depth, path, kind, + current_node, current_index): + node_check, index_check = path[depth-1] + if isinstance(node_check, str): + if current_node.tag != node_check: + return + elif node_check is not None: + if not isinstance(current_node, node_check): + return + if index_check is True and current_index is not None: + return + if (index_check is False or index_check is None) \ + and current_index is None: + return + if isinstance(index_check, str): + if not (isinstance(current_index, ScalarNode) + and index_check == current_index.value): + return + elif isinstance(index_check, int) and not isinstance(index_check, bool): + if index_check != current_index: + return + return True + + def resolve(self, kind, value, implicit): + if kind is ScalarNode and implicit[0]: + if value == '': + resolvers = self.yaml_implicit_resolvers.get('', []) + else: + resolvers = self.yaml_implicit_resolvers.get(value[0], []) + resolvers += self.yaml_implicit_resolvers.get(None, []) + for tag, regexp in resolvers: + if regexp.match(value): + return tag + implicit = implicit[1] + if self.yaml_path_resolvers: + exact_paths = self.resolver_exact_paths[-1] + if kind in exact_paths: + return exact_paths[kind] + if None in exact_paths: + return exact_paths[None] + if kind is ScalarNode: + return self.DEFAULT_SCALAR_TAG + elif kind is 
SequenceNode: + return self.DEFAULT_SEQUENCE_TAG + elif kind is MappingNode: + return self.DEFAULT_MAPPING_TAG + +class Resolver(BaseResolver): + pass + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:bool', + re.compile(r'''^(?:yes|Yes|YES|no|No|NO + |true|True|TRUE|false|False|FALSE + |on|On|ON|off|Off|OFF)$''', re.X), + list('yYnNtTfFoO')) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:float', + re.compile(r'''^(?:[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+][0-9]+)? + |\.[0-9_]+(?:[eE][-+][0-9]+)? + |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\.[0-9_]* + |[-+]?\.(?:inf|Inf|INF) + |\.(?:nan|NaN|NAN))$''', re.X), + list('-+0123456789.')) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:int', + re.compile(r'''^(?:[-+]?0b[0-1_]+ + |[-+]?0[0-7_]+ + |[-+]?(?:0|[1-9][0-9_]*) + |[-+]?0x[0-9a-fA-F_]+ + |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X), + list('-+0123456789')) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:merge', + re.compile(r'^(?:<<)$'), + ['<']) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:null', + re.compile(r'''^(?: ~ + |null|Null|NULL + | )$''', re.X), + ['~', 'n', 'N', '']) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:timestamp', + re.compile(r'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] + |[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]? + (?:[Tt]|[ \t]+)[0-9][0-9]? + :[0-9][0-9] :[0-9][0-9] (?:\.[0-9]*)? + (?:[ \t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X), + list('0123456789')) + +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:value', + re.compile(r'^(?:=)$'), + ['=']) + +# The following resolver is only for documentation purposes. It cannot work +# because plain scalars cannot start with '!', '&', or '*'. +Resolver.add_implicit_resolver( + 'tag:yaml.org,2002:yaml', + re.compile(r'^(?:!|&|\*)$'), + list('!&*')) + diff --git a/libs3/yaml/scanner.py b/libs3/yaml/scanner.py new file mode 100644 index 000000000..775dbcc6d --- /dev/null +++ b/libs3/yaml/scanner.py @@ -0,0 +1,1435 @@ + +# Scanner produces tokens of the following types: +# STREAM-START +# STREAM-END +# DIRECTIVE(name, value) +# DOCUMENT-START +# DOCUMENT-END +# BLOCK-SEQUENCE-START +# BLOCK-MAPPING-START +# BLOCK-END +# FLOW-SEQUENCE-START +# FLOW-MAPPING-START +# FLOW-SEQUENCE-END +# FLOW-MAPPING-END +# BLOCK-ENTRY +# FLOW-ENTRY +# KEY +# VALUE +# ALIAS(value) +# ANCHOR(value) +# TAG(value) +# SCALAR(value, plain, style) +# +# Read comments in the Scanner code for more details. +# + +__all__ = ['Scanner', 'ScannerError'] + +from .error import MarkedYAMLError +from .tokens import * + +class ScannerError(MarkedYAMLError): + pass + +class SimpleKey: + # See below simple keys treatment. + + def __init__(self, token_number, required, index, line, column, mark): + self.token_number = token_number + self.required = required + self.index = index + self.line = line + self.column = column + self.mark = mark + +class Scanner: + + def __init__(self): + """Initialize the scanner.""" + # It is assumed that Scanner and Reader will have a common descendant. + # Reader do the dirty work of checking for BOM and converting the + # input data to Unicode. It also adds NUL to the end. + # + # Reader supports the following methods + # self.peek(i=0) # peek the next i-th character + # self.prefix(l=1) # peek the next l characters + # self.forward(l=1) # read the next l characters and move the pointer. + + # Had we reached the end of the stream? + self.done = False + + # The number of unclosed '{' and '['. `flow_level == 0` means block + # context. 
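+ # Editorial note: while scanning "a: [1, {b: 2}]" the scanner reaches
+ # flow_level 2 inside the inner mapping and drops back to 0 after ']'.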
+ self.flow_level = 0 + + # List of processed tokens that are not yet emitted. + self.tokens = [] + + # Add the STREAM-START token. + self.fetch_stream_start() + + # Number of tokens that were emitted through the `get_token` method. + self.tokens_taken = 0 + + # The current indentation level. + self.indent = -1 + + # Past indentation levels. + self.indents = [] + + # Variables related to simple keys treatment. + + # A simple key is a key that is not denoted by the '?' indicator. + # Example of simple keys: + # --- + # block simple key: value + # ? not a simple key: + # : { flow simple key: value } + # We emit the KEY token before all keys, so when we find a potential + # simple key, we try to locate the corresponding ':' indicator. + # Simple keys should be limited to a single line and 1024 characters. + + # Can a simple key start at the current position? A simple key may + # start: + # - at the beginning of the line, not counting indentation spaces + # (in block context), + # - after '{', '[', ',' (in the flow context), + # - after '?', ':', '-' (in the block context). + # In the block context, this flag also signifies if a block collection + # may start at the current position. + self.allow_simple_key = True + + # Keep track of possible simple keys. This is a dictionary. The key + # is `flow_level`; there can be no more that one possible simple key + # for each level. The value is a SimpleKey record: + # (token_number, required, index, line, column, mark) + # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow), + # '[', or '{' tokens. + self.possible_simple_keys = {} + + # Public methods. + + def check_token(self, *choices): + # Check if the next token is one of the given types. + while self.need_more_tokens(): + self.fetch_more_tokens() + if self.tokens: + if not choices: + return True + for choice in choices: + if isinstance(self.tokens[0], choice): + return True + return False + + def peek_token(self): + # Return the next token, but do not delete if from the queue. + # Return None if no more tokens. + while self.need_more_tokens(): + self.fetch_more_tokens() + if self.tokens: + return self.tokens[0] + else: + return None + + def get_token(self): + # Return the next token. + while self.need_more_tokens(): + self.fetch_more_tokens() + if self.tokens: + self.tokens_taken += 1 + return self.tokens.pop(0) + + # Private methods. + + def need_more_tokens(self): + if self.done: + return False + if not self.tokens: + return True + # The current token may be a potential simple key, so we + # need to look further. + self.stale_possible_simple_keys() + if self.next_possible_simple_key() == self.tokens_taken: + return True + + def fetch_more_tokens(self): + + # Eat whitespaces and comments until we reach the next token. + self.scan_to_next_token() + + # Remove obsolete possible simple keys. + self.stale_possible_simple_keys() + + # Compare the current indentation and column. It may add some tokens + # and decrease the current indentation level. + self.unwind_indent(self.column) + + # Peek the next character. + ch = self.peek() + + # Is it the end of stream? + if ch == '\0': + return self.fetch_stream_end() + + # Is it a directive? + if ch == '%' and self.check_directive(): + return self.fetch_directive() + + # Is it the document start? + if ch == '-' and self.check_document_start(): + return self.fetch_document_start() + + # Is it the document end? + if ch == '.' and self.check_document_end(): + return self.fetch_document_end() + + # TODO: support for BOM within a stream. 
+ #if ch == '\uFEFF': + # return self.fetch_bom() <-- issue BOMToken + + # Note: the order of the following checks is NOT significant. + + # Is it the flow sequence start indicator? + if ch == '[': + return self.fetch_flow_sequence_start() + + # Is it the flow mapping start indicator? + if ch == '{': + return self.fetch_flow_mapping_start() + + # Is it the flow sequence end indicator? + if ch == ']': + return self.fetch_flow_sequence_end() + + # Is it the flow mapping end indicator? + if ch == '}': + return self.fetch_flow_mapping_end() + + # Is it the flow entry indicator? + if ch == ',': + return self.fetch_flow_entry() + + # Is it the block entry indicator? + if ch == '-' and self.check_block_entry(): + return self.fetch_block_entry() + + # Is it the key indicator? + if ch == '?' and self.check_key(): + return self.fetch_key() + + # Is it the value indicator? + if ch == ':' and self.check_value(): + return self.fetch_value() + + # Is it an alias? + if ch == '*': + return self.fetch_alias() + + # Is it an anchor? + if ch == '&': + return self.fetch_anchor() + + # Is it a tag? + if ch == '!': + return self.fetch_tag() + + # Is it a literal scalar? + if ch == '|' and not self.flow_level: + return self.fetch_literal() + + # Is it a folded scalar? + if ch == '>' and not self.flow_level: + return self.fetch_folded() + + # Is it a single quoted scalar? + if ch == '\'': + return self.fetch_single() + + # Is it a double quoted scalar? + if ch == '\"': + return self.fetch_double() + + # It must be a plain scalar then. + if self.check_plain(): + return self.fetch_plain() + + # No? It's an error. Let's produce a nice error message. + raise ScannerError("while scanning for the next token", None, + "found character %r that cannot start any token" % ch, + self.get_mark()) + + # Simple keys treatment. + + def next_possible_simple_key(self): + # Return the number of the nearest possible simple key. Actually we + # don't need to loop through the whole dictionary. We may replace it + # with the following code: + # if not self.possible_simple_keys: + # return None + # return self.possible_simple_keys[ + # min(self.possible_simple_keys.keys())].token_number + min_token_number = None + for level in self.possible_simple_keys: + key = self.possible_simple_keys[level] + if min_token_number is None or key.token_number < min_token_number: + min_token_number = key.token_number + return min_token_number + + def stale_possible_simple_keys(self): + # Remove entries that are no longer possible simple keys. According to + # the YAML specification, simple keys + # - should be limited to a single line, + # - should be no longer than 1024 characters. + # Disabling this procedure will allow simple keys of any length and + # height (may cause problems if indentation is broken though). + for level in list(self.possible_simple_keys): + key = self.possible_simple_keys[level] + if key.line != self.line \ + or self.index-key.index > 1024: + if key.required: + raise ScannerError("while scanning a simple key", key.mark, + "could not find expected ':'", self.get_mark()) + del self.possible_simple_keys[level] + + def save_possible_simple_key(self): + # The next token may start a simple key. We check if it's possible + # and save its position. This function is called for + # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. + + # Check if a simple key is required at the current position. + required = not self.flow_level and self.indent == self.column + + # The next token might be a simple key. Let's save it's number and + # position. 
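+ # Editorial note: the save is speculative. For "a: b" the scanner
+ # records 'a' here; fetch_value() later turns that record into a KEY
+ # token retroactively when it meets the ':'.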
+ if self.allow_simple_key:
+ self.remove_possible_simple_key()
+ token_number = self.tokens_taken+len(self.tokens)
+ key = SimpleKey(token_number, required,
+ self.index, self.line, self.column, self.get_mark())
+ self.possible_simple_keys[self.flow_level] = key
+
+ def remove_possible_simple_key(self):
+ # Remove the saved possible key position at the current flow level.
+ if self.flow_level in self.possible_simple_keys:
+ key = self.possible_simple_keys[self.flow_level]
+
+ if key.required:
+ raise ScannerError("while scanning a simple key", key.mark,
+ "could not find expected ':'", self.get_mark())
+
+ del self.possible_simple_keys[self.flow_level]
+
+ # Indentation functions.
+
+ def unwind_indent(self, column):
+
+ ## In flow context, tokens should respect indentation.
+ ## Actually the condition should be `self.indent >= column` according to
+ ## the spec. But this condition will prohibit intuitively correct
+ ## constructions such as
+ ## key : {
+ ## }
+ #if self.flow_level and self.indent > column:
+ # raise ScannerError(None, None,
+ # "invalid indentation or unclosed '[' or '{'",
+ # self.get_mark())
+
+ # In the flow context, indentation is ignored. We make the scanner less
+ # restrictive than the specification requires.
+ if self.flow_level:
+ return
+
+ # In block context, we may need to issue the BLOCK-END tokens.
+ while self.indent > column:
+ mark = self.get_mark()
+ self.indent = self.indents.pop()
+ self.tokens.append(BlockEndToken(mark, mark))
+
+ def add_indent(self, column):
+ # Check if we need to increase indentation.
+ if self.indent < column:
+ self.indents.append(self.indent)
+ self.indent = column
+ return True
+ return False
+
+ # Fetchers.
+
+ def fetch_stream_start(self):
+ # We always add STREAM-START as the first token and STREAM-END as the
+ # last token.
+
+ # Read the token.
+ mark = self.get_mark()
+
+ # Add STREAM-START.
+ self.tokens.append(StreamStartToken(mark, mark,
+ encoding=self.encoding))
+
+
+ def fetch_stream_end(self):
+
+ # Set the current indentation to -1.
+ self.unwind_indent(-1)
+
+ # Reset simple keys.
+ self.remove_possible_simple_key()
+ self.allow_simple_key = False
+ self.possible_simple_keys = {}
+
+ # Read the token.
+ mark = self.get_mark()
+
+ # Add STREAM-END.
+ self.tokens.append(StreamEndToken(mark, mark))
+
+ # The stream is finished.
+ self.done = True
+
+ def fetch_directive(self):
+
+ # Set the current indentation to -1.
+ self.unwind_indent(-1)
+
+ # Reset simple keys.
+ self.remove_possible_simple_key()
+ self.allow_simple_key = False
+
+ # Scan and add DIRECTIVE.
+ self.tokens.append(self.scan_directive())
+
+ def fetch_document_start(self):
+ self.fetch_document_indicator(DocumentStartToken)
+
+ def fetch_document_end(self):
+ self.fetch_document_indicator(DocumentEndToken)
+
+ def fetch_document_indicator(self, TokenClass):
+
+ # Set the current indentation to -1.
+ self.unwind_indent(-1)
+
+ # Reset simple keys. Note that there cannot be a block collection
+ # after '---'.
+ self.remove_possible_simple_key()
+ self.allow_simple_key = False
+
+ # Add DOCUMENT-START or DOCUMENT-END.
+ start_mark = self.get_mark()
+ self.forward(3)
+ end_mark = self.get_mark()
+ self.tokens.append(TokenClass(start_mark, end_mark))
+
+ def fetch_flow_sequence_start(self):
+ self.fetch_flow_collection_start(FlowSequenceStartToken)
+
+ def fetch_flow_mapping_start(self):
+ self.fetch_flow_collection_start(FlowMappingStartToken)
+
+ def fetch_flow_collection_start(self, TokenClass):
+
+ # '[' and '{' may start a simple key.
+ self.save_possible_simple_key()
+
+ # Increase the flow level.
+ self.flow_level += 1
+
+ # Simple keys are allowed after '[' and '{'.
+ self.allow_simple_key = True
+
+ # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
+ start_mark = self.get_mark()
+ self.forward()
+ end_mark = self.get_mark()
+ self.tokens.append(TokenClass(start_mark, end_mark))
+
+ def fetch_flow_sequence_end(self):
+ self.fetch_flow_collection_end(FlowSequenceEndToken)
+
+ def fetch_flow_mapping_end(self):
+ self.fetch_flow_collection_end(FlowMappingEndToken)
+
+ def fetch_flow_collection_end(self, TokenClass):
+
+ # Reset possible simple key on the current level.
+ self.remove_possible_simple_key()
+
+ # Decrease the flow level.
+ self.flow_level -= 1
+
+ # No simple keys after ']' or '}'.
+ self.allow_simple_key = False
+
+ # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
+ start_mark = self.get_mark()
+ self.forward()
+ end_mark = self.get_mark()
+ self.tokens.append(TokenClass(start_mark, end_mark))
+
+ def fetch_flow_entry(self):
+
+ # Simple keys are allowed after ','.
+ self.allow_simple_key = True
+
+ # Reset possible simple key on the current level.
+ self.remove_possible_simple_key()
+
+ # Add FLOW-ENTRY.
+ start_mark = self.get_mark()
+ self.forward()
+ end_mark = self.get_mark()
+ self.tokens.append(FlowEntryToken(start_mark, end_mark))
+
+ def fetch_block_entry(self):
+
+ # Block context needs additional checks.
+ if not self.flow_level:
+
+ # Are we allowed to start a new entry?
+ if not self.allow_simple_key:
+ raise ScannerError(None, None,
+ "sequence entries are not allowed here",
+ self.get_mark())
+
+ # We may need to add BLOCK-SEQUENCE-START.
+ if self.add_indent(self.column):
+ mark = self.get_mark()
+ self.tokens.append(BlockSequenceStartToken(mark, mark))
+
+ # It's an error for the block entry to occur in the flow context,
+ # but we let the parser detect this.
+ else:
+ pass
+
+ # Simple keys are allowed after '-'.
+ self.allow_simple_key = True
+
+ # Reset possible simple key on the current level.
+ self.remove_possible_simple_key()
+
+ # Add BLOCK-ENTRY.
+ start_mark = self.get_mark()
+ self.forward()
+ end_mark = self.get_mark()
+ self.tokens.append(BlockEntryToken(start_mark, end_mark))
+
+ def fetch_key(self):
+
+ # Block context needs additional checks.
+ if not self.flow_level:
+
+ # Are we allowed to start a key (not necessarily a simple one)?
+ if not self.allow_simple_key:
+ raise ScannerError(None, None,
+ "mapping keys are not allowed here",
+ self.get_mark())
+
+ # We may need to add BLOCK-MAPPING-START.
+ if self.add_indent(self.column):
+ mark = self.get_mark()
+ self.tokens.append(BlockMappingStartToken(mark, mark))
+
+ # Simple keys are allowed after '?' in the block context.
+ self.allow_simple_key = not self.flow_level
+
+ # Reset possible simple key on the current level.
+ self.remove_possible_simple_key()
+
+ # Add KEY.
+ start_mark = self.get_mark()
+ self.forward()
+ end_mark = self.get_mark()
+ self.tokens.append(KeyToken(start_mark, end_mark))
+
+ def fetch_value(self):
+
+ # Do we have a pending simple key at the current flow level?
+ if self.flow_level in self.possible_simple_keys:
+
+ # Add KEY.
+ key = self.possible_simple_keys[self.flow_level]
+ del self.possible_simple_keys[self.flow_level]
+ self.tokens.insert(key.token_number-self.tokens_taken,
+ KeyToken(key.mark, key.mark))
+
+ # If this key starts a new block mapping, we need to add
+ # BLOCK-MAPPING-START.
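+ # (Editor's note, illustrative only, not from upstream PyYAML: for
+ # the document "a: 1" this yields, between STREAM-START and
+ # STREAM-END, the tokens BLOCK-MAPPING-START, KEY, SCALAR(a), VALUE,
+ # SCALAR(1), BLOCK-END.)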
+ if not self.flow_level: + if self.add_indent(key.column): + self.tokens.insert(key.token_number-self.tokens_taken, + BlockMappingStartToken(key.mark, key.mark)) + + # There cannot be two simple keys one after another. + self.allow_simple_key = False + + # It must be a part of a complex key. + else: + + # Block context needs additional checks. + # (Do we really need them? They will be caught by the parser + # anyway.) + if not self.flow_level: + + # We are allowed to start a complex value if and only if + # we can start a simple key. + if not self.allow_simple_key: + raise ScannerError(None, None, + "mapping values are not allowed here", + self.get_mark()) + + # If this value starts a new block mapping, we need to add + # BLOCK-MAPPING-START. It will be detected as an error later by + # the parser. + if not self.flow_level: + if self.add_indent(self.column): + mark = self.get_mark() + self.tokens.append(BlockMappingStartToken(mark, mark)) + + # Simple keys are allowed after ':' in the block context. + self.allow_simple_key = not self.flow_level + + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + + # Add VALUE. + start_mark = self.get_mark() + self.forward() + end_mark = self.get_mark() + self.tokens.append(ValueToken(start_mark, end_mark)) + + def fetch_alias(self): + + # ALIAS could be a simple key. + self.save_possible_simple_key() + + # No simple keys after ALIAS. + self.allow_simple_key = False + + # Scan and add ALIAS. + self.tokens.append(self.scan_anchor(AliasToken)) + + def fetch_anchor(self): + + # ANCHOR could start a simple key. + self.save_possible_simple_key() + + # No simple keys after ANCHOR. + self.allow_simple_key = False + + # Scan and add ANCHOR. + self.tokens.append(self.scan_anchor(AnchorToken)) + + def fetch_tag(self): + + # TAG could start a simple key. + self.save_possible_simple_key() + + # No simple keys after TAG. + self.allow_simple_key = False + + # Scan and add TAG. + self.tokens.append(self.scan_tag()) + + def fetch_literal(self): + self.fetch_block_scalar(style='|') + + def fetch_folded(self): + self.fetch_block_scalar(style='>') + + def fetch_block_scalar(self, style): + + # A simple key may follow a block scalar. + self.allow_simple_key = True + + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + + # Scan and add SCALAR. + self.tokens.append(self.scan_block_scalar(style)) + + def fetch_single(self): + self.fetch_flow_scalar(style='\'') + + def fetch_double(self): + self.fetch_flow_scalar(style='"') + + def fetch_flow_scalar(self, style): + + # A flow scalar could be a simple key. + self.save_possible_simple_key() + + # No simple keys after flow scalars. + self.allow_simple_key = False + + # Scan and add SCALAR. + self.tokens.append(self.scan_flow_scalar(style)) + + def fetch_plain(self): + + # A plain scalar could be a simple key. + self.save_possible_simple_key() + + # No simple keys after plain scalars. But note that `scan_plain` will + # change this flag if the scan is finished at the beginning of the + # line. + self.allow_simple_key = False + + # Scan and add SCALAR. May change `allow_simple_key`. + self.tokens.append(self.scan_plain()) + + # Checkers. + + def check_directive(self): + + # DIRECTIVE: ^ '%' ... + # The '%' indicator is already checked. 
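+ # (Editor's note, illustrative only, not from upstream PyYAML:
+ # "%YAML 1.1" or "%TAG !e! tag:example.com,2000:app/" count as
+ # directives only when the '%' sits in column 0.)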
+ if self.column == 0: + return True + + def check_document_start(self): + + # DOCUMENT-START: ^ '---' (' '|'\n') + if self.column == 0: + if self.prefix(3) == '---' \ + and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': + return True + + def check_document_end(self): + + # DOCUMENT-END: ^ '...' (' '|'\n') + if self.column == 0: + if self.prefix(3) == '...' \ + and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': + return True + + def check_block_entry(self): + + # BLOCK-ENTRY: '-' (' '|'\n') + return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' + + def check_key(self): + + # KEY(flow context): '?' + if self.flow_level: + return True + + # KEY(block context): '?' (' '|'\n') + else: + return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' + + def check_value(self): + + # VALUE(flow context): ':' + if self.flow_level: + return True + + # VALUE(block context): ':' (' '|'\n') + else: + return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029' + + def check_plain(self): + + # A plain scalar may start with any non-space character except: + # '-', '?', ':', ',', '[', ']', '{', '}', + # '#', '&', '*', '!', '|', '>', '\'', '\"', + # '%', '@', '`'. + # + # It may also start with + # '-', '?', ':' + # if it is followed by a non-space character. + # + # Note that we limit the last rule to the block context (except the + # '-' character) because we want the flow context to be space + # independent. + ch = self.peek() + return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \ + or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029' + and (ch == '-' or (not self.flow_level and ch in '?:'))) + + # Scanners. + + def scan_to_next_token(self): + # We ignore spaces, line breaks and comments. + # If we find a line break in the block context, we set the flag + # `allow_simple_key` on. + # The byte order mark is stripped if it's the first character in the + # stream. We do not yet support BOM inside the stream as the + # specification requires. Any such mark will be considered as a part + # of the document. + # + # TODO: We need to make tab handling rules more sane. A good rule is + # Tabs cannot precede tokens + # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, + # KEY(block), VALUE(block), BLOCK-ENTRY + # So the checking code is + # if <TAB>: + # self.allow_simple_keys = False + # We also need to add the check for `allow_simple_keys == True` to + # `unwind_indent` before issuing BLOCK-END. + # Scanners for block, flow, and plain scalars need to be modified. + + if self.index == 0 and self.peek() == '\uFEFF': + self.forward() + found = False + while not found: + while self.peek() == ' ': + self.forward() + if self.peek() == '#': + while self.peek() not in '\0\r\n\x85\u2028\u2029': + self.forward() + if self.scan_line_break(): + if not self.flow_level: + self.allow_simple_key = True + else: + found = True + + def scan_directive(self): + # See the specification for details. + start_mark = self.get_mark() + self.forward() + name = self.scan_directive_name(start_mark) + value = None + if name == 'YAML': + value = self.scan_yaml_directive_value(start_mark) + end_mark = self.get_mark() + elif name == 'TAG': + value = self.scan_tag_directive_value(start_mark) + end_mark = self.get_mark() + else: + end_mark = self.get_mark() + while self.peek() not in '\0\r\n\x85\u2028\u2029': + self.forward() + self.scan_directive_ignored_line(start_mark) + return DirectiveToken(name, value, start_mark, end_mark) + + def scan_directive_name(self, start_mark): + # See the specification for details. 
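+ # (Editor's note, illustrative only, not from upstream PyYAML: for
+ # "%YAML 1.1" this consumes and returns the name "YAML".)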
+ length = 0 + ch = self.peek(length) + while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ + or ch in '-_': + length += 1 + ch = self.peek(length) + if not length: + raise ScannerError("while scanning a directive", start_mark, + "expected alphabetic or numeric character, but found %r" + % ch, self.get_mark()) + value = self.prefix(length) + self.forward(length) + ch = self.peek() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError("while scanning a directive", start_mark, + "expected alphabetic or numeric character, but found %r" + % ch, self.get_mark()) + return value + + def scan_yaml_directive_value(self, start_mark): + # See the specification for details. + while self.peek() == ' ': + self.forward() + major = self.scan_yaml_directive_number(start_mark) + if self.peek() != '.': + raise ScannerError("while scanning a directive", start_mark, + "expected a digit or '.', but found %r" % self.peek(), + self.get_mark()) + self.forward() + minor = self.scan_yaml_directive_number(start_mark) + if self.peek() not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError("while scanning a directive", start_mark, + "expected a digit or ' ', but found %r" % self.peek(), + self.get_mark()) + return (major, minor) + + def scan_yaml_directive_number(self, start_mark): + # See the specification for details. + ch = self.peek() + if not ('0' <= ch <= '9'): + raise ScannerError("while scanning a directive", start_mark, + "expected a digit, but found %r" % ch, self.get_mark()) + length = 0 + while '0' <= self.peek(length) <= '9': + length += 1 + value = int(self.prefix(length)) + self.forward(length) + return value + + def scan_tag_directive_value(self, start_mark): + # See the specification for details. + while self.peek() == ' ': + self.forward() + handle = self.scan_tag_directive_handle(start_mark) + while self.peek() == ' ': + self.forward() + prefix = self.scan_tag_directive_prefix(start_mark) + return (handle, prefix) + + def scan_tag_directive_handle(self, start_mark): + # See the specification for details. + value = self.scan_tag_handle('directive', start_mark) + ch = self.peek() + if ch != ' ': + raise ScannerError("while scanning a directive", start_mark, + "expected ' ', but found %r" % ch, self.get_mark()) + return value + + def scan_tag_directive_prefix(self, start_mark): + # See the specification for details. + value = self.scan_tag_uri('directive', start_mark) + ch = self.peek() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError("while scanning a directive", start_mark, + "expected ' ', but found %r" % ch, self.get_mark()) + return value + + def scan_directive_ignored_line(self, start_mark): + # See the specification for details. + while self.peek() == ' ': + self.forward() + if self.peek() == '#': + while self.peek() not in '\0\r\n\x85\u2028\u2029': + self.forward() + ch = self.peek() + if ch not in '\0\r\n\x85\u2028\u2029': + raise ScannerError("while scanning a directive", start_mark, + "expected a comment or a line break, but found %r" + % ch, self.get_mark()) + self.scan_line_break() + + def scan_anchor(self, TokenClass): + # The specification does not restrict characters for anchors and + # aliases. This may lead to problems, for instance, the document: + # [ *alias, value ] + # can be interpreted in two ways, as + # [ "value" ] + # and + # [ *alias , "value" ] + # Therefore we restrict aliases to numbers and ASCII letters. 
+ start_mark = self.get_mark() + indicator = self.peek() + if indicator == '*': + name = 'alias' + else: + name = 'anchor' + self.forward() + length = 0 + ch = self.peek(length) + while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ + or ch in '-_': + length += 1 + ch = self.peek(length) + if not length: + raise ScannerError("while scanning an %s" % name, start_mark, + "expected alphabetic or numeric character, but found %r" + % ch, self.get_mark()) + value = self.prefix(length) + self.forward(length) + ch = self.peek() + if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`': + raise ScannerError("while scanning an %s" % name, start_mark, + "expected alphabetic or numeric character, but found %r" + % ch, self.get_mark()) + end_mark = self.get_mark() + return TokenClass(value, start_mark, end_mark) + + def scan_tag(self): + # See the specification for details. + start_mark = self.get_mark() + ch = self.peek(1) + if ch == '<': + handle = None + self.forward(2) + suffix = self.scan_tag_uri('tag', start_mark) + if self.peek() != '>': + raise ScannerError("while parsing a tag", start_mark, + "expected '>', but found %r" % self.peek(), + self.get_mark()) + self.forward() + elif ch in '\0 \t\r\n\x85\u2028\u2029': + handle = None + suffix = '!' + self.forward() + else: + length = 1 + use_handle = False + while ch not in '\0 \r\n\x85\u2028\u2029': + if ch == '!': + use_handle = True + break + length += 1 + ch = self.peek(length) + handle = '!' + if use_handle: + handle = self.scan_tag_handle('tag', start_mark) + else: + handle = '!' + self.forward() + suffix = self.scan_tag_uri('tag', start_mark) + ch = self.peek() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError("while scanning a tag", start_mark, + "expected ' ', but found %r" % ch, self.get_mark()) + value = (handle, suffix) + end_mark = self.get_mark() + return TagToken(value, start_mark, end_mark) + + def scan_block_scalar(self, style): + # See the specification for details. + + if style == '>': + folded = True + else: + folded = False + + chunks = [] + start_mark = self.get_mark() + + # Scan the header. + self.forward() + chomping, increment = self.scan_block_scalar_indicators(start_mark) + self.scan_block_scalar_ignored_line(start_mark) + + # Determine the indentation level and go to the first non-empty line. + min_indent = self.indent+1 + if min_indent < 1: + min_indent = 1 + if increment is None: + breaks, max_indent, end_mark = self.scan_block_scalar_indentation() + indent = max(min_indent, max_indent) + else: + indent = min_indent+increment-1 + breaks, end_mark = self.scan_block_scalar_breaks(indent) + line_break = '' + + # Scan the inner part of the block scalar. + while self.column == indent and self.peek() != '\0': + chunks.extend(breaks) + leading_non_space = self.peek() not in ' \t' + length = 0 + while self.peek(length) not in '\0\r\n\x85\u2028\u2029': + length += 1 + chunks.append(self.prefix(length)) + self.forward(length) + line_break = self.scan_line_break() + breaks, end_mark = self.scan_block_scalar_breaks(indent) + if self.column == indent and self.peek() != '\0': + + # Unfortunately, folding rules are ambiguous. 
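+ # (Editor's note, illustrative only, not from upstream PyYAML: in a
+ # folded '>' scalar a single break between two non-space lines
+ # becomes a space, so "a\nb" folds to "a b", while "a\n\nb" keeps a
+ # real line break.)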
+ #
+ # This is the folding according to the specification:
+
+ if folded and line_break == '\n' \
+ and leading_non_space and self.peek() not in ' \t':
+ if not breaks:
+ chunks.append(' ')
+ else:
+ chunks.append(line_break)
+
+ # This is Clark Evans's interpretation (also in the spec
+ # examples):
+ #
+ #if folded and line_break == '\n':
+ # if not breaks:
+ # if self.peek() not in ' \t':
+ # chunks.append(' ')
+ # else:
+ # chunks.append(line_break)
+ #else:
+ # chunks.append(line_break)
+ else:
+ break
+
+ # Chomp the tail.
+ if chomping is not False:
+ chunks.append(line_break)
+ if chomping is True:
+ chunks.extend(breaks)
+
+ # We are done.
+ return ScalarToken(''.join(chunks), False, start_mark, end_mark,
+ style)
+
+ def scan_block_scalar_indicators(self, start_mark):
+ # See the specification for details.
+ chomping = None
+ increment = None
+ ch = self.peek()
+ if ch in '+-':
+ if ch == '+':
+ chomping = True
+ else:
+ chomping = False
+ self.forward()
+ ch = self.peek()
+ if ch in '0123456789':
+ increment = int(ch)
+ if increment == 0:
+ raise ScannerError("while scanning a block scalar", start_mark,
+ "expected indentation indicator in the range 1-9, but found 0",
+ self.get_mark())
+ self.forward()
+ elif ch in '0123456789':
+ increment = int(ch)
+ if increment == 0:
+ raise ScannerError("while scanning a block scalar", start_mark,
+ "expected indentation indicator in the range 1-9, but found 0",
+ self.get_mark())
+ self.forward()
+ ch = self.peek()
+ if ch in '+-':
+ if ch == '+':
+ chomping = True
+ else:
+ chomping = False
+ self.forward()
+ ch = self.peek()
+ if ch not in '\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a block scalar", start_mark,
+ "expected chomping or indentation indicators, but found %r"
+ % ch, self.get_mark())
+ return chomping, increment
+
+ def scan_block_scalar_ignored_line(self, start_mark):
+ # See the specification for details.
+ while self.peek() == ' ':
+ self.forward()
+ if self.peek() == '#':
+ while self.peek() not in '\0\r\n\x85\u2028\u2029':
+ self.forward()
+ ch = self.peek()
+ if ch not in '\0\r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a block scalar", start_mark,
+ "expected a comment or a line break, but found %r" % ch,
+ self.get_mark())
+ self.scan_line_break()
+
+ def scan_block_scalar_indentation(self):
+ # See the specification for details.
+ chunks = []
+ max_indent = 0
+ end_mark = self.get_mark()
+ while self.peek() in ' \r\n\x85\u2028\u2029':
+ if self.peek() != ' ':
+ chunks.append(self.scan_line_break())
+ end_mark = self.get_mark()
+ else:
+ self.forward()
+ if self.column > max_indent:
+ max_indent = self.column
+ return chunks, max_indent, end_mark
+
+ def scan_block_scalar_breaks(self, indent):
+ # See the specification for details.
+ chunks = []
+ end_mark = self.get_mark()
+ while self.column < indent and self.peek() == ' ':
+ self.forward()
+ while self.peek() in '\r\n\x85\u2028\u2029':
+ chunks.append(self.scan_line_break())
+ end_mark = self.get_mark()
+ while self.column < indent and self.peek() == ' ':
+ self.forward()
+ return chunks, end_mark
+
+ def scan_flow_scalar(self, style):
+ # See the specification for details.
+ # Note that we relax indentation rules for quoted scalars. Quoted
+ # scalars don't need to adhere to indentation because " and ' clearly
+ # mark the beginning and the end of them. Therefore we are less
+ # restrictive than the specification requires. We only need to check
+ # that document separators are not included in scalars.
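+ # (Editor's note, illustrative only, not from upstream PyYAML: a
+ # quoted scalar may therefore continue on a line indented less than
+ # its key; only a '---'/'...' document separator aborts the scan.)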
+ if style == '"': + double = True + else: + double = False + chunks = [] + start_mark = self.get_mark() + quote = self.peek() + self.forward() + chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) + while self.peek() != quote: + chunks.extend(self.scan_flow_scalar_spaces(double, start_mark)) + chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) + self.forward() + end_mark = self.get_mark() + return ScalarToken(''.join(chunks), False, start_mark, end_mark, + style) + + ESCAPE_REPLACEMENTS = { + '0': '\0', + 'a': '\x07', + 'b': '\x08', + 't': '\x09', + '\t': '\x09', + 'n': '\x0A', + 'v': '\x0B', + 'f': '\x0C', + 'r': '\x0D', + 'e': '\x1B', + ' ': '\x20', + '\"': '\"', + '\\': '\\', + '/': '/', + 'N': '\x85', + '_': '\xA0', + 'L': '\u2028', + 'P': '\u2029', + } + + ESCAPE_CODES = { + 'x': 2, + 'u': 4, + 'U': 8, + } + + def scan_flow_scalar_non_spaces(self, double, start_mark): + # See the specification for details. + chunks = [] + while True: + length = 0 + while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029': + length += 1 + if length: + chunks.append(self.prefix(length)) + self.forward(length) + ch = self.peek() + if not double and ch == '\'' and self.peek(1) == '\'': + chunks.append('\'') + self.forward(2) + elif (double and ch == '\'') or (not double and ch in '\"\\'): + chunks.append(ch) + self.forward() + elif double and ch == '\\': + self.forward() + ch = self.peek() + if ch in self.ESCAPE_REPLACEMENTS: + chunks.append(self.ESCAPE_REPLACEMENTS[ch]) + self.forward() + elif ch in self.ESCAPE_CODES: + length = self.ESCAPE_CODES[ch] + self.forward() + for k in range(length): + if self.peek(k) not in '0123456789ABCDEFabcdef': + raise ScannerError("while scanning a double-quoted scalar", start_mark, + "expected escape sequence of %d hexdecimal numbers, but found %r" % + (length, self.peek(k)), self.get_mark()) + code = int(self.prefix(length), 16) + chunks.append(chr(code)) + self.forward(length) + elif ch in '\r\n\x85\u2028\u2029': + self.scan_line_break() + chunks.extend(self.scan_flow_scalar_breaks(double, start_mark)) + else: + raise ScannerError("while scanning a double-quoted scalar", start_mark, + "found unknown escape character %r" % ch, self.get_mark()) + else: + return chunks + + def scan_flow_scalar_spaces(self, double, start_mark): + # See the specification for details. + chunks = [] + length = 0 + while self.peek(length) in ' \t': + length += 1 + whitespaces = self.prefix(length) + self.forward(length) + ch = self.peek() + if ch == '\0': + raise ScannerError("while scanning a quoted scalar", start_mark, + "found unexpected end of stream", self.get_mark()) + elif ch in '\r\n\x85\u2028\u2029': + line_break = self.scan_line_break() + breaks = self.scan_flow_scalar_breaks(double, start_mark) + if line_break != '\n': + chunks.append(line_break) + elif not breaks: + chunks.append(' ') + chunks.extend(breaks) + else: + chunks.append(whitespaces) + return chunks + + def scan_flow_scalar_breaks(self, double, start_mark): + # See the specification for details. + chunks = [] + while True: + # Instead of checking indentation, we check for document + # separators. 
+ prefix = self.prefix(3)
+ if (prefix == '---' or prefix == '...') \
+ and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a quoted scalar", start_mark,
+ "found unexpected document separator", self.get_mark())
+ while self.peek() in ' \t':
+ self.forward()
+ if self.peek() in '\r\n\x85\u2028\u2029':
+ chunks.append(self.scan_line_break())
+ else:
+ return chunks
+
+ def scan_plain(self):
+ # See the specification for details.
+ # We add an additional restriction for the flow context:
+ # plain scalars in the flow context cannot contain ',' or '?'.
+ # We also keep track of the `allow_simple_key` flag here.
+ # Indentation rules are relaxed for the flow context.
+ chunks = []
+ start_mark = self.get_mark()
+ end_mark = start_mark
+ indent = self.indent+1
+ # We allow zero indentation for scalars, but then we need to check for
+ # document separators at the beginning of the line.
+ #if indent == 0:
+ # indent = 1
+ spaces = []
+ while True:
+ length = 0
+ if self.peek() == '#':
+ break
+ while True:
+ ch = self.peek(length)
+ if ch in '\0 \t\r\n\x85\u2028\u2029' \
+ or (ch == ':' and
+ self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
+ + (u',[]{}' if self.flow_level else u''))\
+ or (self.flow_level and ch in ',?[]{}'):
+ break
+ length += 1
+ if length == 0:
+ break
+ self.allow_simple_key = False
+ chunks.extend(spaces)
+ chunks.append(self.prefix(length))
+ self.forward(length)
+ end_mark = self.get_mark()
+ spaces = self.scan_plain_spaces(indent, start_mark)
+ if not spaces or self.peek() == '#' \
+ or (not self.flow_level and self.column < indent):
+ break
+ return ScalarToken(''.join(chunks), True, start_mark, end_mark)
+
+ def scan_plain_spaces(self, indent, start_mark):
+ # See the specification for details.
+ # The specification is really confusing about tabs in plain scalars.
+ # We just forbid them completely. Do not use tabs in YAML!
+ chunks = []
+ length = 0
+ while self.peek(length) in ' ':
+ length += 1
+ whitespaces = self.prefix(length)
+ self.forward(length)
+ ch = self.peek()
+ if ch in '\r\n\x85\u2028\u2029':
+ line_break = self.scan_line_break()
+ self.allow_simple_key = True
+ prefix = self.prefix(3)
+ if (prefix == '---' or prefix == '...') \
+ and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
+ return
+ breaks = []
+ while self.peek() in ' \r\n\x85\u2028\u2029':
+ if self.peek() == ' ':
+ self.forward()
+ else:
+ breaks.append(self.scan_line_break())
+ prefix = self.prefix(3)
+ if (prefix == '---' or prefix == '...') \
+ and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
+ return
+ if line_break != '\n':
+ chunks.append(line_break)
+ elif not breaks:
+ chunks.append(' ')
+ chunks.extend(breaks)
+ elif whitespaces:
+ chunks.append(whitespaces)
+ return chunks
+
+ def scan_tag_handle(self, name, start_mark):
+ # See the specification for details.
+ # For some strange reason, the specification does not allow '_' in
+ # tag handles. I have allowed it anyway.
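+ # (Editor's note, illustrative only, not from upstream PyYAML: this
+ # scanner therefore accepts a handle such as "!my_handle!", which a
+ # strictly conforming parser would reject.)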
+ ch = self.peek()
+ if ch != '!':
+ raise ScannerError("while scanning a %s" % name, start_mark,
+ "expected '!', but found %r" % ch, self.get_mark())
+ length = 1
+ ch = self.peek(length)
+ if ch != ' ':
+ while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
+ or ch in '-_':
+ length += 1
+ ch = self.peek(length)
+ if ch != '!':
+ self.forward(length)
+ raise ScannerError("while scanning a %s" % name, start_mark,
+ "expected '!', but found %r" % ch, self.get_mark())
+ length += 1
+ value = self.prefix(length)
+ self.forward(length)
+ return value
+
+ def scan_tag_uri(self, name, start_mark):
+ # See the specification for details.
+ # Note: we do not check if URI is well-formed.
+ chunks = []
+ length = 0
+ ch = self.peek(length)
+ while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
+ or ch in '-;/?:@&=+$,_.!~*\'()[]%':
+ if ch == '%':
+ chunks.append(self.prefix(length))
+ self.forward(length)
+ length = 0
+ chunks.append(self.scan_uri_escapes(name, start_mark))
+ else:
+ length += 1
+ ch = self.peek(length)
+ if length:
+ chunks.append(self.prefix(length))
+ self.forward(length)
+ length = 0
+ if not chunks:
+ raise ScannerError("while parsing a %s" % name, start_mark,
+ "expected URI, but found %r" % ch, self.get_mark())
+ return ''.join(chunks)
+
+ def scan_uri_escapes(self, name, start_mark):
+ # See the specification for details.
+ codes = []
+ mark = self.get_mark()
+ while self.peek() == '%':
+ self.forward()
+ for k in range(2):
+ if self.peek(k) not in '0123456789ABCDEFabcdef':
+ raise ScannerError("while scanning a %s" % name, start_mark,
+ "expected URI escape sequence of 2 hexadecimal digits, but found %r"
+ % self.peek(k), self.get_mark())
+ codes.append(int(self.prefix(2), 16))
+ self.forward(2)
+ try:
+ value = bytes(codes).decode('utf-8')
+ except UnicodeDecodeError as exc:
+ raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
+ return value
+
+ def scan_line_break(self):
+ # Transforms:
+ # '\r\n' : '\n'
+ # '\r' : '\n'
+ # '\n' : '\n'
+ # '\x85' : '\n'
+ # '\u2028' : '\u2028'
+ # '\u2029' : '\u2029'
+ # default : ''
+ ch = self.peek()
+ if ch in '\r\n\x85':
+ if self.prefix(2) == '\r\n':
+ self.forward(2)
+ else:
+ self.forward()
+ return '\n'
+ elif ch in '\u2028\u2029':
+ self.forward()
+ return ch
+ return ''
diff --git a/libs3/yaml/serializer.py b/libs3/yaml/serializer.py
new file mode 100644
index 000000000..fe911e67a
--- /dev/null
+++ b/libs3/yaml/serializer.py
@@ -0,0 +1,111 @@
+
+__all__ = ['Serializer', 'SerializerError']
+
+from .error import YAMLError
+from .events import *
+from .nodes import *
+
+class SerializerError(YAMLError):
+ pass
+
+class Serializer:
+
+ ANCHOR_TEMPLATE = 'id%03d'
+
+ def __init__(self, encoding=None,
+ explicit_start=None, explicit_end=None, version=None, tags=None):
+ self.use_encoding = encoding
+ self.use_explicit_start = explicit_start
+ self.use_explicit_end = explicit_end
+ self.use_version = version
+ self.use_tags = tags
+ self.serialized_nodes = {}
+ self.anchors = {}
+ self.last_anchor_id = 0
+ self.closed = None
+
+ def open(self):
+ if self.closed is None:
+ self.emit(StreamStartEvent(encoding=self.use_encoding))
+ self.closed = False
+ elif self.closed:
+ raise SerializerError("serializer is closed")
+ else:
+ raise SerializerError("serializer is already opened")
+
+ def close(self):
+ if self.closed is None:
+ raise SerializerError("serializer is not opened")
+ elif not self.closed:
+ self.emit(StreamEndEvent())
+ self.closed = True
+
+ #def __del__(self):
+ #
self.close() + + def serialize(self, node): + if self.closed is None: + raise SerializerError("serializer is not opened") + elif self.closed: + raise SerializerError("serializer is closed") + self.emit(DocumentStartEvent(explicit=self.use_explicit_start, + version=self.use_version, tags=self.use_tags)) + self.anchor_node(node) + self.serialize_node(node, None, None) + self.emit(DocumentEndEvent(explicit=self.use_explicit_end)) + self.serialized_nodes = {} + self.anchors = {} + self.last_anchor_id = 0 + + def anchor_node(self, node): + if node in self.anchors: + if self.anchors[node] is None: + self.anchors[node] = self.generate_anchor(node) + else: + self.anchors[node] = None + if isinstance(node, SequenceNode): + for item in node.value: + self.anchor_node(item) + elif isinstance(node, MappingNode): + for key, value in node.value: + self.anchor_node(key) + self.anchor_node(value) + + def generate_anchor(self, node): + self.last_anchor_id += 1 + return self.ANCHOR_TEMPLATE % self.last_anchor_id + + def serialize_node(self, node, parent, index): + alias = self.anchors[node] + if node in self.serialized_nodes: + self.emit(AliasEvent(alias)) + else: + self.serialized_nodes[node] = True + self.descend_resolver(parent, index) + if isinstance(node, ScalarNode): + detected_tag = self.resolve(ScalarNode, node.value, (True, False)) + default_tag = self.resolve(ScalarNode, node.value, (False, True)) + implicit = (node.tag == detected_tag), (node.tag == default_tag) + self.emit(ScalarEvent(alias, node.tag, implicit, node.value, + style=node.style)) + elif isinstance(node, SequenceNode): + implicit = (node.tag + == self.resolve(SequenceNode, node.value, True)) + self.emit(SequenceStartEvent(alias, node.tag, implicit, + flow_style=node.flow_style)) + index = 0 + for item in node.value: + self.serialize_node(item, node, index) + index += 1 + self.emit(SequenceEndEvent()) + elif isinstance(node, MappingNode): + implicit = (node.tag + == self.resolve(MappingNode, node.value, True)) + self.emit(MappingStartEvent(alias, node.tag, implicit, + flow_style=node.flow_style)) + for key, value in node.value: + self.serialize_node(key, node, None) + self.serialize_node(value, node, key) + self.emit(MappingEndEvent()) + self.ascend_resolver() + diff --git a/libs3/yaml/tokens.py b/libs3/yaml/tokens.py new file mode 100644 index 000000000..4d0b48a39 --- /dev/null +++ b/libs3/yaml/tokens.py @@ -0,0 +1,104 @@ + +class Token(object): + def __init__(self, start_mark, end_mark): + self.start_mark = start_mark + self.end_mark = end_mark + def __repr__(self): + attributes = [key for key in self.__dict__ + if not key.endswith('_mark')] + attributes.sort() + arguments = ', '.join(['%s=%r' % (key, getattr(self, key)) + for key in attributes]) + return '%s(%s)' % (self.__class__.__name__, arguments) + +#class BOMToken(Token): +# id = '<byte order mark>' + +class DirectiveToken(Token): + id = '<directive>' + def __init__(self, name, value, start_mark, end_mark): + self.name = name + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + +class DocumentStartToken(Token): + id = '<document start>' + +class DocumentEndToken(Token): + id = '<document end>' + +class StreamStartToken(Token): + id = '<stream start>' + def __init__(self, start_mark=None, end_mark=None, + encoding=None): + self.start_mark = start_mark + self.end_mark = end_mark + self.encoding = encoding + +class StreamEndToken(Token): + id = '<stream end>' + +class BlockSequenceStartToken(Token): + id = '<block sequence start>' + +class 
BlockMappingStartToken(Token): + id = '<block mapping start>' + +class BlockEndToken(Token): + id = '<block end>' + +class FlowSequenceStartToken(Token): + id = '[' + +class FlowMappingStartToken(Token): + id = '{' + +class FlowSequenceEndToken(Token): + id = ']' + +class FlowMappingEndToken(Token): + id = '}' + +class KeyToken(Token): + id = '?' + +class ValueToken(Token): + id = ':' + +class BlockEntryToken(Token): + id = '-' + +class FlowEntryToken(Token): + id = ',' + +class AliasToken(Token): + id = '<alias>' + def __init__(self, value, start_mark, end_mark): + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + +class AnchorToken(Token): + id = '<anchor>' + def __init__(self, value, start_mark, end_mark): + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + +class TagToken(Token): + id = '<tag>' + def __init__(self, value, start_mark, end_mark): + self.value = value + self.start_mark = start_mark + self.end_mark = end_mark + +class ScalarToken(Token): + id = '<scalar>' + def __init__(self, value, plain, start_mark, end_mark, style=None): + self.value = value + self.plain = plain + self.start_mark = start_mark + self.end_mark = end_mark + self.style = style + diff --git a/views/episodes.tpl b/views/episodes.tpl index 9567098d9..3b928335b 100644 --- a/views/episodes.tpl +++ b/views/episodes.tpl @@ -81,7 +81,7 @@ $('.ui.accordion').accordion(); const first_season_acc_title = document.getElementsByClassName("title")[0]; first_season_acc_title.className += " active"; - const first_season_acc_content = document.getElementsByClassName("content")[0]; + const first_season_acc_content = document.getElementsByClassName("content season")[0]; first_season_acc_content.className += " active"; }); </script> @@ -189,7 +189,7 @@ </div> </div> </div> - <div class="content"> + <div class="content season"> <table class="ui very basic single line selectable table"> <thead> <tr> @@ -222,7 +222,7 @@ <td> %if episode['subtitles'] is not None: % actual_languages = ast.literal_eval(episode['subtitles']) - % actual_languages = sorted(actual_languages, key=lambda x: (x is None, x)) + % actual_languages.sort(key=lambda x: x[0]) %else: % actual_languages = '[]' %end @@ -731,7 +731,7 @@ render: function ( data, type, row ) { const array_release_info = data.release_info; let i; - let text = '<div class="ui fluid accordion"><div class="title"><i class="dropdown icon"></i>...</div><div class="content">'; + let text = '<div class="ui fluid accordion"><div class="title"><i class="dropdown icon"></i>...</div><div class="content season">'; for (i = 0; i < array_release_info.length; i++) { text += '<div class="ui tiny label" style="margin-bottom: 2px;">' + array_release_info[i] + '</div>'; } diff --git a/views/menu.tpl b/views/menu.tpl index a44b7b7a4..b13e44326 100644 --- a/views/menu.tpl +++ b/views/menu.tpl @@ -61,6 +61,7 @@ % from database import database % import operator % from config import settings + % from functools import reduce %if settings.sonarr.getboolean('only_monitored'): % monitored_only_query_string_sonarr = ' AND monitored = "True"' @@ -226,6 +227,31 @@ % elif restart_required['configured'] == '1': <div class='ui center aligned grid'><div class='fifteen wide column'><div class="ui red message">Bazarr Needs To Be Restarted To Apply Changes To General Settings. 
Click <a href=# id="restart_link">Here</a> To Restart.</div></div></div> % end + + % from six import PY2 + % import datetime + % if PY2: + <div class='ui left aligned grid'> + <div class='fluid column'> + <div class="ui yellow icon message"> + <i class="python icon"></i> + <div class="content"> + <div class="header">Python deprecation warning</div> + Bazarr is now compatible with Python 3.6 and newer. You should upgrade Python as we'll drop support for Python 2.7.x by the end of 2019. + <div class="ui bulleted list"> + % if os.name == 'posix': + <div class="item">If you are running under Docker, don't worry, we'll take care of this for you. Just pull the new image.</div> + % end + % if os.name == 'nt': + <div class="item">If you have installed using the Windows Installer, we'll provide a new installer soon.</div> + % end + <div class="item">If you are running from source, it's up to you to install Python 3 and use it to run Bazarr.</div> + </div> + </div> + </div> + </div> + </div> + % end </div> </body> </html> diff --git a/views/movie.tpl b/views/movie.tpl index 344a47589..7b917b7ac 100644 --- a/views/movie.tpl +++ b/views/movie.tpl @@ -178,7 +178,7 @@ <tbody> <% subtitles_files = ast.literal_eval(str(details['subtitles'])) - subtitles_files = sorted(subtitles_files, key=lambda x: (x is None, x)) + subtitles_files.sort(key=lambda x: x[0]) if subtitles_files is not None: for subtitles_file in subtitles_files: if subtitles_file[0].endswith(':forced'): diff --git a/views/system.tpl b/views/system.tpl index 3b66e5360..309067dbe 100644 --- a/views/system.tpl +++ b/views/system.tpl @@ -154,7 +154,7 @@ <tr> <td>{{provider[0]}}</td> <td>{{provider[1] if provider[1] is not None else "Good"}}</td> - <td>{{provider[2] if provider[2] is not "now" else "-"}}</td> + <td>{{provider[2] if provider[2] != "now" else "-"}}</td> </tr> %end </tbody> diff --git a/views/wanted.tpl b/views/wanted.tpl index 7284aada9..3f4c8e4b3 100644 --- a/views/wanted.tpl +++ b/views/wanted.tpl @@ -47,6 +47,7 @@ % from database import database % import operator % from config import settings + % from functools import reduce %if settings.sonarr.getboolean('only_monitored'): % episodes_missing_subtitles_clause = " AND monitored='True'" |