about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Sergey M․ <[email protected]> 2016-06-04 00:26:03 +0700
committer Sergey M․ <[email protected]> 2016-06-04 00:26:03 +0700
commit 4d8856d511aef11b5dbeb9f6523c2a117bdbb85d (patch)
tree c256b5149d914edcef72dab56bd675c463fc5cac
parent c917106be4d6d98ce7504d71a32b58ddca2bc03d (diff)
download youtube-dl-4d8856d511aef11b5dbeb9f6523c2a117bdbb85d.tar.gz
youtube-dl-4d8856d511aef11b5dbeb9f6523c2a117bdbb85d.zip
[loc] Extract direct download links
-rw-r--r-- youtube_dl/extractor/libraryofcongress.py 38
1 files changed, 37 insertions, 1 deletions
diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py
index 49351759e..0a94366fd 100644
--- a/youtube_dl/extractor/libraryofcongress.py
+++ b/youtube_dl/extractor/libraryofcongress.py
@@ -1,12 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ parse_filesize,
)
@@ -40,6 +43,20 @@ class LibraryOfCongressIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # with direct download links
+ 'url': 'https://www.loc.gov/item/78710669/',
+ 'info_dict': {
+ 'id': '78710669',
+ 'ext': 'mp4',
+ 'title': 'La vie et la passion de Jesus-Christ',
+ 'duration': 0,
+ 'view_count': int,
+ 'formats': 'mincount:4',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -60,6 +77,9 @@ class LibraryOfCongressIE(InfoExtractor):
derivative = data['derivatives'][0]
media_url = derivative['derivativeUrl']
+ title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+ webpage)
+
# Following algorithm was extracted from setAVSource js function
# found in webpage
media_url = media_url.replace('rtmp', 'https')
@@ -75,6 +95,7 @@ class LibraryOfCongressIE(InfoExtractor):
'format_id': 'hls',
'ext': 'mp4',
'protocol': 'm3u8_native',
+ 'quality': 1,
}]
elif 'vod/mp3:' in media_url:
formats = [{
@@ -82,9 +103,24 @@ class LibraryOfCongressIE(InfoExtractor):
'vcodec': 'none',
}]
+ download_urls = set()
+ for m in re.finditer(
+ r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
+ format_id = m.group('id').lower()
+ if format_id == 'gif':
+ continue
+ download_url = m.group('url')
+ if download_url in download_urls:
+ continue
+ download_urls.add(download_url)
+ formats.append({
+ 'url': download_url,
+ 'format_id': format_id,
+ 'filesize_approx': parse_filesize(m.group('size')),
+ })
+
self._sort_formats(formats)
- title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(webpage)
duration = float_or_none(data.get('duration'))
view_count = int_or_none(data.get('viewCount'))