# youtube_dl/extractor/packtpub.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    clean_html,
    ExtractorError,
    remove_end,
    strip_or_none,
    unified_timestamp,
    urljoin,
)


class PacktPubBaseIE(InfoExtractor):
    """Common base holding the endpoint constants for Packt extractors."""

    # Root of the public site; also used as the base for relative URLs.
    _PACKT_BASE = 'https://www.packtpub.com'
    # Root of the JSON REST API, derived from the site root above.
    _MAPT_REST = '%s/mapt-rest' % (_PACKT_BASE, )


class PacktPubIE(PacktPubBaseIE):
    """Extractor for a single video section of a Packt "mapt" course.

    The URL carries three numeric identifiers (course, chapter and section);
    the section id is used as the video id.
    """

    _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)'

    _TEST = {
        'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro',
        'md5': '1e74bd6cfd45d7d07666f4684ef58f70',
        'info_dict': {
            'id': '20530',
            'ext': 'mp4',
            'title': 'Project Intro',
            'thumbnail': r're:(?i)^https?://.*\.jpg',
            'timestamp': 1490918400,
            'upload_date': '20170331',
        },
    }

    def _handle_error(self, response):
        """Raise ExtractorError unless the API response reports success.

        The response is expected to be a dict with a 'status' key; when the
        status is anything other than 'success', its 'message' is reported to
        the user prefixed with this extractor's name.
        """
        if response.get('status') != 'success':
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, response['message']),
                expected=True)

    def _download_json(self, *args, **kwargs):
        """Download JSON as the parent class does, then validate its status.

        Every JSON response fetched through this extractor is passed to
        _handle_error() before being returned to the caller.
        """
        response = super(PacktPubIE, self)._download_json(*args, **kwargs)
        self._handle_error(response)
        return response

    def _real_extract(self, url):
        """Build the info dict for one course section.

        Raises ExtractorError when the section data has no 'content'
        (reported as a locked video) or when an API call is unsuccessful.
        """
        mobj = re.match(self._VALID_URL, url)
        course_id, chapter_id, video_id = mobj.group(
            'course_id', 'chapter_id', 'id')

        # Section data for the current user; this is where the media URL
        # lives.
        video = self._download_json(
            '%s/users/me/products/%s/chapters/%s/sections/%s'
            % (self._MAPT_REST, course_id, chapter_id, video_id), video_id,
            'Downloading JSON video')['data']

        # Without a 'content' entry there is nothing to download.
        content = video.get('content')
        if not content:
            raise ExtractorError('This video is locked', expected=True)

        video_url = content['file']

        # Separate metadata request supplying title, date and thumbnail.
        metadata = self._download_json(
            '%s/products/%s/chapters/%s/sections/%s/metadata'
            % (self._MAPT_REST, course_id, chapter_id, video_id),
            video_id)['data']

        title = metadata['pageTitle']
        course_title = metadata.get('title')
        if course_title:
            # Drop the trailing " - <course title>" from the page title so
            # that only the section's own title remains.
            title = remove_end(title, ' - %s' % course_title)
        timestamp = unified_timestamp(metadata.get('publicationDate'))
        # 'filepath' is resolved against the site root.
        thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
        }


class PacktPubCourseIE(PacktPubBaseIE):
    """Playlist extractor for a whole Packt "mapt" video course.

    Every section listed in the course's table of contents becomes a
    url_transparent entry that is handed over to PacktPubIE.
    """

    _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))'
    _TEST = {
        'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215',
        'info_dict': {
            'id': '9781787122215',
            'title': 'Learn Nodejs by building 12 projects [Video]',
        },
        'playlist_count': 90,
    }

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor should handle *url*.

        URLs that PacktPubIE accepts are always rejected here, so single
        section URLs are never treated as a course.
        """
        if PacktPubIE.suitable(url):
            return False
        return super(PacktPubCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one entry per section of the course."""
        match = re.match(self._VALID_URL, url)
        course_url = match.group('url')
        course_id = match.group('id')

        metadata_url = '%s/products/%s/metadata' % (
            self._MAPT_REST, course_id)
        course = self._download_json(metadata_url, course_id)['data']

        def chapter_entries(number, chapter):
            # Yield the entries of one table-of-contents item; items that
            # are not chapters or lack a list of children yield nothing.
            if chapter.get('type') != 'chapter':
                return
            sections = chapter.get('children')
            if not isinstance(sections, list):
                return
            chapter_title = chapter.get('title')
            chapter_id = chapter.get('id')
            for section in sections:
                # Only sections carrying a string 'seoUrl' can be linked.
                if (section.get('type') != 'section'
                        or not isinstance(section.get('seoUrl'), compat_str)):
                    continue
                yield {
                    '_type': 'url_transparent',
                    # 'seoUrl' is resolved relative to the course URL.
                    'url': urljoin(course_url + '/', section['seoUrl']),
                    'title': strip_or_none(section.get('title')),
                    'description': clean_html(section.get('summary')),
                    'ie_key': PacktPubIE.ie_key(),
                    'chapter': chapter_title,
                    'chapter_number': number,
                    'chapter_id': chapter_id,
                }

        entries = []
        # Numbering starts at 1 and counts every table-of-contents item,
        # including the ones that end up being skipped.
        for number, chapter in enumerate(course['tableOfContents'], 1):
            entries.extend(chapter_entries(number, chapter))

        return self.playlist_result(entries, course_id, course.get('title'))