summary refs log tree commit diff stats
path: root/yt_dlp/extractor/khanacademy.py
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp/extractor/khanacademy.py')
-rw-r--r-- yt_dlp/extractor/khanacademy.py 110
1 files changed, 110 insertions, 0 deletions
diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py
new file mode 100644
index 0000000..5333036
--- /dev/null
+++ b/yt_dlp/extractor/khanacademy.py
@@ -0,0 +1,110 @@
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ try_get,
+)
+
+
+class KhanAcademyBaseIE(InfoExtractor):
+    """Shared logic for the Khan Academy extractors.
+
+    Subclasses fill in ``_VALID_URL`` from ``_VALID_URL_TEMPL`` and provide
+    ``_parse_component_props``, which ``_real_extract`` calls with the
+    ``componentProps`` object of the fetched content JSON.
+    """
+
+    # Two %s slots: the number of leading path segments, then an optional
+    # literal prefix placed before the final path segment.
+    _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
+
+    def _parse_video(self, video):
+        """Turn a video dict into a url_transparent result handled by Youtube.
+
+        ``video['youtubeId']`` is mandatory (a missing key raises KeyError);
+        every other field is read with ``.get`` and may end up as None.
+        """
+        # Look up the mandatory key first so a malformed item fails here.
+        youtube_id = video['youtubeId']
+        thumbnail = video.get('imageUrl') or video.get('thumbnailUrl')
+        duration = int_or_none(video.get('duration'))
+        return {
+            '_type': 'url_transparent',
+            'url': youtube_id,
+            'id': video.get('slug'),
+            'title': video.get('title'),
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'description': video.get('description'),
+            'ie_key': 'Youtube',
+        }
+
+    def _real_extract(self, url):
+        """Fetch the content JSON for the matched path and parse it.
+
+        The matched ``id`` group (the URL path) is sent as the ``path``
+        variable of the FetchContentData request. The ``contentJson`` string
+        from the response is decoded and its ``componentProps`` is passed to
+        ``_parse_component_props``.
+        """
+        display_id = self._match_id(url)
+
+        graphql_variables = json.dumps({
+            'path': display_id,
+            'queryParams': 'lang=en',
+            'isModal': False,
+            'followRedirects': True,
+            'countryCode': 'US',
+        })
+        request_query = {
+            'fastly_cacheable': 'persist_until_publish',
+            'hash': '4134764944',
+            'lang': 'en',
+            'variables': graphql_variables,
+        }
+
+        response = self._download_json(
+            'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
+            display_id, query=request_query)
+        # contentJson is itself a JSON-encoded string and needs a second parse.
+        content = response['data']['contentJson']
+        component_props = self._parse_json(content, display_id)['componentProps']
+        return self._parse_component_props(component_props)
+
+
+class KhanAcademyIE(KhanAcademyBaseIE):
+    """Extractor for a single Khan Academy video page (``.../v/<slug>``)."""
+
+    IE_NAME = 'khanacademy'
+    # Four leading path segments, then the literal 'v/' before the final segment.
+    _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
+    _TEST = {
+        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
+        'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
+        'info_dict': {
+            'id': 'FlIG3TvQCBQ',
+            'ext': 'mp4',
+            'title': 'The one-time pad',
+            'description': 'The perfect cipher',
+            'duration': 176,
+            'uploader': 'Brit Cruise',
+            'uploader_id': 'khanacademy',
+            'upload_date': '20120411',
+            'timestamp': 1334170113,
+            'license': 'cc-by-nc-sa',
+        },
+        'add_ie': ['Youtube'],
+    }
+
+    def _parse_component_props(self, component_props):
+        """Build the video result from ``tutorialPageData.contentModel``.
+
+        Extends the dict from ``_parse_video`` with uploader (author names
+        joined by ', ', or None when there are none), timestamp and license.
+        """
+        content_model = component_props['tutorialPageData']['contentModel']
+        result = self._parse_video(content_model)
+
+        authors = content_model.get('authorNames')
+        if authors:
+            uploader = ', '.join(authors)
+        else:
+            uploader = None
+
+        result['uploader'] = uploader
+        result['timestamp'] = parse_iso8601(content_model.get('dateAdded'))
+        result['license'] = content_model.get('kaUserLicense')
+        return result
+
+
+class KhanAcademyUnitIE(KhanAcademyBaseIE):
+    """Extractor for a Khan Academy unit page, returned as a playlist."""
+
+    IE_NAME = 'khanacademy:unit'
+    # Two leading path segments plus the final one, with nothing after it
+    # except an optional trailing slash, query string or fragment.
+    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
+    _TEST = {
+        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
+        'info_dict': {
+            'id': 'cryptography',
+            'title': 'Cryptography',
+            'description': 'How have humans protected their secret messages through history? What has changed today?',
+        },
+        'playlist_mincount': 31,
+    }
+
+    def _parse_component_props(self, component_props):
+        """Collect every 'Video' content item of the unit into a playlist.
+
+        Tutorials are read from the first module of the first tab only. Each
+        tutorial is treated as a chapter, numbered from 1, and its chapter
+        fields are merged into every video entry it contains.
+        """
+        curation = component_props['curation']
+        # try_get yields None when the path is missing or is not a list.
+        tutorials = try_get(
+            curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
+
+        entries = []
+        for chapter_number, tutorial in enumerate(tutorials, start=1):
+            chapter_fields = {
+                'chapter': tutorial.get('title'),
+                'chapter_number': chapter_number,
+                'chapter_id': tutorial.get('id'),
+            }
+            for item in tutorial.get('contentItems') or []:
+                # Only items whose kind is 'Video' become entries.
+                if item.get('kind') != 'Video':
+                    continue
+                entries.append({**self._parse_video(item), **chapter_fields})
+
+        playlist_id = curation.get('unit')
+        playlist_title = curation.get('title')
+        playlist_description = curation.get('description')
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)