From 2415e66f889f38503b73e8ebc5f43ca342390e5c Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 18:49:24 +0200
Subject: Adding upstream version 2024.03.10.

Signed-off-by: Daniel Baumann
---
 yt_dlp/YoutubeDL.py | 4339 ++++
 yt_dlp/__init__.py | 1054 ++++
 yt_dlp/__main__.py | 17 +
 yt_dlp/__pyinstaller/__init__.py | 5 +
 yt_dlp/__pyinstaller/hook-yt_dlp.py | 34 +
 yt_dlp/aes.py | 567 ++
 yt_dlp/cache.py | 91 +
 yt_dlp/compat/__init__.py | 79 +
 yt_dlp/compat/_deprecated.py | 23 +
 yt_dlp/compat/_legacy.py | 108 +
 yt_dlp/compat/compat_utils.py | 83 +
 yt_dlp/compat/functools.py | 12 +
 yt_dlp/compat/imghdr.py | 16 +
 yt_dlp/compat/shutil.py | 30 +
 yt_dlp/compat/types.py | 13 +
 yt_dlp/compat/urllib/__init__.py | 10 +
 yt_dlp/compat/urllib/request.py | 40 +
 yt_dlp/cookies.py | 1346 +++++
 yt_dlp/dependencies/Cryptodome.py | 38 +
 yt_dlp/dependencies/__init__.py | 92 +
 yt_dlp/downloader/__init__.py | 131 +
 yt_dlp/downloader/common.py | 486 ++
 yt_dlp/downloader/dash.py | 90 +
 yt_dlp/downloader/external.py | 664 +++
 yt_dlp/downloader/f4m.py | 427 ++
 yt_dlp/downloader/fc2.py | 46 +
 yt_dlp/downloader/fragment.py | 527 ++
 yt_dlp/downloader/hls.py | 378 ++
 yt_dlp/downloader/http.py | 383 ++
 yt_dlp/downloader/ism.py | 283 +
 yt_dlp/downloader/mhtml.py | 189 +
 yt_dlp/downloader/niconico.py | 140 +
 yt_dlp/downloader/rtmp.py | 213 +
 yt_dlp/downloader/rtsp.py | 42 +
 yt_dlp/downloader/websocket.py | 53 +
 yt_dlp/downloader/youtube_live_chat.py | 228 +
 yt_dlp/extractor/__init__.py | 42 +
 yt_dlp/extractor/_extractors.py | 2493 ++++
 yt_dlp/extractor/abc.py | 421 ++
 yt_dlp/extractor/abcnews.py | 153 +
 yt_dlp/extractor/abcotvs.py | 130 +
 yt_dlp/extractor/abematv.py | 484 +
 yt_dlp/extractor/academicearth.py | 39 +
 yt_dlp/extractor/acast.py | 143 +
 yt_dlp/extractor/acfun.py | 200 +
 yt_dlp/extractor/adn.py | 335 ++
 yt_dlp/extractor/adobeconnect.py | 34 +
 yt_dlp/extractor/adobepass.py | 1778 ++++++
 yt_dlp/extractor/adobetv.py | 286 +
 yt_dlp/extractor/adultswim.py | 198 +
 yt_dlp/extractor/aenetworks.py | 369 ++
 yt_dlp/extractor/aeonco.py | 74 +
 yt_dlp/extractor/afreecatv.py | 484 +
 yt_dlp/extractor/agora.py | 251 +
 yt_dlp/extractor/airtv.py | 96 +
 yt_dlp/extractor/aitube.py | 60 +
 yt_dlp/extractor/aliexpress.py | 50 +
 yt_dlp/extractor/aljazeera.py | 83 +
 yt_dlp/extractor/allocine.py | 125 +
 yt_dlp/extractor/allstar.py | 253 +
 yt_dlp/extractor/alphaporno.py | 75 +
 yt_dlp/extractor/alsace20tv.py | 83 +
 yt_dlp/extractor/altcensored.py | 104 +
 yt_dlp/extractor/alura.py | 167 +
 yt_dlp/extractor/amadeustv.py | 77 +
 yt_dlp/extractor/amara.py | 100 +
 yt_dlp/extractor/amazon.py | 170 +
 yt_dlp/extractor/amazonminitv.py | 294 +
 yt_dlp/extractor/amcnetworks.py | 147 +
 yt_dlp/extractor/americastestkitchen.py | 215 +
 yt_dlp/extractor/amp.py | 101 +
 yt_dlp/extractor/anchorfm.py | 98 +
 yt_dlp/extractor/angel.py | 56 +
 yt_dlp/extractor/antenna.py | 143 +
 yt_dlp/extractor/anvato.py | 404 ++
 yt_dlp/extractor/aol.py | 133 +
 yt_dlp/extractor/apa.py | 82 +
 yt_dlp/extractor/aparat.py | 88 +
 yt_dlp/extractor/appleconnect.py | 50 +
 yt_dlp/extractor/applepodcasts.py | 85 +
 yt_dlp/extractor/appletrailers.py | 278 +
 yt_dlp/extractor/archiveorg.py | 947 +++
 yt_dlp/extractor/arcpublishing.py | 164 +
 yt_dlp/extractor/ard.py | 579 ++
 yt_dlp/extractor/arkena.py | 150 +
 yt_dlp/extractor/arnes.py | 98 +
 yt_dlp/extractor/art19.py | 303 +
 yt_dlp/extractor/arte.py | 345 ++
 yt_dlp/extractor/asobichannel.py | 168 +
 yt_dlp/extractor/atresplayer.py | 104 +
 yt_dlp/extractor/atscaleconf.py | 34 +
 yt_dlp/extractor/atvat.py | 108 +
 yt_dlp/extractor/audimedia.py | 89 +
 yt_dlp/extractor/audioboom.py | 57 +
 yt_dlp/extractor/audiodraft.py | 93 +
 yt_dlp/extractor/audiomack.py | 147 +
 yt_dlp/extractor/audius.py | 271 +
 yt_dlp/extractor/awaan.py | 184 +
 yt_dlp/extractor/aws.py | 75 +
 yt_dlp/extractor/axs.py | 89 +
 yt_dlp/extractor/azmedien.py | 66 +
 yt_dlp/extractor/baidu.py | 51 +
 yt_dlp/extractor/banbye.py | 168 +
 yt_dlp/extractor/bandaichannel.py | 33 +
 yt_dlp/extractor/bandcamp.py | 485 ++
 yt_dlp/extractor/bannedvideo.py | 155 +
 yt_dlp/extractor/bbc.py | 1660 ++++++
 yt_dlp/extractor/beatbump.py | 111 +
 yt_dlp/extractor/beatport.py | 97 +
 yt_dlp/extractor/beeg.py | 90 +
 yt_dlp/extractor/behindkink.py | 42 +
 yt_dlp/extractor/bellmedia.py | 91 +
 yt_dlp/extractor/berufetv.py | 70 +
 yt_dlp/extractor/bet.py | 79 +
 yt_dlp/extractor/bfi.py | 35 +
 yt_dlp/extractor/bfmtv.py | 119 +
 yt_dlp/extractor/bibeltv.py | 197 +
 yt_dlp/extractor/bigflix.py | 73 +
 yt_dlp/extractor/bigo.py | 57 +
 yt_dlp/extractor/bild.py | 63 +
 yt_dlp/extractor/bilibili.py | 2233 +++++++
 yt_dlp/extractor/biobiochiletv.py | 83 +
 yt_dlp/extractor/bitchute.py | 275 +
 yt_dlp/extractor/blackboardcollaborate.py | 63 +
 yt_dlp/extractor/bleacherreport.py | 110 +
 yt_dlp/extractor/blerp.py | 167 +
 yt_dlp/extractor/blogger.py | 45 +
 yt_dlp/extractor/bloomberg.py | 77 +
 yt_dlp/extractor/bokecc.py | 53 +
 yt_dlp/extractor/bongacams.py | 70 +
 yt_dlp/extractor/boosty.py | 209 +
 yt_dlp/extractor/bostonglobe.py | 69 +
 yt_dlp/extractor/box.py | 83 +
 yt_dlp/extractor/boxcast.py | 102 +
 yt_dlp/extractor/bpb.py | 170 +
 yt_dlp/extractor/br.py | 166 +
 yt_dlp/extractor/brainpop.py | 318 +
 yt_dlp/extractor/bravotv.py | 189 +
 yt_dlp/extractor/breitbart.py | 34 +
 yt_dlp/extractor/brightcove.py | 952 +++
 yt_dlp/extractor/brilliantpala.py | 127 +
 yt_dlp/extractor/bundesliga.py | 34 +
 yt_dlp/extractor/bundestag.py | 123 +
 yt_dlp/extractor/businessinsider.py | 45 +
 yt_dlp/extractor/buzzfeed.py | 95 +
 yt_dlp/extractor/byutv.py | 104 +
 yt_dlp/extractor/c56.py | 59 +
 yt_dlp/extractor/cableav.py | 32 +
 yt_dlp/extractor/callin.py | 155 +
 yt_dlp/extractor/caltrans.py | 37 +
 yt_dlp/extractor/cam4.py | 31 +
 yt_dlp/extractor/camdemy.py | 158 +
 yt_dlp/extractor/camfm.py | 85 +
 yt_dlp/extractor/cammodels.py | 77 +
 yt_dlp/extractor/camsoda.py | 57 +
 yt_dlp/extractor/camtasia.py | 71 +
 yt_dlp/extractor/canal1.py | 39 +
 yt_dlp/extractor/canalalpha.py | 94 +
 yt_dlp/extractor/canalc2.py | 68 +
 yt_dlp/extractor/canalplus.py | 110 +
 yt_dlp/extractor/caracoltv.py | 136 +
 yt_dlp/extractor/cartoonnetwork.py | 59 +
 yt_dlp/extractor/cbc.py | 653 +++
 yt_dlp/extractor/cbs.py | 280 +
 yt_dlp/extractor/cbsnews.py | 443 ++
 yt_dlp/extractor/cbssports.py | 111 +
 yt_dlp/extractor/ccc.py | 115 +
 yt_dlp/extractor/ccma.py | 147 +
 yt_dlp/extractor/cctv.py | 201 +
 yt_dlp/extractor/cda.py | 338 ++
 yt_dlp/extractor/cellebrite.py | 63 +
 yt_dlp/extractor/ceskatelevize.py | 289 +
 yt_dlp/extractor/cgtn.py | 65 +
 yt_dlp/extractor/charlierose.py | 50 +
 yt_dlp/extractor/chaturbate.py | 106 +
 yt_dlp/extractor/chilloutzone.py | 123 +
 yt_dlp/extractor/chzzk.py | 139 +
 yt_dlp/extractor/cinemax.py | 25 +
 yt_dlp/extractor/cinetecamilano.py | 61 +
 yt_dlp/extractor/cineverse.py | 139 +
 yt_dlp/extractor/ciscolive.py | 145 +
 yt_dlp/extractor/ciscowebex.py | 106 +
 yt_dlp/extractor/cjsw.py | 67 +
 yt_dlp/extractor/clipchamp.py | 61 +
 yt_dlp/extractor/clippit.py | 70 +
 yt_dlp/extractor/cliprs.py | 31 +
 yt_dlp/extractor/closertotruth.py | 89 +
 yt_dlp/extractor/cloudflarestream.py | 76 +
 yt_dlp/extractor/cloudycdn.py | 79 +
 yt_dlp/extractor/clubic.py | 53 +
 yt_dlp/extractor/clyp.py | 99 +
 yt_dlp/extractor/cmt.py | 55 +
 yt_dlp/extractor/cnbc.py | 97 +
 yt_dlp/extractor/cnn.py | 198 +
 yt_dlp/extractor/comedycentral.py | 55 +
 yt_dlp/extractor/common.py | 3943 +++++++++++++
 yt_dlp/extractor/commonmistakes.py | 42 +
 yt_dlp/extractor/commonprotocols.py | 70 +
 yt_dlp/extractor/condenast.py | 250 +
 yt_dlp/extractor/contv.py | 113 +
 yt_dlp/extractor/corus.py | 154 +
 yt_dlp/extractor/coub.py | 136 +
 yt_dlp/extractor/cozytv.py | 37 +
 yt_dlp/extractor/cpac.py | 136 +
 yt_dlp/extractor/cracked.py | 88 +
 yt_dlp/extractor/crackle.py | 243 +
 yt_dlp/extractor/craftsy.py | 75 +
 yt_dlp/extractor/crooksandliars.py | 56 +
 yt_dlp/extractor/crowdbunker.py | 109 +
 yt_dlp/extractor/crtvg.py | 53 +
 yt_dlp/extractor/crunchyroll.py | 650 +++
 yt_dlp/extractor/cspan.py | 286 +
 yt_dlp/extractor/ctsnews.py | 84 +
 yt_dlp/extractor/ctv.py | 49 +
 yt_dlp/extractor/ctvnews.py | 70 +
 yt_dlp/extractor/cultureunplugged.py | 65 +
 yt_dlp/extractor/curiositystream.py | 203 +
 yt_dlp/extractor/cwtv.py | 99 +
 yt_dlp/extractor/cybrary.py | 144 +
 yt_dlp/extractor/dacast.py | 158 +
 yt_dlp/extractor/dailymail.py | 73 +
 yt_dlp/extractor/dailymotion.py | 474 ++
 yt_dlp/extractor/dailywire.py | 113 +
 yt_dlp/extractor/damtomo.py | 108 +
 yt_dlp/extractor/daum.py | 258 +
 yt_dlp/extractor/daystar.py | 47 +
 yt_dlp/extractor/dbtv.py | 47 +
 yt_dlp/extractor/dctp.py | 102 +
 yt_dlp/extractor/deezer.py | 142 +
 yt_dlp/extractor/democracynow.py | 91 +
 yt_dlp/extractor/detik.py | 159 +
 yt_dlp/extractor/deuxm.py | 76 +
 yt_dlp/extractor/dfb.py | 52 +
 yt_dlp/extractor/dhm.py | 58 +
 yt_dlp/extractor/digitalconcerthall.py | 150 +
 yt_dlp/extractor/digiteka.py | 98 +
 yt_dlp/extractor/discogs.py | 35 +
 yt_dlp/extractor/discovery.py | 115 +
 yt_dlp/extractor/discoverygo.py | 172 +
 yt_dlp/extractor/disney.py | 160 +
 yt_dlp/extractor/dispeak.py | 127 +
 yt_dlp/extractor/dlf.py | 192 +
 yt_dlp/extractor/dlive.py | 92 +
 yt_dlp/extractor/douyutv.py | 306 +
 yt_dlp/extractor/dplay.py | 1059 ++++
 yt_dlp/extractor/drbonanza.py | 54 +
 yt_dlp/extractor/dreisat.py | 41 +
 yt_dlp/extractor/drooble.py | 113 +
 yt_dlp/extractor/dropbox.py | 90 +
 yt_dlp/extractor/dropout.py | 224 +
 yt_dlp/extractor/drtuber.py | 104 +
 yt_dlp/extractor/drtv.py | 401 ++
 yt_dlp/extractor/dtube.py | 80 +
 yt_dlp/extractor/duboku.py | 247 +
 yt_dlp/extractor/dumpert.py | 114 +
 yt_dlp/extractor/duoplay.py | 104 +
 yt_dlp/extractor/dvtv.py | 177 +
 yt_dlp/extractor/dw.py | 110 +
 yt_dlp/extractor/eagleplatform.py | 215 +
 yt_dlp/extractor/ebaumsworld.py | 31 +
 yt_dlp/extractor/ebay.py | 36 +
 yt_dlp/extractor/egghead.py | 134 +
 yt_dlp/extractor/eighttracks.py | 161 +
 yt_dlp/extractor/einthusan.py | 105 +
 yt_dlp/extractor/eitb.py | 79 +
 yt_dlp/extractor/elementorembed.py | 72 +
 yt_dlp/extractor/elonet.py | 64 +
 yt_dlp/extractor/elpais.py | 92 +
 yt_dlp/extractor/eltrecetv.py | 62 +
 yt_dlp/extractor/embedly.py | 109 +
 yt_dlp/extractor/epicon.py | 115 +
 yt_dlp/extractor/epidemicsound.py | 107 +
 yt_dlp/extractor/eplus.py | 183 +
 yt_dlp/extractor/epoch.py | 55 +
 yt_dlp/extractor/eporner.py | 137 +
 yt_dlp/extractor/erocast.py | 63 +
 yt_dlp/extractor/eroprofile.py | 122 +
 yt_dlp/extractor/err.py | 224 +
 yt_dlp/extractor/ertgr.py | 302 +
 yt_dlp/extractor/espn.py | 421 ++
 yt_dlp/extractor/ettutv.py | 60 +
 yt_dlp/extractor/europa.py | 174 +
 yt_dlp/extractor/europeantour.py | 34 +
 yt_dlp/extractor/eurosport.py | 123 +
 yt_dlp/extractor/euscreen.py | 60 +
 yt_dlp/extractor/expressen.py | 96 +
 yt_dlp/extractor/extractors.py | 28 +
 yt_dlp/extractor/eyedotv.py | 61 +
 yt_dlp/extractor/facebook.py | 1060 ++++
 yt_dlp/extractor/fancode.py | 181 +
 yt_dlp/extractor/faz.py | 89 +
 yt_dlp/extractor/fc2.py | 280 +
 yt_dlp/extractor/fczenit.py | 51 +
 yt_dlp/extractor/fifa.py | 83 +
 yt_dlp/extractor/filmon.py | 171 +
 yt_dlp/extractor/filmweb.py | 38 +
 yt_dlp/extractor/firsttv.py | 152 +
 yt_dlp/extractor/fivetv.py | 85 +
 yt_dlp/extractor/flextv.py | 62 +
 yt_dlp/extractor/flickr.py | 114 +
 yt_dlp/extractor/floatplane.py | 333 ++
 yt_dlp/extractor/folketinget.py | 73 +
 yt_dlp/extractor/footyroom.py | 53 +
 yt_dlp/extractor/formula1.py | 24 +
 yt_dlp/extractor/fourtube.py | 306 +
 yt_dlp/extractor/fox.py | 177 +
 yt_dlp/extractor/fox9.py | 38 +
 yt_dlp/extractor/foxnews.py | 185 +
 yt_dlp/extractor/foxsports.py | 52 +
 yt_dlp/extractor/fptplay.py | 117 +
 yt_dlp/extractor/franceinter.py | 56 +
 yt_dlp/extractor/francetv.py | 423 ++
 yt_dlp/extractor/freesound.py | 77 +
 yt_dlp/extractor/freespeech.py | 29 +
 yt_dlp/extractor/freetv.py | 139 +
 yt_dlp/extractor/frontendmasters.py | 252 +
 yt_dlp/extractor/fujitv.py | 71 +
 yt_dlp/extractor/funimation.py | 349 ++
 yt_dlp/extractor/funk.py | 40 +
 yt_dlp/extractor/funker530.py | 80 +
 yt_dlp/extractor/fuyintv.py | 30 +
 yt_dlp/extractor/gab.py | 140 +
 yt_dlp/extractor/gaia.py | 122 +
 yt_dlp/extractor/gamejolt.py | 537 ++
 yt_dlp/extractor/gamespot.py | 75 +
 yt_dlp/extractor/gamestar.py | 60 +
 yt_dlp/extractor/gaskrank.py | 96 +
 yt_dlp/extractor/gazeta.py | 44 +
 yt_dlp/extractor/gdcvault.py | 214 +
 yt_dlp/extractor/gedidigital.py | 198 +
 yt_dlp/extractor/generic.py | 2849 +++++
 yt_dlp/extractor/genericembeds.py | 114 +
 yt_dlp/extractor/genius.py | 145 +
 yt_dlp/extractor/getcourseru.py | 178 +
 yt_dlp/extractor/gettr.py | 206 +
 yt_dlp/extractor/giantbomb.py | 85 +
 yt_dlp/extractor/gigya.py | 20 +
 yt_dlp/extractor/glide.py | 38 +
 yt_dlp/extractor/globalplayer.py | 254 +
 yt_dlp/extractor/globo.py | 246 +
 yt_dlp/extractor/glomex.py | 216 +
 yt_dlp/extractor/gmanetwork.py | 83 +
 yt_dlp/extractor/go.py | 333 ++
 yt_dlp/extractor/godtube.py | 55 +
 yt_dlp/extractor/gofile.py | 106 +
 yt_dlp/extractor/golem.py | 68 +
 yt_dlp/extractor/goodgame.py | 57 +
 yt_dlp/extractor/googledrive.py | 341 ++
 yt_dlp/extractor/googlepodcasts.py | 84 +
 yt_dlp/extractor/googlesearch.py | 38 +
 yt_dlp/extractor/goplay.py | 433 ++
 yt_dlp/extractor/gopro.py | 105 +
 yt_dlp/extractor/goshgay.py | 48 +
 yt_dlp/extractor/gotostage.py | 70 +
 yt_dlp/extractor/gputechconf.py | 32 +
 yt_dlp/extractor/gronkh.py | 120 +
 yt_dlp/extractor/groupon.py | 64 +
 yt_dlp/extractor/harpodeon.py | 70 +
 yt_dlp/extractor/hbo.py | 171 +
 yt_dlp/extractor/hearthisat.py | 96 +
 yt_dlp/extractor/heise.py | 207 +
 yt_dlp/extractor/hellporno.py | 72 +
 yt_dlp/extractor/hgtv.py | 37 +
 yt_dlp/extractor/hidive.py | 119 +
 yt_dlp/extractor/historicfilms.py | 45 +
 yt_dlp/extractor/hitrecord.py | 66 +
 yt_dlp/extractor/hketv.py | 187 +
 yt_dlp/extractor/hollywoodreporter.py | 72 +
 yt_dlp/extractor/holodex.py | 100 +
 yt_dlp/extractor/hotnewhiphop.py | 61 +
 yt_dlp/extractor/hotstar.py | 468 ++
 yt_dlp/extractor/hrefli.py | 15 +
 yt_dlp/extractor/hrfensehen.py | 90 +
 yt_dlp/extractor/hrti.py | 200 +
 yt_dlp/extractor/hse.py | 93 +
 yt_dlp/extractor/huajiao.py | 53 +
 yt_dlp/extractor/huffpost.py | 90 +
 yt_dlp/extractor/hungama.py | 201 +
 yt_dlp/extractor/huya.py | 134 +
 yt_dlp/extractor/hypem.py | 47 +
 yt_dlp/extractor/hypergryph.py | 32 +
 yt_dlp/extractor/hytale.py | 58 +
 yt_dlp/extractor/icareus.py | 179 +
 yt_dlp/extractor/ichinanalive.py | 160 +
 yt_dlp/extractor/idolplus.py | 115 +
 yt_dlp/extractor/ign.py | 399 ++
 yt_dlp/extractor/iheart.py | 94 +
 yt_dlp/extractor/ilpost.py | 69 +
 yt_dlp/extractor/iltalehti.py | 51 +
 yt_dlp/extractor/imdb.py | 144 +
 yt_dlp/extractor/imggaming.py | 126 +
 yt_dlp/extractor/imgur.py | 366 ++
 yt_dlp/extractor/ina.py | 84 +
 yt_dlp/extractor/inc.py | 57 +
 yt_dlp/extractor/indavideo.py | 115 +
 yt_dlp/extractor/infoq.py | 136 +
 yt_dlp/extractor/instagram.py | 735 +++
 yt_dlp/extractor/internazionale.py | 75 +
 yt_dlp/extractor/internetvideoarchive.py | 58 +
 yt_dlp/extractor/iprima.py | 280 +
 yt_dlp/extractor/iqiyi.py | 766 +++
 yt_dlp/extractor/islamchannel.py | 81 +
 yt_dlp/extractor/israelnationalnews.py | 50 +
 yt_dlp/extractor/itprotv.py | 139 +
 yt_dlp/extractor/itv.py | 266 +
 yt_dlp/extractor/ivi.py | 253 +
 yt_dlp/extractor/ivideon.py | 77 +
 yt_dlp/extractor/iwara.py | 298 +
 yt_dlp/extractor/ixigua.py | 83 +
 yt_dlp/extractor/izlesene.py | 113 +
 yt_dlp/extractor/jable.py | 103 +
 yt_dlp/extractor/jamendo.py | 210 +
 yt_dlp/extractor/japandiet.py | 274 +
 yt_dlp/extractor/jeuxvideo.py | 52 +
 yt_dlp/extractor/jiosaavn.py | 105 +
 yt_dlp/extractor/jixie.py | 47 +
 yt_dlp/extractor/joj.py | 108 +
 yt_dlp/extractor/joqrag.py | 112 +
 yt_dlp/extractor/jove.py | 76 +
 yt_dlp/extractor/jstream.py | 73 +
 yt_dlp/extractor/jtbc.py | 156 +
 yt_dlp/extractor/jwplatform.py | 90 +
 yt_dlp/extractor/kakao.py | 152 +
 yt_dlp/extractor/kaltura.py | 545 ++
 yt_dlp/extractor/kankanews.py | 49 +
 yt_dlp/extractor/karaoketv.py | 61 +
 yt_dlp/extractor/kelbyone.py | 81 +
 yt_dlp/extractor/khanacademy.py | 110 +
 yt_dlp/extractor/kick.py | 126 +
 yt_dlp/extractor/kicker.py | 55 +
 yt_dlp/extractor/kickstarter.py | 68 +
 yt_dlp/extractor/kinja.py | 199 +
 yt_dlp/extractor/kinopoisk.py | 63 +
 yt_dlp/extractor/kommunetv.py | 31 +
 yt_dlp/extractor/kompas.py | 26 +
 yt_dlp/extractor/koo.py | 114 +
 yt_dlp/extractor/krasview.py | 58 +
 yt_dlp/extractor/kth.py | 28 +
 yt_dlp/extractor/ku6.py | 30 +
 yt_dlp/extractor/kukululive.py | 140 +
 yt_dlp/extractor/kuwo.py | 352 ++
 yt_dlp/extractor/la7.py | 234 +
 yt_dlp/extractor/lastfm.py | 129 +
 yt_dlp/extractor/laxarxames.py | 73 +
 yt_dlp/extractor/lbry.py | 429 ++
 yt_dlp/extractor/lci.py | 28 +
 yt_dlp/extractor/lcp.py | 87 +
 yt_dlp/extractor/lecture2go.py | 67 +
 yt_dlp/extractor/lecturio.py | 235 +
 yt_dlp/extractor/leeco.py | 364 ++
 yt_dlp/extractor/lefigaro.py | 136 +
 yt_dlp/extractor/lego.py | 141 +
 yt_dlp/extractor/lemonde.py | 56 +
 yt_dlp/extractor/lenta.py | 51 +
 yt_dlp/extractor/libraryofcongress.py | 148 +
 yt_dlp/extractor/libsyn.py | 89 +
 yt_dlp/extractor/lifenews.py | 234 +
 yt_dlp/extractor/likee.py | 182 +
 yt_dlp/extractor/limelight.py | 358 ++
 yt_dlp/extractor/linkedin.py | 272 +
 yt_dlp/extractor/liputan6.py | 64 +
 yt_dlp/extractor/listennotes.py | 86 +
 yt_dlp/extractor/litv.py | 148 +
 yt_dlp/extractor/livejournal.py | 39 +
 yt_dlp/extractor/livestream.py | 388 ++
 yt_dlp/extractor/livestreamfails.py | 37 +
 yt_dlp/extractor/lnkgo.py | 163 +
 yt_dlp/extractor/lovehomeporn.py | 33 +
 yt_dlp/extractor/lrt.py | 108 +
 yt_dlp/extractor/lsm.py | 282 +
 yt_dlp/extractor/lumni.py | 23 +
 yt_dlp/extractor/lynda.py | 330 ++
 yt_dlp/extractor/maariv.py | 62 +
 yt_dlp/extractor/magellantv.py | 62 +
 yt_dlp/extractor/magentamusik.py | 62 +
 yt_dlp/extractor/mailru.py | 338 ++
 yt_dlp/extractor/mainstreaming.py | 210 +
 yt_dlp/extractor/mangomolo.py | 73 +
 yt_dlp/extractor/manoto.py | 133 +
 yt_dlp/extractor/manyvids.py | 162 +
 yt_dlp/extractor/maoritv.py | 28 +
 yt_dlp/extractor/markiza.py | 124 +
 yt_dlp/extractor/massengeschmacktv.py | 72 +
 yt_dlp/extractor/masters.py | 38 +
 yt_dlp/extractor/matchtv.py | 51 +
 yt_dlp/extractor/mbn.py | 89 +
 yt_dlp/extractor/mdr.py | 184 +
 yt_dlp/extractor/medaltv.py | 162 +
 yt_dlp/extractor/mediaite.py | 104 +
 yt_dlp/extractor/mediaklikk.py | 156 +
 yt_dlp/extractor/medialaan.py | 111 +
 yt_dlp/extractor/mediaset.py | 320 +
 yt_dlp/extractor/mediasite.py | 411 ++
 yt_dlp/extractor/mediastream.py | 226 +
 yt_dlp/extractor/mediaworksnz.py | 103 +
 yt_dlp/extractor/medici.py | 67 +
 yt_dlp/extractor/megaphone.py | 46 +
 yt_dlp/extractor/megatvcom.py | 164 +
 yt_dlp/extractor/meipai.py | 99 +
 yt_dlp/extractor/melonvod.py | 68 +
 yt_dlp/extractor/metacritic.py | 62 +
 yt_dlp/extractor/mgtv.py | 165 +
 yt_dlp/extractor/microsoftembed.py | 65 +
 yt_dlp/extractor/microsoftstream.py | 121 +
 yt_dlp/extractor/microsoftvirtualacademy.py | 189 +
 yt_dlp/extractor/mildom.py | 291 +
 yt_dlp/extractor/minds.py | 193 +
 yt_dlp/extractor/minoto.py | 45 +
 yt_dlp/extractor/mirrativ.py | 118 +
 yt_dlp/extractor/mirrorcouk.py | 98 +
 yt_dlp/extractor/mit.py | 130 +
 yt_dlp/extractor/mitele.py | 82 +
 yt_dlp/extractor/mixch.py | 81 +
 yt_dlp/extractor/mixcloud.py | 379 ++
 yt_dlp/extractor/mlb.py | 379 ++
 yt_dlp/extractor/mlssoccer.py | 114 +
 yt_dlp/extractor/mocha.py | 64 +
 yt_dlp/extractor/mojvideo.py | 52 +
 yt_dlp/extractor/monstercat.py | 77 +
 yt_dlp/extractor/motherless.py | 297 +
 yt_dlp/extractor/motorsport.py | 52 +
 yt_dlp/extractor/moviepilot.py | 97 +
 yt_dlp/extractor/moview.py | 43 +
 yt_dlp/extractor/moviezine.py | 38 +
 yt_dlp/extractor/movingimage.py | 50 +
 yt_dlp/extractor/msn.py | 168 +
 yt_dlp/extractor/mtv.py | 654 +++
 yt_dlp/extractor/muenchentv.py | 72 +
 yt_dlp/extractor/murrtube.py | 164 +
 yt_dlp/extractor/museai.py | 112 +
 yt_dlp/extractor/musescore.py | 64 +
 yt_dlp/extractor/musicdex.py | 172 +
 yt_dlp/extractor/mx3.py | 171 +
 yt_dlp/extractor/mxplayer.py | 241 +
 yt_dlp/extractor/myspace.py | 195 +
 yt_dlp/extractor/myspass.py | 92 +
 yt_dlp/extractor/myvideoge.py | 81 +
 yt_dlp/extractor/myvidster.py | 27 +
 yt_dlp/extractor/mzaalo.py | 95 +
 yt_dlp/extractor/n1.py | 163 +
 yt_dlp/extractor/nate.py | 120 +
 yt_dlp/extractor/nationalgeographic.py | 83 +
 yt_dlp/extractor/naver.py | 404 ++
 yt_dlp/extractor/nba.py | 419 ++
 yt_dlp/extractor/nbc.py | 851 +++
 yt_dlp/extractor/ndr.py | 471 ++
 yt_dlp/extractor/ndtv.py | 107 +
 yt_dlp/extractor/nebula.py | 468 ++
 yt_dlp/extractor/nekohacker.py | 213 +
 yt_dlp/extractor/nerdcubed.py | 38 +
 yt_dlp/extractor/neteasemusic.py | 615 ++
 yt_dlp/extractor/netverse.py | 281 +
 yt_dlp/extractor/netzkino.py | 85 +
 yt_dlp/extractor/newgrounds.py | 311 +
 yt_dlp/extractor/newspicks.py | 53 +
 yt_dlp/extractor/newsy.py | 47 +
 yt_dlp/extractor/nextmedia.py | 237 +
 yt_dlp/extractor/nexx.py | 525 ++
 yt_dlp/extractor/nfb.py | 300 +
 yt_dlp/extractor/nfhsnetwork.py | 141 +
 yt_dlp/extractor/nfl.py | 373 ++
 yt_dlp/extractor/nhk.py | 708 +++
 yt_dlp/extractor/nhl.py | 123 +
 yt_dlp/extractor/nick.py | 224 +
 yt_dlp/extractor/niconico.py | 1061 ++++
 yt_dlp/extractor/niconicochannelplus.py | 426 ++
 yt_dlp/extractor/ninaprotocol.py | 225 +
 yt_dlp/extractor/ninecninemedia.py | 130 +
 yt_dlp/extractor/ninegag.py | 148 +
 yt_dlp/extractor/ninenews.py | 72 +
 yt_dlp/extractor/ninenow.py | 122 +
 yt_dlp/extractor/nintendo.py | 131 +
 yt_dlp/extractor/nitter.py | 360 ++
 yt_dlp/extractor/nobelprize.py | 59 +
 yt_dlp/extractor/noice.py | 116 +
 yt_dlp/extractor/nonktube.py | 36 +
 yt_dlp/extractor/noodlemagazine.py | 80 +
 yt_dlp/extractor/noovo.py | 101 +
 yt_dlp/extractor/nosnl.py | 115 +
 yt_dlp/extractor/nova.py | 307 +
 yt_dlp/extractor/novaplay.py | 67 +
 yt_dlp/extractor/nowness.py | 142 +
 yt_dlp/extractor/noz.py | 83 +
 yt_dlp/extractor/npo.py | 612 ++
 yt_dlp/extractor/npr.py | 132 +
 yt_dlp/extractor/nrk.py | 875 +++
 yt_dlp/extractor/nrl.py | 27 +
 yt_dlp/extractor/ntvcojp.py | 55 +
 yt_dlp/extractor/ntvde.py | 83 +
 yt_dlp/extractor/ntvru.py | 142 +
 yt_dlp/extractor/nubilesporn.py | 99 +
 yt_dlp/extractor/nuevo.py | 36 +
 yt_dlp/extractor/nuum.py | 199 +
 yt_dlp/extractor/nuvid.py | 99 +
 yt_dlp/extractor/nytimes.py | 420 ++
 yt_dlp/extractor/nzherald.py | 123 +
 yt_dlp/extractor/nzonscreen.py | 93 +
 yt_dlp/extractor/nzz.py | 40 +
 yt_dlp/extractor/odkmedia.py | 105 +
 yt_dlp/extractor/odnoklassniki.py | 464 ++
 yt_dlp/extractor/oftv.py | 54 +
 yt_dlp/extractor/oktoberfesttv.py | 44 +
 yt_dlp/extractor/olympics.py | 65 +
 yt_dlp/extractor/on24.py | 87 +
 yt_dlp/extractor/once.py | 40 +
 yt_dlp/extractor/ondemandkorea.py | 169 +
 yt_dlp/extractor/onefootball.py | 51 +
 yt_dlp/extractor/onenewsnz.py | 111 +
 yt_dlp/extractor/oneplace.py | 43 +
 yt_dlp/extractor/onet.py | 259 +
 yt_dlp/extractor/onionstudios.py | 42 +
 yt_dlp/extractor/opencast.py | 183 +
 yt_dlp/extractor/openload.py | 243 +
 yt_dlp/extractor/openrec.py | 151 +
 yt_dlp/extractor/ora.py | 71 +
 yt_dlp/extractor/orf.py | 630 ++
 yt_dlp/extractor/outsidetv.py | 25 +
 yt_dlp/extractor/owncloud.py | 80 +
 yt_dlp/extractor/packtpub.py | 155 +
 yt_dlp/extractor/palcomp3.py | 143 +
 yt_dlp/extractor/panopto.py | 600 ++
 yt_dlp/extractor/paramountplus.py | 201 +
 yt_dlp/extractor/parler.py | 91 +
 yt_dlp/extractor/parlview.py | 64 +
 yt_dlp/extractor/patreon.py | 454 ++
 yt_dlp/extractor/pbs.py | 757 +++
 yt_dlp/extractor/pearvideo.py | 68 +
 yt_dlp/extractor/peekvids.py | 188 +
 yt_dlp/extractor/peertube.py | 1647 ++++++
 yt_dlp/extractor/peertv.py | 52 +
 yt_dlp/extractor/peloton.py | 215 +
 yt_dlp/extractor/performgroup.py | 77 +
 yt_dlp/extractor/periscope.py | 188 +
 yt_dlp/extractor/pgatour.py | 47 +
 yt_dlp/extractor/philharmoniedeparis.py | 97 +
 yt_dlp/extractor/phoenix.py | 130 +
 yt_dlp/extractor/photobucket.py | 43 +
 yt_dlp/extractor/piapro.py | 121 +
 yt_dlp/extractor/piaulizaportal.py | 70 +
 yt_dlp/extractor/picarto.py | 152 +
 yt_dlp/extractor/piksel.py | 174 +
 yt_dlp/extractor/pinkbike.py | 93 +
 yt_dlp/extractor/pinterest.py | 248 +
 yt_dlp/extractor/pixivsketch.py | 118 +
 yt_dlp/extractor/pladform.py | 135 +
 yt_dlp/extractor/planetmarathi.py | 71 +
 yt_dlp/extractor/platzi.py | 213 +
 yt_dlp/extractor/playplustv.py | 100 +
 yt_dlp/extractor/playsuisse.py | 234 +
 yt_dlp/extractor/playtvak.py | 185 +
 yt_dlp/extractor/playwire.py | 72 +
 yt_dlp/extractor/pluralsight.py | 491 ++
 yt_dlp/extractor/plutotv.py | 195 +
 yt_dlp/extractor/podbayfm.py | 75 +
 yt_dlp/extractor/podchaser.py | 97 +
 yt_dlp/extractor/podomatic.py | 74 +
 yt_dlp/extractor/pokemon.py | 136 +
 yt_dlp/extractor/pokergo.py | 106 +
 yt_dlp/extractor/polsatgo.py | 86 +
 yt_dlp/extractor/polskieradio.py | 610 ++
 yt_dlp/extractor/popcorntimes.py | 91 +
 yt_dlp/extractor/popcorntv.py | 72 +
 yt_dlp/extractor/porn91.py | 95 +
 yt_dlp/extractor/pornbox.py | 113 +
 yt_dlp/extractor/pornflip.py | 77 +
 yt_dlp/extractor/pornhub.py | 825 +++
 yt_dlp/extractor/pornotube.py | 83 +
 yt_dlp/extractor/pornovoisines.py | 103 +
 yt_dlp/extractor/pornoxo.py | 55 +
 yt_dlp/extractor/pr0gramm.py | 201 +
 yt_dlp/extractor/prankcast.py | 137 +
 yt_dlp/extractor/premiershiprugby.py | 39 +
 yt_dlp/extractor/presstv.py | 69 +
 yt_dlp/extractor/projectveritas.py | 52 +
 yt_dlp/extractor/prosiebensat1.py | 496 ++
 yt_dlp/extractor/prx.py | 428 ++
 yt_dlp/extractor/puhutv.py | 233 +
 yt_dlp/extractor/puls4.py | 51 +
 yt_dlp/extractor/pyvideo.py | 70 +
 yt_dlp/extractor/qdance.py | 171 +
 yt_dlp/extractor/qingting.py | 47 +
 yt_dlp/extractor/qqmusic.py | 365 ++
 yt_dlp/extractor/r7.py | 112 +
 yt_dlp/extractor/radiko.py | 261 +
 yt_dlp/extractor/radiocanada.py | 165 +
 yt_dlp/extractor/radiocomercial.py | 154 +
 yt_dlp/extractor/radiode.py | 50 +
 yt_dlp/extractor/radiofrance.py | 473 ++
 yt_dlp/extractor/radiojavan.py | 81 +
 yt_dlp/extractor/radiokapital.py | 97 +
 yt_dlp/extractor/radiozet.py | 50 +
 yt_dlp/extractor/radlive.py | 180 +
 yt_dlp/extractor/rai.py | 816 +++
 yt_dlp/extractor/raywenderlich.py | 177 +
 yt_dlp/extractor/rbgtum.py | 142 +
 yt_dlp/extractor/rcs.py | 372 ++
 yt_dlp/extractor/rcti.py | 373 ++
 yt_dlp/extractor/rds.py | 68 +
 yt_dlp/extractor/redbee.py | 380 ++
 yt_dlp/extractor/redbulltv.py | 224 +
 yt_dlp/extractor/reddit.py | 353 ++
 yt_dlp/extractor/redge.py | 135 +
 yt_dlp/extractor/redgifs.py | 260 +
 yt_dlp/extractor/redtube.py | 144 +
 yt_dlp/extractor/rentv.py | 104 +
 yt_dlp/extractor/restudy.py | 41 +
 yt_dlp/extractor/reuters.py | 66 +
 yt_dlp/extractor/reverbnation.py | 51 +
 yt_dlp/extractor/rheinmaintv.py | 94 +
 yt_dlp/extractor/ridehome.py | 96 +
 yt_dlp/extractor/rinsefm.py | 89 +
 yt_dlp/extractor/rmcdecouverte.py | 71 +
 yt_dlp/extractor/rockstargames.py | 65 +
 yt_dlp/extractor/rokfin.py | 455 ++
 yt_dlp/extractor/roosterteeth.py | 352 ++
 yt_dlp/extractor/rottentomatoes.py | 80 +
 yt_dlp/extractor/rozhlas.py | 363 ++
 yt_dlp/extractor/rte.py | 162 +
 yt_dlp/extractor/rtl2.py | 95 +
 yt_dlp/extractor/rtlnl.py | 294 +
 yt_dlp/extractor/rtnews.py | 196 +
 yt_dlp/extractor/rtp.py | 97 +
 yt_dlp/extractor/rtrfm.py | 65 +
 yt_dlp/extractor/rts.py | 232 +
 yt_dlp/extractor/rtvcplay.py | 285 +
 yt_dlp/extractor/rtve.py | 344 ++
 yt_dlp/extractor/rtvs.py | 85 +
 yt_dlp/extractor/rtvslo.py | 166 +
 yt_dlp/extractor/rudovideo.py | 135 +
 yt_dlp/extractor/rule34video.py | 123 +
 yt_dlp/extractor/rumble.py | 390 ++
 yt_dlp/extractor/rutube.py | 365 ++
 yt_dlp/extractor/rutv.py | 203 +
 yt_dlp/extractor/ruutu.py | 262 +
 yt_dlp/extractor/ruv.py | 186 +
 yt_dlp/extractor/s4c.py | 103 +
 yt_dlp/extractor/safari.py | 259 +
 yt_dlp/extractor/saitosan.py | 75 +
 yt_dlp/extractor/samplefocus.py | 97 +
 yt_dlp/extractor/sapo.py | 114 +
 yt_dlp/extractor/sbs.py | 156 +
 yt_dlp/extractor/sbscokr.py | 200 +
 yt_dlp/extractor/screen9.py | 62 +
 yt_dlp/extractor/screencast.py | 117 +
 yt_dlp/extractor/screencastify.py | 70 +
 yt_dlp/extractor/screencastomatic.py | 72 +
 yt_dlp/extractor/scrippsnetworks.py | 155 +
 yt_dlp/extractor/scrolller.py | 102 +
 yt_dlp/extractor/scte.py | 137 +
 yt_dlp/extractor/sejmpl.py | 218 +
 yt_dlp/extractor/senalcolombia.py | 32 +
 yt_dlp/extractor/senategov.py | 200 +
 yt_dlp/extractor/sendtonews.py | 105 +
 yt_dlp/extractor/servus.py | 135 +
 yt_dlp/extractor/sevenplus.py | 132 +
 yt_dlp/extractor/sexu.py | 61 +
 yt_dlp/extractor/seznamzpravy.py | 157 +
 yt_dlp/extractor/shahid.py | 217 +
 yt_dlp/extractor/sharevideos.py | 6 +
 yt_dlp/extractor/shemaroome.py | 102 +
 yt_dlp/extractor/showroomlive.py | 80 +
 yt_dlp/extractor/sibnet.py | 17 +
 yt_dlp/extractor/simplecast.py | 151 +
 yt_dlp/extractor/sina.py | 109 +
 yt_dlp/extractor/sixplay.py | 122 +
 yt_dlp/extractor/skeb.py | 140 +
 yt_dlp/extractor/sky.py | 135 +
 yt_dlp/extractor/skyit.py | 227 +
 yt_dlp/extractor/skylinewebcams.py | 40 +
 yt_dlp/extractor/skynewsarabia.py | 116 +
 yt_dlp/extractor/skynewsau.py | 43 +
 yt_dlp/extractor/slideshare.py | 53 +
 yt_dlp/extractor/slideslive.py | 554 ++
 yt_dlp/extractor/slutload.py | 63 +
 yt_dlp/extractor/smotrim.py | 65 +
 yt_dlp/extractor/snotr.py | 68 +
 yt_dlp/extractor/sohu.py | 293 +
 yt_dlp/extractor/sonyliv.py | 220 +
 yt_dlp/extractor/soundcloud.py | 948 +++
 yt_dlp/extractor/soundgasm.py | 74 +
 yt_dlp/extractor/southpark.py | 188 +
 yt_dlp/extractor/sovietscloset.py | 207 +
 yt_dlp/extractor/spankbang.py | 195 +
 yt_dlp/extractor/spiegel.py | 51 +
 yt_dlp/extractor/spike.py | 46 +
 yt_dlp/extractor/sport5.py | 86 +
 yt_dlp/extractor/sportbox.py | 88 +
 yt_dlp/extractor/sportdeutschland.py | 142 +
 yt_dlp/extractor/spotify.py | 167 +
 yt_dlp/extractor/spreaker.py | 173 +
 yt_dlp/extractor/springboardplatform.py | 113 +
 yt_dlp/extractor/sprout.py | 61 +
 yt_dlp/extractor/srgssr.py | 247 +
 yt_dlp/extractor/srmediathek.py | 57 +
 yt_dlp/extractor/stacommu.py | 231 +
 yt_dlp/extractor/stageplus.py | 515 ++
 yt_dlp/extractor/stanfordoc.py | 89 +
 yt_dlp/extractor/startrek.py | 76 +
 yt_dlp/extractor/startv.py | 100 +
 yt_dlp/extractor/steam.py | 170 +
 yt_dlp/extractor/stitcher.py | 142 +
 yt_dlp/extractor/storyfire.py | 133 +
 yt_dlp/extractor/streamable.py | 103 +
 yt_dlp/extractor/streamcz.py | 122 +
 yt_dlp/extractor/streetvoice.py | 97 +
 yt_dlp/extractor/stretchinternet.py | 35 +
 yt_dlp/extractor/stripchat.py | 66 +
 yt_dlp/extractor/stv.py | 89 +
 yt_dlp/extractor/substack.py | 108 +
 yt_dlp/extractor/sunporno.py | 75 +
 yt_dlp/extractor/sverigesradio.py | 149 +
 yt_dlp/extractor/svt.py | 489 ++
 yt_dlp/extractor/swearnet.py | 79 +
 yt_dlp/extractor/syfy.py | 58 +
 yt_dlp/extractor/syvdk.py | 33 +
 yt_dlp/extractor/sztvhu.py | 38 +
 yt_dlp/extractor/tagesschau.py | 164 +
 yt_dlp/extractor/tass.py | 59 +
 yt_dlp/extractor/tbs.py | 89 +
 yt_dlp/extractor/tbsjp.py | 152 +
 yt_dlp/extractor/teachable.py | 296 +
 yt_dlp/extractor/teachertube.py | 126 +
 yt_dlp/extractor/teachingchannel.py | 32 +
 yt_dlp/extractor/teamcoco.py | 280 +
 yt_dlp/extractor/teamtreehouse.py | 134 +
 yt_dlp/extractor/ted.py | 236 +
 yt_dlp/extractor/tele13.py | 84 +
 yt_dlp/extractor/tele5.py | 89 +
 yt_dlp/extractor/telebruxelles.py | 72 +
 yt_dlp/extractor/telecaribe.py | 91 +
 yt_dlp/extractor/telecinco.py | 146 +
 yt_dlp/extractor/telegraaf.py | 86 +
 yt_dlp/extractor/telegram.py | 136 +
 yt_dlp/extractor/telemb.py | 75 +
 yt_dlp/extractor/telemundo.py | 50 +
 yt_dlp/extractor/telequebec.py | 237 +
 yt_dlp/extractor/teletask.py | 52 +
 yt_dlp/extractor/telewebion.py | 133 +
 yt_dlp/extractor/tempo.py | 114 +
 yt_dlp/extractor/tencent.py | 490 +
 yt_dlp/extractor/tennistv.py | 155 +
 yt_dlp/extractor/tenplay.py | 170 +
 yt_dlp/extractor/testurl.py | 50 +
 yt_dlp/extractor/tf1.py | 101 +
 yt_dlp/extractor/tfo.py | 48 +
 yt_dlp/extractor/theguardian.py | 135 +
 yt_dlp/extractor/theholetv.py | 35 +
 yt_dlp/extractor/theintercept.py | 46 +
 yt_dlp/extractor/theplatform.py | 429 ++
 yt_dlp/extractor/thestar.py | 33 +
 yt_dlp/extractor/thesun.py | 43 +
 yt_dlp/extractor/theweatherchannel.py | 99 +
 yt_dlp/extractor/thisamericanlife.py | 38 +
 yt_dlp/extractor/thisoldhouse.py | 104 +
 yt_dlp/extractor/thisvid.py | 226 +
 yt_dlp/extractor/threeqsdn.py | 156 +
 yt_dlp/extractor/threespeak.py | 93 +
 yt_dlp/extractor/tiktok.py | 1317 +++++
 yt_dlp/extractor/tmz.py | 193 +
 yt_dlp/extractor/tnaflix.py | 336 ++
 yt_dlp/extractor/toggle.py | 228 +
 yt_dlp/extractor/toggo.py | 82 +
 yt_dlp/extractor/tonline.py | 53 +
 yt_dlp/extractor/toongoggles.py | 76 +
 yt_dlp/extractor/toutv.py | 87 +
 yt_dlp/extractor/toypics.py | 89 +
 yt_dlp/extractor/traileraddict.py | 61 +
 yt_dlp/extractor/triller.py | 329 ++
 yt_dlp/extractor/trovo.py | 342 ++
 yt_dlp/extractor/trtcocuk.py | 48 +
 yt_dlp/extractor/trtworld.py | 101 +
 yt_dlp/extractor/trueid.py | 136 +
 yt_dlp/extractor/trunews.py | 32 +
 yt_dlp/extractor/truth.py | 68 +
 yt_dlp/extractor/trutv.py | 70 +
 yt_dlp/extractor/tube8.py | 170 +
 yt_dlp/extractor/tubetugraz.py | 252 +
 yt_dlp/extractor/tubitv.py | 168 +
 yt_dlp/extractor/tumblr.py | 387 ++
 yt_dlp/extractor/tunein.py | 234 +
 yt_dlp/extractor/turner.py | 256 +
 yt_dlp/extractor/tv2.py | 324 ++
 yt_dlp/extractor/tv24ua.py | 78 +
 yt_dlp/extractor/tv2dk.py | 172 +
 yt_dlp/extractor/tv2hu.py | 104 +
 yt_dlp/extractor/tv4.py | 149 +
 yt_dlp/extractor/tv5mondeplus.py | 190 +
 yt_dlp/extractor/tv5unis.py | 116 +
 yt_dlp/extractor/tva.py | 85 +
 yt_dlp/extractor/tvanouvelles.py | 62 +
 yt_dlp/extractor/tvc.py | 97 +
 yt_dlp/extractor/tver.py | 103 +
 yt_dlp/extractor/tvigle.py | 133 +
 yt_dlp/extractor/tviplayer.py | 78 +
 yt_dlp/extractor/tvland.py | 37 +
 yt_dlp/extractor/tvn24.py | 100 +
 yt_dlp/extractor/tvnoe.py | 46 +
 yt_dlp/extractor/tvopengr.py | 116 +
 yt_dlp/extractor/tvp.py | 642 ++
 yt_dlp/extractor/tvplay.py | 306 +
 yt_dlp/extractor/tvplayer.py | 80 +
 yt_dlp/extractor/tweakers.py | 59 +
 yt_dlp/extractor/twentymin.py | 80 +
 yt_dlp/extractor/twentythreevideo.py | 76 +
 yt_dlp/extractor/twitcasting.py | 306 +
 yt_dlp/extractor/twitch.py | 1211 ++++
 yt_dlp/extractor/twitter.py | 1875 ++++++
 yt_dlp/extractor/txxx.py | 438 ++
 yt_dlp/extractor/udemy.py | 474 ++
 yt_dlp/extractor/udn.py | 98 +
 yt_dlp/extractor/ufctv.py | 13 +
 yt_dlp/extractor/ukcolumn.py | 71 +
 yt_dlp/extractor/uktvplay.py | 36 +
 yt_dlp/extractor/umg.py | 98 +
 yt_dlp/extractor/unistra.py | 64 +
 yt_dlp/extractor/unity.py | 31 +
 yt_dlp/extractor/unsupported.py | 189 +
 yt_dlp/extractor/uol.py | 138 +
 yt_dlp/extractor/uplynk.py | 88 +
 yt_dlp/extractor/urort.py | 60 +
 yt_dlp/extractor/urplay.py | 164 +
 yt_dlp/extractor/usanetwork.py | 21 +
 yt_dlp/extractor/usatoday.py | 60 +
 yt_dlp/extractor/ustream.py | 275 +
 yt_dlp/extractor/ustudio.py | 119 +
 yt_dlp/extractor/utreon.py | 98 +
 yt_dlp/extractor/varzesh3.py | 73 +
 yt_dlp/extractor/vbox7.py | 97 +
 yt_dlp/extractor/veo.py | 76 +
 yt_dlp/extractor/veoh.py | 188 +
 yt_dlp/extractor/vesti.py | 119 +
 yt_dlp/extractor/vevo.py | 353 ++
 yt_dlp/extractor/vgtv.py | 311 +
 yt_dlp/extractor/vh1.py | 33 +
 yt_dlp/extractor/vice.py | 313 +
 yt_dlp/extractor/viddler.py | 135 +
 yt_dlp/extractor/videa.py | 188 +
 yt_dlp/extractor/videocampus_sachsen.py | 253 +
 yt_dlp/extractor/videodetective.py | 27 +
 yt_dlp/extractor/videofyme.py | 51 +
 yt_dlp/extractor/videoken.py | 337 ++
 yt_dlp/extractor/videomore.py | 307 +
 yt_dlp/extractor/videopress.py | 89 +
 yt_dlp/extractor/vidio.py | 309 +
 yt_dlp/extractor/vidlii.py | 154 +
 yt_dlp/extractor/vidly.py | 83 +
 yt_dlp/extractor/viewlift.py | 362 ++
 yt_dlp/extractor/viidea.py | 199 +
 yt_dlp/extractor/viki.py | 346 ++
 yt_dlp/extractor/vimeo.py | 1455 +++++
 yt_dlp/extractor/vimm.py | 66 +
 yt_dlp/extractor/vine.py | 151 +
 yt_dlp/extractor/viously.py | 60 +
 yt_dlp/extractor/viqeo.py | 87 +
 yt_dlp/extractor/viu.py | 542 ++
 yt_dlp/extractor/vk.py | 842 +++
 yt_dlp/extractor/vocaroo.py | 63 +
 yt_dlp/extractor/vodpl.py | 29 +
 yt_dlp/extractor/vodplatform.py | 37 +
 yt_dlp/extractor/voicy.py | 146 +
 yt_dlp/extractor/volejtv.py | 40 +
 yt_dlp/extractor/voot.py | 212 +
 yt_dlp/extractor/voxmedia.py | 215 +
 yt_dlp/extractor/vrt.py | 427 ++
 yt_dlp/extractor/vtm.py | 60 +
 yt_dlp/extractor/vuclip.py | 68 +
 yt_dlp/extractor/vvvvid.py | 336 ++
 yt_dlp/extractor/walla.py | 82 +
 yt_dlp/extractor/washingtonpost.py | 123 +
 yt_dlp/extractor/wat.py | 119 +
 yt_dlp/extractor/wdr.py | 384 ++
 yt_dlp/extractor/webcamerapl.py | 44 +
 yt_dlp/extractor/webcaster.py | 92 +
 yt_dlp/extractor/webofstories.py | 155 +
 yt_dlp/extractor/weibo.py | 251 +
 yt_dlp/extractor/weiqitv.py | 50 +
 yt_dlp/extractor/weverse.py | 608 ++
 yt_dlp/extractor/wevidi.py | 108 +
 yt_dlp/extractor/weyyak.py | 86 +
 yt_dlp/extractor/whowatch.py | 96 +
 yt_dlp/extractor/whyp.py | 50 +
 yt_dlp/extractor/wikimedia.py | 55 +
 yt_dlp/extractor/wimbledon.py | 61 +
 yt_dlp/extractor/wimtv.py | 150 +
 yt_dlp/extractor/wistia.py | 394 ++
 yt_dlp/extractor/wordpress.py | 154 +
 yt_dlp/extractor/worldstarhiphop.py | 38 +
 yt_dlp/extractor/wppilot.py | 173 +
 yt_dlp/extractor/wrestleuniverse.py | 304 +
 yt_dlp/extractor/wsj.py | 120 +
 yt_dlp/extractor/wwe.py | 138 +
 yt_dlp/extractor/wykop.py | 268 +
 yt_dlp/extractor/xanimu.py | 51 +
 yt_dlp/extractor/xboxclips.py | 62 +
 yt_dlp/extractor/xfileshare.py | 198 +
 yt_dlp/extractor/xhamster.py | 465 ++
 yt_dlp/extractor/ximalaya.py | 167 +
 yt_dlp/extractor/xinpianchang.py | 92 +
 yt_dlp/extractor/xminus.py | 77 +
 yt_dlp/extractor/xnxx.py | 83 +
 yt_dlp/extractor/xstream.py | 115 +
 yt_dlp/extractor/xvideos.py | 180 +
 yt_dlp/extractor/xxxymovies.py | 77 +
 yt_dlp/extractor/yahoo.py | 430 ++
 yt_dlp/extractor/yandexdisk.py | 142 +
 yt_dlp/extractor/yandexmusic.py | 454 ++
 yt_dlp/extractor/yandexvideo.py | 390 ++
 yt_dlp/extractor/yapfiles.py | 90 +
 yt_dlp/extractor/yappy.py | 128 +
 yt_dlp/extractor/yle_areena.py | 134 +
 yt_dlp/extractor/youjizz.py | 90 +
 yt_dlp/extractor/youku.py | 290 +
 yt_dlp/extractor/younow.py | 201 +
 yt_dlp/extractor/youporn.py | 198 +
 yt_dlp/extractor/yourporn.py | 65 +
 yt_dlp/extractor/yourupload.py | 43 +
 yt_dlp/extractor/youtube.py | 7387 ++++++++++++++++++++++++
 yt_dlp/extractor/zaiko.py | 139 +
 yt_dlp/extractor/zapiks.py | 106 +
 yt_dlp/extractor/zattoo.py | 865 +++
 yt_dlp/extractor/zdf.py | 442 ++
 yt_dlp/extractor/zee5.py | 270 +
 yt_dlp/extractor/zeenews.py | 59 +
 yt_dlp/extractor/zenporn.py | 118 +
 yt_dlp/extractor/zetland.py | 71 +
 yt_dlp/extractor/zhihu.py | 65 +
 yt_dlp/extractor/zingmp3.py | 628 ++
 yt_dlp/extractor/zoom.py | 164 +
 yt_dlp/extractor/zype.py | 135 +
 yt_dlp/jsinterp.py | 853 +++
 yt_dlp/minicurses.py | 182 +
 yt_dlp/networking/__init__.py | 30 +
 yt_dlp/networking/_helper.py | 283 +
 yt_dlp/networking/_requests.py | 408 ++
 yt_dlp/networking/_urllib.py | 422 ++
 yt_dlp/networking/_websockets.py | 173 +
 yt_dlp/networking/common.py | 565 ++
 yt_dlp/networking/exceptions.py | 103 +
 yt_dlp/networking/websocket.py | 23 +
 yt_dlp/options.py | 1920 ++++++
 yt_dlp/plugins.py | 176 +
 yt_dlp/postprocessor/__init__.py | 47 +
 yt_dlp/postprocessor/common.py | 215 +
 yt_dlp/postprocessor/embedthumbnail.py | 227 +
 yt_dlp/postprocessor/exec.py | 41 +
 yt_dlp/postprocessor/ffmpeg.py | 1192 ++++
 yt_dlp/postprocessor/metadataparser.py | 125 +
 yt_dlp/postprocessor/modify_chapters.py | 336 ++
 yt_dlp/postprocessor/movefilesafterdownload.py | 53 +
 yt_dlp/postprocessor/sponskrub.py | 98 +
 yt_dlp/postprocessor/sponsorblock.py | 104 +
 yt_dlp/postprocessor/xattrpp.py | 63 +
 yt_dlp/socks.py | 274 +
 yt_dlp/update.py | 619 ++
 yt_dlp/utils/__init__.py | 10 +
 yt_dlp/utils/_deprecated.py | 39 +
 yt_dlp/utils/_legacy.py | 315 +
 yt_dlp/utils/_utils.py | 5445 +++++
 yt_dlp/utils/networking.py | 164 +
 yt_dlp/utils/progress.py | 109 +
 yt_dlp/utils/traversal.py | 276 +
 yt_dlp/version.py | 15 +
 yt_dlp/webvtt.py | 399 ++
 1063 files changed, 214885 insertions(+)
 create mode 100644 yt_dlp/YoutubeDL.py
 create mode 100644 yt_dlp/__init__.py
 create mode 100644 yt_dlp/__main__.py
 create mode 100644 yt_dlp/__pyinstaller/__init__.py
 create mode 100644 yt_dlp/__pyinstaller/hook-yt_dlp.py
 create mode 100644 yt_dlp/aes.py
 create mode 100644 yt_dlp/cache.py
 create mode 100644 yt_dlp/compat/__init__.py
 create mode 100644 yt_dlp/compat/_deprecated.py
 create mode 100644 yt_dlp/compat/_legacy.py
 create mode 100644 yt_dlp/compat/compat_utils.py
 create mode 100644 yt_dlp/compat/functools.py
 create mode 100644 yt_dlp/compat/imghdr.py
 create mode 100644 yt_dlp/compat/shutil.py
 create mode 100644 yt_dlp/compat/types.py
 create mode 100644 yt_dlp/compat/urllib/__init__.py
 create mode 100644 yt_dlp/compat/urllib/request.py
 create mode 100644 yt_dlp/cookies.py
 create mode 100644 yt_dlp/dependencies/Cryptodome.py
 create mode 100644 yt_dlp/dependencies/__init__.py
 create mode 100644 yt_dlp/downloader/__init__.py
 create mode 100644 yt_dlp/downloader/common.py
 create mode 100644 yt_dlp/downloader/dash.py
 create mode 100644 yt_dlp/downloader/external.py
 create mode 100644 yt_dlp/downloader/f4m.py
 create mode 100644 yt_dlp/downloader/fc2.py
 create mode 100644 yt_dlp/downloader/fragment.py
 create mode 100644 yt_dlp/downloader/hls.py
 create mode 100644 yt_dlp/downloader/http.py
 create mode 100644 yt_dlp/downloader/ism.py
 create mode 100644 yt_dlp/downloader/mhtml.py
 create mode 100644 yt_dlp/downloader/niconico.py
 create mode 100644 yt_dlp/downloader/rtmp.py
 create mode 100644 yt_dlp/downloader/rtsp.py
 create mode 100644 yt_dlp/downloader/websocket.py
 create mode 100644 yt_dlp/downloader/youtube_live_chat.py
 create mode 100644 yt_dlp/extractor/__init__.py
 create mode 100644 yt_dlp/extractor/_extractors.py
 create mode 100644 yt_dlp/extractor/abc.py
 create mode 100644 yt_dlp/extractor/abcnews.py
 create mode 100644 yt_dlp/extractor/abcotvs.py
 create mode 100644 yt_dlp/extractor/abematv.py
 create mode 100644 yt_dlp/extractor/academicearth.py
 create mode 100644 yt_dlp/extractor/acast.py
 create mode 100644 yt_dlp/extractor/acfun.py
 create mode 100644 yt_dlp/extractor/adn.py
 create mode 100644 yt_dlp/extractor/adobeconnect.py
 create mode 100644 yt_dlp/extractor/adobepass.py
 create mode 100644 yt_dlp/extractor/adobetv.py
 create mode 100644 yt_dlp/extractor/adultswim.py
 create mode 100644 yt_dlp/extractor/aenetworks.py
 create mode 100644 yt_dlp/extractor/aeonco.py
 create mode 100644 yt_dlp/extractor/afreecatv.py
 create mode 100644 yt_dlp/extractor/agora.py
 create mode 100644 yt_dlp/extractor/airtv.py
 create mode 100644 yt_dlp/extractor/aitube.py
 create mode 100644 yt_dlp/extractor/aliexpress.py
 create mode 100644 yt_dlp/extractor/aljazeera.py
 create mode 100644 yt_dlp/extractor/allocine.py
 create mode 100644 yt_dlp/extractor/allstar.py
 create mode 100644 yt_dlp/extractor/alphaporno.py
 create mode 100644 yt_dlp/extractor/alsace20tv.py
 create mode 100644 yt_dlp/extractor/altcensored.py
 create mode 100644 yt_dlp/extractor/alura.py
 create mode 100644 yt_dlp/extractor/amadeustv.py
 create mode 100644 yt_dlp/extractor/amara.py
 create mode 100644 yt_dlp/extractor/amazon.py
 create mode 100644 yt_dlp/extractor/amazonminitv.py
 create mode 100644 yt_dlp/extractor/amcnetworks.py
 create mode 100644 yt_dlp/extractor/americastestkitchen.py
 create mode 100644 yt_dlp/extractor/amp.py
 create mode 100644 yt_dlp/extractor/anchorfm.py
 create mode 100644 yt_dlp/extractor/angel.py
 create mode 100644 yt_dlp/extractor/antenna.py
 create mode 100644 yt_dlp/extractor/anvato.py
 create mode 100644 yt_dlp/extractor/aol.py
 create mode 100644 yt_dlp/extractor/apa.py
 create mode 100644 yt_dlp/extractor/aparat.py
 create mode 100644 yt_dlp/extractor/appleconnect.py
 create mode 100644 yt_dlp/extractor/applepodcasts.py
 create mode 100644 yt_dlp/extractor/appletrailers.py
 create mode 100644 yt_dlp/extractor/archiveorg.py
 create mode 100644 yt_dlp/extractor/arcpublishing.py
 create mode 100644 yt_dlp/extractor/ard.py
 create mode 100644 yt_dlp/extractor/arkena.py
 create mode 100644 yt_dlp/extractor/arnes.py
 create mode 100644 yt_dlp/extractor/art19.py
 create mode 100644 yt_dlp/extractor/arte.py
 create mode 100644 yt_dlp/extractor/asobichannel.py
 create mode 100644 yt_dlp/extractor/atresplayer.py
 create mode 100644 yt_dlp/extractor/atscaleconf.py
 create mode 100644 yt_dlp/extractor/atvat.py
 create mode 100644 yt_dlp/extractor/audimedia.py
 create mode 100644 yt_dlp/extractor/audioboom.py
 create mode 100644 yt_dlp/extractor/audiodraft.py
 create mode 100644 yt_dlp/extractor/audiomack.py
 create mode 100644 yt_dlp/extractor/audius.py
 create mode 100644 yt_dlp/extractor/awaan.py
 create mode 100644 yt_dlp/extractor/aws.py
 create mode 100644 yt_dlp/extractor/axs.py
 create mode 100644 yt_dlp/extractor/azmedien.py
 create mode 100644 yt_dlp/extractor/baidu.py
 create mode 100644 yt_dlp/extractor/banbye.py
 create mode 100644 yt_dlp/extractor/bandaichannel.py
 create mode 100644 yt_dlp/extractor/bandcamp.py
 create mode 100644 yt_dlp/extractor/bannedvideo.py
 create mode 100644 yt_dlp/extractor/bbc.py
 create mode 100644 yt_dlp/extractor/beatbump.py
 create mode 100644 yt_dlp/extractor/beatport.py
 create mode 100644 yt_dlp/extractor/beeg.py
 create mode 100644 yt_dlp/extractor/behindkink.py
 create mode 100644 yt_dlp/extractor/bellmedia.py
 create mode 100644 yt_dlp/extractor/berufetv.py
 create mode 100644 yt_dlp/extractor/bet.py
 create mode 100644 yt_dlp/extractor/bfi.py
 create mode 100644 yt_dlp/extractor/bfmtv.py
 create mode 100644 yt_dlp/extractor/bibeltv.py
 create mode 100644 yt_dlp/extractor/bigflix.py
 create mode 100644 yt_dlp/extractor/bigo.py
 create mode 100644 yt_dlp/extractor/bild.py
 create mode 100644 yt_dlp/extractor/bilibili.py
 create mode 100644 yt_dlp/extractor/biobiochiletv.py
 create mode 100644 yt_dlp/extractor/bitchute.py
 create mode 100644 yt_dlp/extractor/blackboardcollaborate.py
 create mode 100644 yt_dlp/extractor/bleacherreport.py
 create mode 100644 yt_dlp/extractor/blerp.py
 create mode 100644 yt_dlp/extractor/blogger.py
 create mode 100644 yt_dlp/extractor/bloomberg.py
 create mode 100644 yt_dlp/extractor/bokecc.py
 create mode 100644 yt_dlp/extractor/bongacams.py
 create mode 100644 yt_dlp/extractor/boosty.py
 create mode 100644 yt_dlp/extractor/bostonglobe.py
 create mode 100644 yt_dlp/extractor/box.py
 create mode 100644 yt_dlp/extractor/boxcast.py
 create mode 100644 yt_dlp/extractor/bpb.py
 create mode 100644 yt_dlp/extractor/br.py
 create mode 100644 yt_dlp/extractor/brainpop.py
 create mode 100644 yt_dlp/extractor/bravotv.py
 create mode 100644 yt_dlp/extractor/breitbart.py
 create mode 100644 yt_dlp/extractor/brightcove.py
 create mode 100644 yt_dlp/extractor/brilliantpala.py
 create mode 100644 yt_dlp/extractor/bundesliga.py
 create mode 100644 yt_dlp/extractor/bundestag.py
 create mode 100644 yt_dlp/extractor/businessinsider.py
 create mode 100644 yt_dlp/extractor/buzzfeed.py
 create mode 100644 yt_dlp/extractor/byutv.py
 create mode 100644 yt_dlp/extractor/c56.py
 create mode 100644 yt_dlp/extractor/cableav.py
 create mode 100644 yt_dlp/extractor/callin.py
 create mode 100644 yt_dlp/extractor/caltrans.py
 create mode 100644 yt_dlp/extractor/cam4.py
 create mode 100644 yt_dlp/extractor/camdemy.py
 create mode 100644 yt_dlp/extractor/camfm.py
 create mode 100644 yt_dlp/extractor/cammodels.py
 create mode 100644 yt_dlp/extractor/camsoda.py
 create mode 100644 yt_dlp/extractor/camtasia.py
 create mode 100644 yt_dlp/extractor/canal1.py
 create mode 100644 yt_dlp/extractor/canalalpha.py
 create mode 100644 yt_dlp/extractor/canalc2.py
 create mode 100644 yt_dlp/extractor/canalplus.py
 create mode 100644 yt_dlp/extractor/caracoltv.py
 create mode 100644 yt_dlp/extractor/cartoonnetwork.py
 create mode 100644 yt_dlp/extractor/cbc.py
 create mode 100644 yt_dlp/extractor/cbs.py
 create mode 100644 yt_dlp/extractor/cbsnews.py
 create mode 100644 yt_dlp/extractor/cbssports.py
 create mode 100644 yt_dlp/extractor/ccc.py
 create mode 100644 yt_dlp/extractor/ccma.py
 create mode 100644 yt_dlp/extractor/cctv.py
 create mode 100644 yt_dlp/extractor/cda.py
 create mode 100644 yt_dlp/extractor/cellebrite.py
 create mode 100644 yt_dlp/extractor/ceskatelevize.py
 create mode 100644 yt_dlp/extractor/cgtn.py
 create mode 100644 yt_dlp/extractor/charlierose.py
 create mode 100644 yt_dlp/extractor/chaturbate.py
 create mode 100644 yt_dlp/extractor/chilloutzone.py
 create mode 100644 yt_dlp/extractor/chzzk.py
 create mode 100644 yt_dlp/extractor/cinemax.py
 create mode 100644 yt_dlp/extractor/cinetecamilano.py
 create mode 100644 yt_dlp/extractor/cineverse.py
 create mode 100644 yt_dlp/extractor/ciscolive.py
 create mode 100644 yt_dlp/extractor/ciscowebex.py
 create mode 100644 yt_dlp/extractor/cjsw.py
 create mode 100644 yt_dlp/extractor/clipchamp.py
 create mode 100644 yt_dlp/extractor/clippit.py
 create mode 100644 yt_dlp/extractor/cliprs.py
 create mode 100644 yt_dlp/extractor/closertotruth.py
 create mode 100644 yt_dlp/extractor/cloudflarestream.py
 create mode 100644 yt_dlp/extractor/cloudycdn.py
 create mode 100644 yt_dlp/extractor/clubic.py
 create mode 100644 yt_dlp/extractor/clyp.py
 create mode 100644 yt_dlp/extractor/cmt.py
 create mode 100644 yt_dlp/extractor/cnbc.py
 create mode 100644 yt_dlp/extractor/cnn.py
 create mode 100644 yt_dlp/extractor/comedycentral.py
 create mode 100644 yt_dlp/extractor/common.py
 create mode 100644 yt_dlp/extractor/commonmistakes.py
 create mode 100644 yt_dlp/extractor/commonprotocols.py
 create mode 100644 yt_dlp/extractor/condenast.py
 create mode 100644 yt_dlp/extractor/contv.py
 create mode 100644 yt_dlp/extractor/corus.py
 create mode 100644 yt_dlp/extractor/coub.py
 create mode 100644 yt_dlp/extractor/cozytv.py
 create mode 100644 yt_dlp/extractor/cpac.py
 create mode 100644 yt_dlp/extractor/cracked.py
 create mode 100644 yt_dlp/extractor/crackle.py
 create mode 100644 yt_dlp/extractor/craftsy.py
 create mode 100644 yt_dlp/extractor/crooksandliars.py
 create mode 100644 yt_dlp/extractor/crowdbunker.py
 create mode 100644 yt_dlp/extractor/crtvg.py
 create mode 100644 yt_dlp/extractor/crunchyroll.py
 create mode 100644 yt_dlp/extractor/cspan.py
 create mode 100644 yt_dlp/extractor/ctsnews.py
 create mode 100644 yt_dlp/extractor/ctv.py
 create mode 100644 yt_dlp/extractor/ctvnews.py
 create mode 100644 yt_dlp/extractor/cultureunplugged.py
 create mode 100644 yt_dlp/extractor/curiositystream.py
 create mode 100644 yt_dlp/extractor/cwtv.py
 create mode 100644 yt_dlp/extractor/cybrary.py
 create mode 100644 yt_dlp/extractor/dacast.py
 create mode 100644 yt_dlp/extractor/dailymail.py
 create mode 100644 yt_dlp/extractor/dailymotion.py
 create mode 100644 yt_dlp/extractor/dailywire.py
 create mode 100644 yt_dlp/extractor/damtomo.py
 create mode 100644 yt_dlp/extractor/daum.py
 create mode 100644 yt_dlp/extractor/daystar.py
 create mode 100644 yt_dlp/extractor/dbtv.py
 create mode 100644 yt_dlp/extractor/dctp.py
 create mode 100644 yt_dlp/extractor/deezer.py
 create mode 100644 yt_dlp/extractor/democracynow.py
 create mode 100644 yt_dlp/extractor/detik.py
 create mode 100644 yt_dlp/extractor/deuxm.py
 create mode 100644 yt_dlp/extractor/dfb.py
 create mode 100644 yt_dlp/extractor/dhm.py
 create mode 100644 yt_dlp/extractor/digitalconcerthall.py
 create mode 100644 yt_dlp/extractor/digiteka.py
 create mode 100644 yt_dlp/extractor/discogs.py
 create mode 100644 yt_dlp/extractor/discovery.py
 create mode 100644 yt_dlp/extractor/discoverygo.py
 create mode 100644 yt_dlp/extractor/disney.py
 create mode 100644 yt_dlp/extractor/dispeak.py
 create mode 100644 yt_dlp/extractor/dlf.py
 create mode 100644 yt_dlp/extractor/dlive.py
 create mode 100644 yt_dlp/extractor/douyutv.py
 create mode 100644 yt_dlp/extractor/dplay.py
 create mode 100644 yt_dlp/extractor/drbonanza.py
 create mode 100644 yt_dlp/extractor/dreisat.py
 create mode 100644 yt_dlp/extractor/drooble.py
 create mode 100644 yt_dlp/extractor/dropbox.py
 create mode 100644 yt_dlp/extractor/dropout.py
 create mode 100644 yt_dlp/extractor/drtuber.py
 create mode 100644 yt_dlp/extractor/drtv.py
 create mode 100644 yt_dlp/extractor/dtube.py
 create mode 100644 yt_dlp/extractor/duboku.py
 create mode 100644 yt_dlp/extractor/dumpert.py
 create mode 100644 yt_dlp/extractor/duoplay.py
 create mode 100644 yt_dlp/extractor/dvtv.py
 create mode 100644 yt_dlp/extractor/dw.py
 create mode 100644 yt_dlp/extractor/eagleplatform.py
 create mode 100644 yt_dlp/extractor/ebaumsworld.py
 create mode 100644 yt_dlp/extractor/ebay.py
 create mode 100644 yt_dlp/extractor/egghead.py
 create mode 100644 yt_dlp/extractor/eighttracks.py
 create mode 100644 yt_dlp/extractor/einthusan.py
 create mode 100644 yt_dlp/extractor/eitb.py
 create mode 100644 yt_dlp/extractor/elementorembed.py
 create mode 100644 yt_dlp/extractor/elonet.py
 create mode 100644 yt_dlp/extractor/elpais.py
 create mode 100644 yt_dlp/extractor/eltrecetv.py
 create mode 100644 yt_dlp/extractor/embedly.py
 create mode 100644 yt_dlp/extractor/epicon.py
 create mode 100644 yt_dlp/extractor/epidemicsound.py
 create mode 100644 yt_dlp/extractor/eplus.py
 create mode 100644 yt_dlp/extractor/epoch.py
 create mode 100644 yt_dlp/extractor/eporner.py
 create mode 100644 yt_dlp/extractor/erocast.py
 create mode 100644 yt_dlp/extractor/eroprofile.py
 create mode 100644 yt_dlp/extractor/err.py
 create mode 100644 yt_dlp/extractor/ertgr.py
 create mode 100644 yt_dlp/extractor/espn.py
 create mode 100644 yt_dlp/extractor/ettutv.py
 create mode 100644 yt_dlp/extractor/europa.py
 create mode 100644 yt_dlp/extractor/europeantour.py
 create mode 100644 yt_dlp/extractor/eurosport.py
 create mode 100644 yt_dlp/extractor/euscreen.py
 create mode 100644 yt_dlp/extractor/expressen.py
 create mode 100644 yt_dlp/extractor/extractors.py
 create mode 100644 yt_dlp/extractor/eyedotv.py
 create mode 100644 yt_dlp/extractor/facebook.py
 create mode 100644 yt_dlp/extractor/fancode.py
 create mode 100644 yt_dlp/extractor/faz.py
 create mode 100644 yt_dlp/extractor/fc2.py
 create mode 100644 yt_dlp/extractor/fczenit.py
 create mode 100644 yt_dlp/extractor/fifa.py
 create mode 100644 yt_dlp/extractor/filmon.py
 create mode 100644 yt_dlp/extractor/filmweb.py
 create mode 100644 yt_dlp/extractor/firsttv.py
 create mode 100644 yt_dlp/extractor/fivetv.py
 create mode 100644 yt_dlp/extractor/flextv.py
 create mode 100644 yt_dlp/extractor/flickr.py
 create mode 100644 yt_dlp/extractor/floatplane.py
 create mode 100644 yt_dlp/extractor/folketinget.py
 create mode 100644 yt_dlp/extractor/footyroom.py
 create mode 100644 yt_dlp/extractor/formula1.py
 create mode 100644 yt_dlp/extractor/fourtube.py
 create mode 100644 yt_dlp/extractor/fox.py
 create mode 100644 yt_dlp/extractor/fox9.py
 create mode 100644 yt_dlp/extractor/foxnews.py
 create mode 100644 yt_dlp/extractor/foxsports.py
 create mode 100644 yt_dlp/extractor/fptplay.py
 create mode 100644 yt_dlp/extractor/franceinter.py
 create mode 100644 yt_dlp/extractor/francetv.py
 create mode 100644 yt_dlp/extractor/freesound.py
 create mode 100644 yt_dlp/extractor/freespeech.py
 create mode 100644 yt_dlp/extractor/freetv.py
 create mode 100644 yt_dlp/extractor/frontendmasters.py
 create mode 100644 yt_dlp/extractor/fujitv.py
 create mode 100644 yt_dlp/extractor/funimation.py
 create mode 100644 yt_dlp/extractor/funk.py
 create mode 100644 yt_dlp/extractor/funker530.py
 create mode 100644 yt_dlp/extractor/fuyintv.py
 create mode 100644 yt_dlp/extractor/gab.py
 create mode 100644 yt_dlp/extractor/gaia.py
 create mode 100644 yt_dlp/extractor/gamejolt.py
 create mode 100644 yt_dlp/extractor/gamespot.py
 create mode 100644 yt_dlp/extractor/gamestar.py
 create mode 100644 yt_dlp/extractor/gaskrank.py
 create mode 100644 yt_dlp/extractor/gazeta.py
 create mode 100644 yt_dlp/extractor/gdcvault.py
 create mode 100644 yt_dlp/extractor/gedidigital.py
 create mode 100644 yt_dlp/extractor/generic.py
 create mode 100644 yt_dlp/extractor/genericembeds.py
 create mode 100644 yt_dlp/extractor/genius.py
 create mode 100644 yt_dlp/extractor/getcourseru.py
 create mode 100644 yt_dlp/extractor/gettr.py
 create mode 100644 yt_dlp/extractor/giantbomb.py
 create mode 100644 yt_dlp/extractor/gigya.py
 create mode 100644 yt_dlp/extractor/glide.py
 create mode 100644 yt_dlp/extractor/globalplayer.py
 create mode 100644 yt_dlp/extractor/globo.py
 create mode 100644 yt_dlp/extractor/glomex.py
 create mode 100644 yt_dlp/extractor/gmanetwork.py
 create mode 100644 yt_dlp/extractor/go.py
 create mode 100644 yt_dlp/extractor/godtube.py
 create mode 100644 yt_dlp/extractor/gofile.py
 create mode 100644 yt_dlp/extractor/golem.py
 create mode 100644 yt_dlp/extractor/goodgame.py
 create mode 100644 yt_dlp/extractor/googledrive.py
 create mode 100644 yt_dlp/extractor/googlepodcasts.py
 create mode 100644 yt_dlp/extractor/googlesearch.py
 create mode 100644 yt_dlp/extractor/goplay.py
 create mode 100644 yt_dlp/extractor/gopro.py
 create mode 100644 yt_dlp/extractor/goshgay.py
 create mode 100644 yt_dlp/extractor/gotostage.py
 create mode 100644 yt_dlp/extractor/gputechconf.py
 create mode 100644 yt_dlp/extractor/gronkh.py
 create mode 100644 yt_dlp/extractor/groupon.py
 create mode 100644 yt_dlp/extractor/harpodeon.py
 create mode 100644 yt_dlp/extractor/hbo.py
 create mode 100644 yt_dlp/extractor/hearthisat.py
 create mode 100644 yt_dlp/extractor/heise.py
 create mode 100644 yt_dlp/extractor/hellporno.py
 create mode 100644 yt_dlp/extractor/hgtv.py
 create mode 100644 yt_dlp/extractor/hidive.py
 create mode 100644 yt_dlp/extractor/historicfilms.py
 create mode 100644 yt_dlp/extractor/hitrecord.py
 create mode 100644 yt_dlp/extractor/hketv.py
 create mode 100644 yt_dlp/extractor/hollywoodreporter.py
 create mode 100644 yt_dlp/extractor/holodex.py
 create mode 100644 yt_dlp/extractor/hotnewhiphop.py
 create mode 100644 yt_dlp/extractor/hotstar.py
 create mode 100644 yt_dlp/extractor/hrefli.py
 create mode 100644 yt_dlp/extractor/hrfensehen.py
 create mode 100644 yt_dlp/extractor/hrti.py
 create mode 100644 yt_dlp/extractor/hse.py
 create mode 100644 yt_dlp/extractor/huajiao.py
 create mode 100644 yt_dlp/extractor/huffpost.py
 create mode 100644 yt_dlp/extractor/hungama.py
 create mode 100644 yt_dlp/extractor/huya.py
 create mode 100644 yt_dlp/extractor/hypem.py
 create mode 100644 yt_dlp/extractor/hypergryph.py
 create mode 100644 yt_dlp/extractor/hytale.py
 create mode 100644 yt_dlp/extractor/icareus.py
 create mode 100644 yt_dlp/extractor/ichinanalive.py
 create mode 100644 yt_dlp/extractor/idolplus.py
 create mode 100644 yt_dlp/extractor/ign.py
 create mode 100644 yt_dlp/extractor/iheart.py
 create mode 100644 yt_dlp/extractor/ilpost.py
 create mode 100644 yt_dlp/extractor/iltalehti.py
 create mode 100644 yt_dlp/extractor/imdb.py
 create mode 100644 yt_dlp/extractor/imggaming.py
 create mode 100644 yt_dlp/extractor/imgur.py
 create mode 100644 yt_dlp/extractor/ina.py
 create mode 100644 yt_dlp/extractor/inc.py
 create mode 100644 yt_dlp/extractor/indavideo.py
 create mode 100644 yt_dlp/extractor/infoq.py
 create mode 100644 yt_dlp/extractor/instagram.py
 create mode 100644 yt_dlp/extractor/internazionale.py
 create mode 100644 yt_dlp/extractor/internetvideoarchive.py
 create mode 100644 yt_dlp/extractor/iprima.py
 create mode 100644 yt_dlp/extractor/iqiyi.py
 create mode 100644 yt_dlp/extractor/islamchannel.py
 create mode 100644 yt_dlp/extractor/israelnationalnews.py
 create mode 100644 yt_dlp/extractor/itprotv.py
 create mode 100644 yt_dlp/extractor/itv.py
 create mode 100644 yt_dlp/extractor/ivi.py
 create mode 100644 yt_dlp/extractor/ivideon.py
 create mode 100644 yt_dlp/extractor/iwara.py
 create mode 100644 yt_dlp/extractor/ixigua.py
 create mode 100644 yt_dlp/extractor/izlesene.py
 create mode 100644 yt_dlp/extractor/jable.py
 create mode 100644 yt_dlp/extractor/jamendo.py
 create mode 100644 yt_dlp/extractor/japandiet.py
 create mode 100644 yt_dlp/extractor/jeuxvideo.py
 create mode 100644 yt_dlp/extractor/jiosaavn.py
 create mode 100644 yt_dlp/extractor/jixie.py
 create mode 100644 yt_dlp/extractor/joj.py
 create mode 100644 yt_dlp/extractor/joqrag.py
 create mode 100644 yt_dlp/extractor/jove.py
 create mode 100644 yt_dlp/extractor/jstream.py
 create mode 100644 yt_dlp/extractor/jtbc.py
 create mode 100644 yt_dlp/extractor/jwplatform.py
 create mode 100644 yt_dlp/extractor/kakao.py
 create mode 100644 yt_dlp/extractor/kaltura.py
 create mode 100644 yt_dlp/extractor/kankanews.py
 create mode 100644 yt_dlp/extractor/karaoketv.py
 create mode 100644 yt_dlp/extractor/kelbyone.py
 create mode 100644 yt_dlp/extractor/khanacademy.py
 create mode 100644 yt_dlp/extractor/kick.py
 create mode 100644 yt_dlp/extractor/kicker.py
 create mode 100644 yt_dlp/extractor/kickstarter.py
 create mode 100644 yt_dlp/extractor/kinja.py
 create mode 100644 yt_dlp/extractor/kinopoisk.py
 create mode 100644 yt_dlp/extractor/kommunetv.py
 create mode 100644 yt_dlp/extractor/kompas.py
 create mode 100644 yt_dlp/extractor/koo.py
 create mode 100644 yt_dlp/extractor/krasview.py
 create mode 100644 yt_dlp/extractor/kth.py
 create mode 100644 yt_dlp/extractor/ku6.py
 create mode 100644 yt_dlp/extractor/kukululive.py
 create mode 100644 yt_dlp/extractor/kuwo.py
 create mode 100644 yt_dlp/extractor/la7.py
 create mode 100644 yt_dlp/extractor/lastfm.py
 create mode 100644 yt_dlp/extractor/laxarxames.py
 create mode 100644 yt_dlp/extractor/lbry.py
 create mode 100644 yt_dlp/extractor/lci.py
 create mode 100644 yt_dlp/extractor/lcp.py
 create mode 100644 yt_dlp/extractor/lecture2go.py
 create mode 100644 yt_dlp/extractor/lecturio.py
 create mode 100644 yt_dlp/extractor/leeco.py
 create mode 100644 yt_dlp/extractor/lefigaro.py
 create mode 100644 yt_dlp/extractor/lego.py
 create mode 100644 yt_dlp/extractor/lemonde.py
 create mode 100644 yt_dlp/extractor/lenta.py
 create mode 100644 yt_dlp/extractor/libraryofcongress.py
 create mode 100644 yt_dlp/extractor/libsyn.py
 create mode 100644 yt_dlp/extractor/lifenews.py
 create mode 100644 yt_dlp/extractor/likee.py
 create mode 100644 yt_dlp/extractor/limelight.py
 create mode 100644 yt_dlp/extractor/linkedin.py
 create mode 100644 yt_dlp/extractor/liputan6.py
 create mode 100644 yt_dlp/extractor/listennotes.py
 create mode 100644 yt_dlp/extractor/litv.py
 create mode 100644 yt_dlp/extractor/livejournal.py
 create mode 100644 yt_dlp/extractor/livestream.py
 create mode 100644 yt_dlp/extractor/livestreamfails.py
 create mode 100644 yt_dlp/extractor/lnkgo.py
 create mode 100644 yt_dlp/extractor/lovehomeporn.py
 create mode 100644 yt_dlp/extractor/lrt.py
 create mode 100644 yt_dlp/extractor/lsm.py
 create mode 100644 yt_dlp/extractor/lumni.py
 create mode 100644 yt_dlp/extractor/lynda.py
 create mode 100644 yt_dlp/extractor/maariv.py
 create mode 100644 yt_dlp/extractor/magellantv.py
 create mode 100644 yt_dlp/extractor/magentamusik.py
 create mode 100644 yt_dlp/extractor/mailru.py
 create mode 100644 yt_dlp/extractor/mainstreaming.py
 create mode 100644 yt_dlp/extractor/mangomolo.py
 create mode 100644 yt_dlp/extractor/manoto.py
 create mode 100644 yt_dlp/extractor/manyvids.py
 create mode 100644 yt_dlp/extractor/maoritv.py
 create mode 100644 yt_dlp/extractor/markiza.py
 create mode 100644 yt_dlp/extractor/massengeschmacktv.py
 create mode 100644 yt_dlp/extractor/masters.py
 create mode 100644 yt_dlp/extractor/matchtv.py
 create mode 100644 yt_dlp/extractor/mbn.py
 create mode 100644 yt_dlp/extractor/mdr.py
 create mode 100644 yt_dlp/extractor/medaltv.py
 create mode 100644 yt_dlp/extractor/mediaite.py
 create mode 100644 yt_dlp/extractor/mediaklikk.py
 create mode 100644 yt_dlp/extractor/medialaan.py
 create mode 100644 yt_dlp/extractor/mediaset.py
 create mode 100644 yt_dlp/extractor/mediasite.py
 create mode 100644 yt_dlp/extractor/mediastream.py
 create mode 100644 yt_dlp/extractor/mediaworksnz.py
 create mode 100644 yt_dlp/extractor/medici.py
 create mode 100644 yt_dlp/extractor/megaphone.py
 create mode 100644 yt_dlp/extractor/megatvcom.py
 create mode 100644 yt_dlp/extractor/meipai.py
 create mode 100644 yt_dlp/extractor/melonvod.py
 create mode 100644 yt_dlp/extractor/metacritic.py
 create mode 100644 yt_dlp/extractor/mgtv.py
 create mode 100644 yt_dlp/extractor/microsoftembed.py
 create mode 100644 yt_dlp/extractor/microsoftstream.py
 create mode 100644 yt_dlp/extractor/microsoftvirtualacademy.py
 create mode 100644 yt_dlp/extractor/mildom.py
 create mode 100644 yt_dlp/extractor/minds.py
 create mode 100644 yt_dlp/extractor/minoto.py
 create mode 100644 yt_dlp/extractor/mirrativ.py
 create mode 100644 yt_dlp/extractor/mirrorcouk.py
 create mode 100644 yt_dlp/extractor/mit.py
 create mode 100644 yt_dlp/extractor/mitele.py
 create mode 100644 yt_dlp/extractor/mixch.py
 create mode 100644 yt_dlp/extractor/mixcloud.py
 create mode 100644 yt_dlp/extractor/mlb.py
 create mode 100644 yt_dlp/extractor/mlssoccer.py
 create mode 100644 yt_dlp/extractor/mocha.py
 create mode 100644 yt_dlp/extractor/mojvideo.py
 create mode 100644 yt_dlp/extractor/monstercat.py
 create mode 100644 yt_dlp/extractor/motherless.py
 create mode 100644 yt_dlp/extractor/motorsport.py
 create mode 100644 yt_dlp/extractor/moviepilot.py
 create mode 100644 yt_dlp/extractor/moview.py
 create mode 100644 yt_dlp/extractor/moviezine.py
 create mode 100644 yt_dlp/extractor/movingimage.py
 create mode 100644 yt_dlp/extractor/msn.py
 create mode 100644 yt_dlp/extractor/mtv.py
create mode 100644 yt_dlp/extractor/muenchentv.py create mode 100644 yt_dlp/extractor/murrtube.py create mode 100644 yt_dlp/extractor/museai.py create mode 100644 yt_dlp/extractor/musescore.py create mode 100644 yt_dlp/extractor/musicdex.py create mode 100644 yt_dlp/extractor/mx3.py create mode 100644 yt_dlp/extractor/mxplayer.py create mode 100644 yt_dlp/extractor/myspace.py create mode 100644 yt_dlp/extractor/myspass.py create mode 100644 yt_dlp/extractor/myvideoge.py create mode 100644 yt_dlp/extractor/myvidster.py create mode 100644 yt_dlp/extractor/mzaalo.py create mode 100644 yt_dlp/extractor/n1.py create mode 100644 yt_dlp/extractor/nate.py create mode 100644 yt_dlp/extractor/nationalgeographic.py create mode 100644 yt_dlp/extractor/naver.py create mode 100644 yt_dlp/extractor/nba.py create mode 100644 yt_dlp/extractor/nbc.py create mode 100644 yt_dlp/extractor/ndr.py create mode 100644 yt_dlp/extractor/ndtv.py create mode 100644 yt_dlp/extractor/nebula.py create mode 100644 yt_dlp/extractor/nekohacker.py create mode 100644 yt_dlp/extractor/nerdcubed.py create mode 100644 yt_dlp/extractor/neteasemusic.py create mode 100644 yt_dlp/extractor/netverse.py create mode 100644 yt_dlp/extractor/netzkino.py create mode 100644 yt_dlp/extractor/newgrounds.py create mode 100644 yt_dlp/extractor/newspicks.py create mode 100644 yt_dlp/extractor/newsy.py create mode 100644 yt_dlp/extractor/nextmedia.py create mode 100644 yt_dlp/extractor/nexx.py create mode 100644 yt_dlp/extractor/nfb.py create mode 100644 yt_dlp/extractor/nfhsnetwork.py create mode 100644 yt_dlp/extractor/nfl.py create mode 100644 yt_dlp/extractor/nhk.py create mode 100644 yt_dlp/extractor/nhl.py create mode 100644 yt_dlp/extractor/nick.py create mode 100644 yt_dlp/extractor/niconico.py create mode 100644 yt_dlp/extractor/niconicochannelplus.py create mode 100644 yt_dlp/extractor/ninaprotocol.py create mode 100644 yt_dlp/extractor/ninecninemedia.py create mode 100644 yt_dlp/extractor/ninegag.py create mode 100644 yt_dlp/extractor/ninenews.py create mode 100644 yt_dlp/extractor/ninenow.py create mode 100644 yt_dlp/extractor/nintendo.py create mode 100644 yt_dlp/extractor/nitter.py create mode 100644 yt_dlp/extractor/nobelprize.py create mode 100644 yt_dlp/extractor/noice.py create mode 100644 yt_dlp/extractor/nonktube.py create mode 100644 yt_dlp/extractor/noodlemagazine.py create mode 100644 yt_dlp/extractor/noovo.py create mode 100644 yt_dlp/extractor/nosnl.py create mode 100644 yt_dlp/extractor/nova.py create mode 100644 yt_dlp/extractor/novaplay.py create mode 100644 yt_dlp/extractor/nowness.py create mode 100644 yt_dlp/extractor/noz.py create mode 100644 yt_dlp/extractor/npo.py create mode 100644 yt_dlp/extractor/npr.py create mode 100644 yt_dlp/extractor/nrk.py create mode 100644 yt_dlp/extractor/nrl.py create mode 100644 yt_dlp/extractor/ntvcojp.py create mode 100644 yt_dlp/extractor/ntvde.py create mode 100644 yt_dlp/extractor/ntvru.py create mode 100644 yt_dlp/extractor/nubilesporn.py create mode 100644 yt_dlp/extractor/nuevo.py create mode 100644 yt_dlp/extractor/nuum.py create mode 100644 yt_dlp/extractor/nuvid.py create mode 100644 yt_dlp/extractor/nytimes.py create mode 100644 yt_dlp/extractor/nzherald.py create mode 100644 yt_dlp/extractor/nzonscreen.py create mode 100644 yt_dlp/extractor/nzz.py create mode 100644 yt_dlp/extractor/odkmedia.py create mode 100644 yt_dlp/extractor/odnoklassniki.py create mode 100644 yt_dlp/extractor/oftv.py create mode 100644 yt_dlp/extractor/oktoberfesttv.py create mode 100644 
yt_dlp/extractor/olympics.py create mode 100644 yt_dlp/extractor/on24.py create mode 100644 yt_dlp/extractor/once.py create mode 100644 yt_dlp/extractor/ondemandkorea.py create mode 100644 yt_dlp/extractor/onefootball.py create mode 100644 yt_dlp/extractor/onenewsnz.py create mode 100644 yt_dlp/extractor/oneplace.py create mode 100644 yt_dlp/extractor/onet.py create mode 100644 yt_dlp/extractor/onionstudios.py create mode 100644 yt_dlp/extractor/opencast.py create mode 100644 yt_dlp/extractor/openload.py create mode 100644 yt_dlp/extractor/openrec.py create mode 100644 yt_dlp/extractor/ora.py create mode 100644 yt_dlp/extractor/orf.py create mode 100644 yt_dlp/extractor/outsidetv.py create mode 100644 yt_dlp/extractor/owncloud.py create mode 100644 yt_dlp/extractor/packtpub.py create mode 100644 yt_dlp/extractor/palcomp3.py create mode 100644 yt_dlp/extractor/panopto.py create mode 100644 yt_dlp/extractor/paramountplus.py create mode 100644 yt_dlp/extractor/parler.py create mode 100644 yt_dlp/extractor/parlview.py create mode 100644 yt_dlp/extractor/patreon.py create mode 100644 yt_dlp/extractor/pbs.py create mode 100644 yt_dlp/extractor/pearvideo.py create mode 100644 yt_dlp/extractor/peekvids.py create mode 100644 yt_dlp/extractor/peertube.py create mode 100644 yt_dlp/extractor/peertv.py create mode 100644 yt_dlp/extractor/peloton.py create mode 100644 yt_dlp/extractor/performgroup.py create mode 100644 yt_dlp/extractor/periscope.py create mode 100644 yt_dlp/extractor/pgatour.py create mode 100644 yt_dlp/extractor/philharmoniedeparis.py create mode 100644 yt_dlp/extractor/phoenix.py create mode 100644 yt_dlp/extractor/photobucket.py create mode 100644 yt_dlp/extractor/piapro.py create mode 100644 yt_dlp/extractor/piaulizaportal.py create mode 100644 yt_dlp/extractor/picarto.py create mode 100644 yt_dlp/extractor/piksel.py create mode 100644 yt_dlp/extractor/pinkbike.py create mode 100644 yt_dlp/extractor/pinterest.py create mode 100644 yt_dlp/extractor/pixivsketch.py create mode 100644 yt_dlp/extractor/pladform.py create mode 100644 yt_dlp/extractor/planetmarathi.py create mode 100644 yt_dlp/extractor/platzi.py create mode 100644 yt_dlp/extractor/playplustv.py create mode 100644 yt_dlp/extractor/playsuisse.py create mode 100644 yt_dlp/extractor/playtvak.py create mode 100644 yt_dlp/extractor/playwire.py create mode 100644 yt_dlp/extractor/pluralsight.py create mode 100644 yt_dlp/extractor/plutotv.py create mode 100644 yt_dlp/extractor/podbayfm.py create mode 100644 yt_dlp/extractor/podchaser.py create mode 100644 yt_dlp/extractor/podomatic.py create mode 100644 yt_dlp/extractor/pokemon.py create mode 100644 yt_dlp/extractor/pokergo.py create mode 100644 yt_dlp/extractor/polsatgo.py create mode 100644 yt_dlp/extractor/polskieradio.py create mode 100644 yt_dlp/extractor/popcorntimes.py create mode 100644 yt_dlp/extractor/popcorntv.py create mode 100644 yt_dlp/extractor/porn91.py create mode 100644 yt_dlp/extractor/pornbox.py create mode 100644 yt_dlp/extractor/pornflip.py create mode 100644 yt_dlp/extractor/pornhub.py create mode 100644 yt_dlp/extractor/pornotube.py create mode 100644 yt_dlp/extractor/pornovoisines.py create mode 100644 yt_dlp/extractor/pornoxo.py create mode 100644 yt_dlp/extractor/pr0gramm.py create mode 100644 yt_dlp/extractor/prankcast.py create mode 100644 yt_dlp/extractor/premiershiprugby.py create mode 100644 yt_dlp/extractor/presstv.py create mode 100644 yt_dlp/extractor/projectveritas.py create mode 100644 yt_dlp/extractor/prosiebensat1.py create mode 100644 
yt_dlp/extractor/prx.py create mode 100644 yt_dlp/extractor/puhutv.py create mode 100644 yt_dlp/extractor/puls4.py create mode 100644 yt_dlp/extractor/pyvideo.py create mode 100644 yt_dlp/extractor/qdance.py create mode 100644 yt_dlp/extractor/qingting.py create mode 100644 yt_dlp/extractor/qqmusic.py create mode 100644 yt_dlp/extractor/r7.py create mode 100644 yt_dlp/extractor/radiko.py create mode 100644 yt_dlp/extractor/radiocanada.py create mode 100644 yt_dlp/extractor/radiocomercial.py create mode 100644 yt_dlp/extractor/radiode.py create mode 100644 yt_dlp/extractor/radiofrance.py create mode 100644 yt_dlp/extractor/radiojavan.py create mode 100644 yt_dlp/extractor/radiokapital.py create mode 100644 yt_dlp/extractor/radiozet.py create mode 100644 yt_dlp/extractor/radlive.py create mode 100644 yt_dlp/extractor/rai.py create mode 100644 yt_dlp/extractor/raywenderlich.py create mode 100644 yt_dlp/extractor/rbgtum.py create mode 100644 yt_dlp/extractor/rcs.py create mode 100644 yt_dlp/extractor/rcti.py create mode 100644 yt_dlp/extractor/rds.py create mode 100644 yt_dlp/extractor/redbee.py create mode 100644 yt_dlp/extractor/redbulltv.py create mode 100644 yt_dlp/extractor/reddit.py create mode 100644 yt_dlp/extractor/redge.py create mode 100644 yt_dlp/extractor/redgifs.py create mode 100644 yt_dlp/extractor/redtube.py create mode 100644 yt_dlp/extractor/rentv.py create mode 100644 yt_dlp/extractor/restudy.py create mode 100644 yt_dlp/extractor/reuters.py create mode 100644 yt_dlp/extractor/reverbnation.py create mode 100644 yt_dlp/extractor/rheinmaintv.py create mode 100644 yt_dlp/extractor/ridehome.py create mode 100644 yt_dlp/extractor/rinsefm.py create mode 100644 yt_dlp/extractor/rmcdecouverte.py create mode 100644 yt_dlp/extractor/rockstargames.py create mode 100644 yt_dlp/extractor/rokfin.py create mode 100644 yt_dlp/extractor/roosterteeth.py create mode 100644 yt_dlp/extractor/rottentomatoes.py create mode 100644 yt_dlp/extractor/rozhlas.py create mode 100644 yt_dlp/extractor/rte.py create mode 100644 yt_dlp/extractor/rtl2.py create mode 100644 yt_dlp/extractor/rtlnl.py create mode 100644 yt_dlp/extractor/rtnews.py create mode 100644 yt_dlp/extractor/rtp.py create mode 100644 yt_dlp/extractor/rtrfm.py create mode 100644 yt_dlp/extractor/rts.py create mode 100644 yt_dlp/extractor/rtvcplay.py create mode 100644 yt_dlp/extractor/rtve.py create mode 100644 yt_dlp/extractor/rtvs.py create mode 100644 yt_dlp/extractor/rtvslo.py create mode 100644 yt_dlp/extractor/rudovideo.py create mode 100644 yt_dlp/extractor/rule34video.py create mode 100644 yt_dlp/extractor/rumble.py create mode 100644 yt_dlp/extractor/rutube.py create mode 100644 yt_dlp/extractor/rutv.py create mode 100644 yt_dlp/extractor/ruutu.py create mode 100644 yt_dlp/extractor/ruv.py create mode 100644 yt_dlp/extractor/s4c.py create mode 100644 yt_dlp/extractor/safari.py create mode 100644 yt_dlp/extractor/saitosan.py create mode 100644 yt_dlp/extractor/samplefocus.py create mode 100644 yt_dlp/extractor/sapo.py create mode 100644 yt_dlp/extractor/sbs.py create mode 100644 yt_dlp/extractor/sbscokr.py create mode 100644 yt_dlp/extractor/screen9.py create mode 100644 yt_dlp/extractor/screencast.py create mode 100644 yt_dlp/extractor/screencastify.py create mode 100644 yt_dlp/extractor/screencastomatic.py create mode 100644 yt_dlp/extractor/scrippsnetworks.py create mode 100644 yt_dlp/extractor/scrolller.py create mode 100644 yt_dlp/extractor/scte.py create mode 100644 yt_dlp/extractor/sejmpl.py create mode 100644 
yt_dlp/extractor/senalcolombia.py create mode 100644 yt_dlp/extractor/senategov.py create mode 100644 yt_dlp/extractor/sendtonews.py create mode 100644 yt_dlp/extractor/servus.py create mode 100644 yt_dlp/extractor/sevenplus.py create mode 100644 yt_dlp/extractor/sexu.py create mode 100644 yt_dlp/extractor/seznamzpravy.py create mode 100644 yt_dlp/extractor/shahid.py create mode 100644 yt_dlp/extractor/sharevideos.py create mode 100644 yt_dlp/extractor/shemaroome.py create mode 100644 yt_dlp/extractor/showroomlive.py create mode 100644 yt_dlp/extractor/sibnet.py create mode 100644 yt_dlp/extractor/simplecast.py create mode 100644 yt_dlp/extractor/sina.py create mode 100644 yt_dlp/extractor/sixplay.py create mode 100644 yt_dlp/extractor/skeb.py create mode 100644 yt_dlp/extractor/sky.py create mode 100644 yt_dlp/extractor/skyit.py create mode 100644 yt_dlp/extractor/skylinewebcams.py create mode 100644 yt_dlp/extractor/skynewsarabia.py create mode 100644 yt_dlp/extractor/skynewsau.py create mode 100644 yt_dlp/extractor/slideshare.py create mode 100644 yt_dlp/extractor/slideslive.py create mode 100644 yt_dlp/extractor/slutload.py create mode 100644 yt_dlp/extractor/smotrim.py create mode 100644 yt_dlp/extractor/snotr.py create mode 100644 yt_dlp/extractor/sohu.py create mode 100644 yt_dlp/extractor/sonyliv.py create mode 100644 yt_dlp/extractor/soundcloud.py create mode 100644 yt_dlp/extractor/soundgasm.py create mode 100644 yt_dlp/extractor/southpark.py create mode 100644 yt_dlp/extractor/sovietscloset.py create mode 100644 yt_dlp/extractor/spankbang.py create mode 100644 yt_dlp/extractor/spiegel.py create mode 100644 yt_dlp/extractor/spike.py create mode 100644 yt_dlp/extractor/sport5.py create mode 100644 yt_dlp/extractor/sportbox.py create mode 100644 yt_dlp/extractor/sportdeutschland.py create mode 100644 yt_dlp/extractor/spotify.py create mode 100644 yt_dlp/extractor/spreaker.py create mode 100644 yt_dlp/extractor/springboardplatform.py create mode 100644 yt_dlp/extractor/sprout.py create mode 100644 yt_dlp/extractor/srgssr.py create mode 100644 yt_dlp/extractor/srmediathek.py create mode 100644 yt_dlp/extractor/stacommu.py create mode 100644 yt_dlp/extractor/stageplus.py create mode 100644 yt_dlp/extractor/stanfordoc.py create mode 100644 yt_dlp/extractor/startrek.py create mode 100644 yt_dlp/extractor/startv.py create mode 100644 yt_dlp/extractor/steam.py create mode 100644 yt_dlp/extractor/stitcher.py create mode 100644 yt_dlp/extractor/storyfire.py create mode 100644 yt_dlp/extractor/streamable.py create mode 100644 yt_dlp/extractor/streamcz.py create mode 100644 yt_dlp/extractor/streetvoice.py create mode 100644 yt_dlp/extractor/stretchinternet.py create mode 100644 yt_dlp/extractor/stripchat.py create mode 100644 yt_dlp/extractor/stv.py create mode 100644 yt_dlp/extractor/substack.py create mode 100644 yt_dlp/extractor/sunporno.py create mode 100644 yt_dlp/extractor/sverigesradio.py create mode 100644 yt_dlp/extractor/svt.py create mode 100644 yt_dlp/extractor/swearnet.py create mode 100644 yt_dlp/extractor/syfy.py create mode 100644 yt_dlp/extractor/syvdk.py create mode 100644 yt_dlp/extractor/sztvhu.py create mode 100644 yt_dlp/extractor/tagesschau.py create mode 100644 yt_dlp/extractor/tass.py create mode 100644 yt_dlp/extractor/tbs.py create mode 100644 yt_dlp/extractor/tbsjp.py create mode 100644 yt_dlp/extractor/teachable.py create mode 100644 yt_dlp/extractor/teachertube.py create mode 100644 yt_dlp/extractor/teachingchannel.py create mode 100644 
yt_dlp/extractor/teamcoco.py create mode 100644 yt_dlp/extractor/teamtreehouse.py create mode 100644 yt_dlp/extractor/ted.py create mode 100644 yt_dlp/extractor/tele13.py create mode 100644 yt_dlp/extractor/tele5.py create mode 100644 yt_dlp/extractor/telebruxelles.py create mode 100644 yt_dlp/extractor/telecaribe.py create mode 100644 yt_dlp/extractor/telecinco.py create mode 100644 yt_dlp/extractor/telegraaf.py create mode 100644 yt_dlp/extractor/telegram.py create mode 100644 yt_dlp/extractor/telemb.py create mode 100644 yt_dlp/extractor/telemundo.py create mode 100644 yt_dlp/extractor/telequebec.py create mode 100644 yt_dlp/extractor/teletask.py create mode 100644 yt_dlp/extractor/telewebion.py create mode 100644 yt_dlp/extractor/tempo.py create mode 100644 yt_dlp/extractor/tencent.py create mode 100644 yt_dlp/extractor/tennistv.py create mode 100644 yt_dlp/extractor/tenplay.py create mode 100644 yt_dlp/extractor/testurl.py create mode 100644 yt_dlp/extractor/tf1.py create mode 100644 yt_dlp/extractor/tfo.py create mode 100644 yt_dlp/extractor/theguardian.py create mode 100644 yt_dlp/extractor/theholetv.py create mode 100644 yt_dlp/extractor/theintercept.py create mode 100644 yt_dlp/extractor/theplatform.py create mode 100644 yt_dlp/extractor/thestar.py create mode 100644 yt_dlp/extractor/thesun.py create mode 100644 yt_dlp/extractor/theweatherchannel.py create mode 100644 yt_dlp/extractor/thisamericanlife.py create mode 100644 yt_dlp/extractor/thisoldhouse.py create mode 100644 yt_dlp/extractor/thisvid.py create mode 100644 yt_dlp/extractor/threeqsdn.py create mode 100644 yt_dlp/extractor/threespeak.py create mode 100644 yt_dlp/extractor/tiktok.py create mode 100644 yt_dlp/extractor/tmz.py create mode 100644 yt_dlp/extractor/tnaflix.py create mode 100644 yt_dlp/extractor/toggle.py create mode 100644 yt_dlp/extractor/toggo.py create mode 100644 yt_dlp/extractor/tonline.py create mode 100644 yt_dlp/extractor/toongoggles.py create mode 100644 yt_dlp/extractor/toutv.py create mode 100644 yt_dlp/extractor/toypics.py create mode 100644 yt_dlp/extractor/traileraddict.py create mode 100644 yt_dlp/extractor/triller.py create mode 100644 yt_dlp/extractor/trovo.py create mode 100644 yt_dlp/extractor/trtcocuk.py create mode 100644 yt_dlp/extractor/trtworld.py create mode 100644 yt_dlp/extractor/trueid.py create mode 100644 yt_dlp/extractor/trunews.py create mode 100644 yt_dlp/extractor/truth.py create mode 100644 yt_dlp/extractor/trutv.py create mode 100644 yt_dlp/extractor/tube8.py create mode 100644 yt_dlp/extractor/tubetugraz.py create mode 100644 yt_dlp/extractor/tubitv.py create mode 100644 yt_dlp/extractor/tumblr.py create mode 100644 yt_dlp/extractor/tunein.py create mode 100644 yt_dlp/extractor/turner.py create mode 100644 yt_dlp/extractor/tv2.py create mode 100644 yt_dlp/extractor/tv24ua.py create mode 100644 yt_dlp/extractor/tv2dk.py create mode 100644 yt_dlp/extractor/tv2hu.py create mode 100644 yt_dlp/extractor/tv4.py create mode 100644 yt_dlp/extractor/tv5mondeplus.py create mode 100644 yt_dlp/extractor/tv5unis.py create mode 100644 yt_dlp/extractor/tva.py create mode 100644 yt_dlp/extractor/tvanouvelles.py create mode 100644 yt_dlp/extractor/tvc.py create mode 100644 yt_dlp/extractor/tver.py create mode 100644 yt_dlp/extractor/tvigle.py create mode 100644 yt_dlp/extractor/tviplayer.py create mode 100644 yt_dlp/extractor/tvland.py create mode 100644 yt_dlp/extractor/tvn24.py create mode 100644 yt_dlp/extractor/tvnoe.py create mode 100644 yt_dlp/extractor/tvopengr.py create mode 
100644 yt_dlp/extractor/tvp.py create mode 100644 yt_dlp/extractor/tvplay.py create mode 100644 yt_dlp/extractor/tvplayer.py create mode 100644 yt_dlp/extractor/tweakers.py create mode 100644 yt_dlp/extractor/twentymin.py create mode 100644 yt_dlp/extractor/twentythreevideo.py create mode 100644 yt_dlp/extractor/twitcasting.py create mode 100644 yt_dlp/extractor/twitch.py create mode 100644 yt_dlp/extractor/twitter.py create mode 100644 yt_dlp/extractor/txxx.py create mode 100644 yt_dlp/extractor/udemy.py create mode 100644 yt_dlp/extractor/udn.py create mode 100644 yt_dlp/extractor/ufctv.py create mode 100644 yt_dlp/extractor/ukcolumn.py create mode 100644 yt_dlp/extractor/uktvplay.py create mode 100644 yt_dlp/extractor/umg.py create mode 100644 yt_dlp/extractor/unistra.py create mode 100644 yt_dlp/extractor/unity.py create mode 100644 yt_dlp/extractor/unsupported.py create mode 100644 yt_dlp/extractor/uol.py create mode 100644 yt_dlp/extractor/uplynk.py create mode 100644 yt_dlp/extractor/urort.py create mode 100644 yt_dlp/extractor/urplay.py create mode 100644 yt_dlp/extractor/usanetwork.py create mode 100644 yt_dlp/extractor/usatoday.py create mode 100644 yt_dlp/extractor/ustream.py create mode 100644 yt_dlp/extractor/ustudio.py create mode 100644 yt_dlp/extractor/utreon.py create mode 100644 yt_dlp/extractor/varzesh3.py create mode 100644 yt_dlp/extractor/vbox7.py create mode 100644 yt_dlp/extractor/veo.py create mode 100644 yt_dlp/extractor/veoh.py create mode 100644 yt_dlp/extractor/vesti.py create mode 100644 yt_dlp/extractor/vevo.py create mode 100644 yt_dlp/extractor/vgtv.py create mode 100644 yt_dlp/extractor/vh1.py create mode 100644 yt_dlp/extractor/vice.py create mode 100644 yt_dlp/extractor/viddler.py create mode 100644 yt_dlp/extractor/videa.py create mode 100644 yt_dlp/extractor/videocampus_sachsen.py create mode 100644 yt_dlp/extractor/videodetective.py create mode 100644 yt_dlp/extractor/videofyme.py create mode 100644 yt_dlp/extractor/videoken.py create mode 100644 yt_dlp/extractor/videomore.py create mode 100644 yt_dlp/extractor/videopress.py create mode 100644 yt_dlp/extractor/vidio.py create mode 100644 yt_dlp/extractor/vidlii.py create mode 100644 yt_dlp/extractor/vidly.py create mode 100644 yt_dlp/extractor/viewlift.py create mode 100644 yt_dlp/extractor/viidea.py create mode 100644 yt_dlp/extractor/viki.py create mode 100644 yt_dlp/extractor/vimeo.py create mode 100644 yt_dlp/extractor/vimm.py create mode 100644 yt_dlp/extractor/vine.py create mode 100644 yt_dlp/extractor/viously.py create mode 100644 yt_dlp/extractor/viqeo.py create mode 100644 yt_dlp/extractor/viu.py create mode 100644 yt_dlp/extractor/vk.py create mode 100644 yt_dlp/extractor/vocaroo.py create mode 100644 yt_dlp/extractor/vodpl.py create mode 100644 yt_dlp/extractor/vodplatform.py create mode 100644 yt_dlp/extractor/voicy.py create mode 100644 yt_dlp/extractor/volejtv.py create mode 100644 yt_dlp/extractor/voot.py create mode 100644 yt_dlp/extractor/voxmedia.py create mode 100644 yt_dlp/extractor/vrt.py create mode 100644 yt_dlp/extractor/vtm.py create mode 100644 yt_dlp/extractor/vuclip.py create mode 100644 yt_dlp/extractor/vvvvid.py create mode 100644 yt_dlp/extractor/walla.py create mode 100644 yt_dlp/extractor/washingtonpost.py create mode 100644 yt_dlp/extractor/wat.py create mode 100644 yt_dlp/extractor/wdr.py create mode 100644 yt_dlp/extractor/webcamerapl.py create mode 100644 yt_dlp/extractor/webcaster.py create mode 100644 yt_dlp/extractor/webofstories.py create mode 100644 
yt_dlp/extractor/weibo.py create mode 100644 yt_dlp/extractor/weiqitv.py create mode 100644 yt_dlp/extractor/weverse.py create mode 100644 yt_dlp/extractor/wevidi.py create mode 100644 yt_dlp/extractor/weyyak.py create mode 100644 yt_dlp/extractor/whowatch.py create mode 100644 yt_dlp/extractor/whyp.py create mode 100644 yt_dlp/extractor/wikimedia.py create mode 100644 yt_dlp/extractor/wimbledon.py create mode 100644 yt_dlp/extractor/wimtv.py create mode 100644 yt_dlp/extractor/wistia.py create mode 100644 yt_dlp/extractor/wordpress.py create mode 100644 yt_dlp/extractor/worldstarhiphop.py create mode 100644 yt_dlp/extractor/wppilot.py create mode 100644 yt_dlp/extractor/wrestleuniverse.py create mode 100644 yt_dlp/extractor/wsj.py create mode 100644 yt_dlp/extractor/wwe.py create mode 100644 yt_dlp/extractor/wykop.py create mode 100644 yt_dlp/extractor/xanimu.py create mode 100644 yt_dlp/extractor/xboxclips.py create mode 100644 yt_dlp/extractor/xfileshare.py create mode 100644 yt_dlp/extractor/xhamster.py create mode 100644 yt_dlp/extractor/ximalaya.py create mode 100644 yt_dlp/extractor/xinpianchang.py create mode 100644 yt_dlp/extractor/xminus.py create mode 100644 yt_dlp/extractor/xnxx.py create mode 100644 yt_dlp/extractor/xstream.py create mode 100644 yt_dlp/extractor/xvideos.py create mode 100644 yt_dlp/extractor/xxxymovies.py create mode 100644 yt_dlp/extractor/yahoo.py create mode 100644 yt_dlp/extractor/yandexdisk.py create mode 100644 yt_dlp/extractor/yandexmusic.py create mode 100644 yt_dlp/extractor/yandexvideo.py create mode 100644 yt_dlp/extractor/yapfiles.py create mode 100644 yt_dlp/extractor/yappy.py create mode 100644 yt_dlp/extractor/yle_areena.py create mode 100644 yt_dlp/extractor/youjizz.py create mode 100644 yt_dlp/extractor/youku.py create mode 100644 yt_dlp/extractor/younow.py create mode 100644 yt_dlp/extractor/youporn.py create mode 100644 yt_dlp/extractor/yourporn.py create mode 100644 yt_dlp/extractor/yourupload.py create mode 100644 yt_dlp/extractor/youtube.py create mode 100644 yt_dlp/extractor/zaiko.py create mode 100644 yt_dlp/extractor/zapiks.py create mode 100644 yt_dlp/extractor/zattoo.py create mode 100644 yt_dlp/extractor/zdf.py create mode 100644 yt_dlp/extractor/zee5.py create mode 100644 yt_dlp/extractor/zeenews.py create mode 100644 yt_dlp/extractor/zenporn.py create mode 100644 yt_dlp/extractor/zetland.py create mode 100644 yt_dlp/extractor/zhihu.py create mode 100644 yt_dlp/extractor/zingmp3.py create mode 100644 yt_dlp/extractor/zoom.py create mode 100644 yt_dlp/extractor/zype.py create mode 100644 yt_dlp/jsinterp.py create mode 100644 yt_dlp/minicurses.py create mode 100644 yt_dlp/networking/__init__.py create mode 100644 yt_dlp/networking/_helper.py create mode 100644 yt_dlp/networking/_requests.py create mode 100644 yt_dlp/networking/_urllib.py create mode 100644 yt_dlp/networking/_websockets.py create mode 100644 yt_dlp/networking/common.py create mode 100644 yt_dlp/networking/exceptions.py create mode 100644 yt_dlp/networking/websocket.py create mode 100644 yt_dlp/options.py create mode 100644 yt_dlp/plugins.py create mode 100644 yt_dlp/postprocessor/__init__.py create mode 100644 yt_dlp/postprocessor/common.py create mode 100644 yt_dlp/postprocessor/embedthumbnail.py create mode 100644 yt_dlp/postprocessor/exec.py create mode 100644 yt_dlp/postprocessor/ffmpeg.py create mode 100644 yt_dlp/postprocessor/metadataparser.py create mode 100644 yt_dlp/postprocessor/modify_chapters.py create mode 100644 
yt_dlp/postprocessor/movefilesafterdownload.py
 create mode 100644 yt_dlp/postprocessor/sponskrub.py
 create mode 100644 yt_dlp/postprocessor/sponsorblock.py
 create mode 100644 yt_dlp/postprocessor/xattrpp.py
 create mode 100644 yt_dlp/socks.py
 create mode 100644 yt_dlp/update.py
 create mode 100644 yt_dlp/utils/__init__.py
 create mode 100644 yt_dlp/utils/_deprecated.py
 create mode 100644 yt_dlp/utils/_legacy.py
 create mode 100644 yt_dlp/utils/_utils.py
 create mode 100644 yt_dlp/utils/networking.py
 create mode 100644 yt_dlp/utils/progress.py
 create mode 100644 yt_dlp/utils/traversal.py
 create mode 100644 yt_dlp/version.py
 create mode 100644 yt_dlp/webvtt.py

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
new file mode 100644
index 0000000..c34d97b
--- /dev/null
+++ b/yt_dlp/YoutubeDL.py
@@ -0,0 +1,4339 @@
+import collections
+import contextlib
+import copy
+import datetime
+import errno
+import fileinput
+import http.cookiejar
+import io
+import itertools
+import json
+import locale
+import operator
+import os
+import random
+import re
+import shutil
+import string
+import subprocess
+import sys
+import tempfile
+import time
+import tokenize
+import traceback
+import unicodedata
+
+from .cache import Cache
+from .compat import functools, urllib  # isort: split
+from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req
+from .cookies import LenientSimpleCookie, load_cookies
+from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader.rtmp import rtmpdump_version
+from .extractor import gen_extractor_classes, get_info_extractor
+from .extractor.common import UnsupportedURLIE
+from .extractor.openload import PhantomJSwrapper
+from .minicurses import format_text
+from .networking import HEADRequest, Request, RequestDirector
+from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES
+from .networking.exceptions import (
+    HTTPError,
+    NoSupportingHandlers,
+    RequestError,
+    SSLError,
+    network_exceptions,
+)
+from .plugins import directories as plugin_directories
+from .postprocessor import _PLUGIN_CLASSES as plugin_pps
+from .postprocessor import (
+    EmbedThumbnailPP,
+    FFmpegFixupDuplicateMoovPP,
+    FFmpegFixupDurationPP,
+    FFmpegFixupM3u8PP,
+    FFmpegFixupM4aPP,
+    FFmpegFixupStretchedPP,
+    FFmpegFixupTimestampPP,
+    FFmpegMergerPP,
+    FFmpegPostProcessor,
+    FFmpegVideoConvertorPP,
+    MoveFilesAfterDownloadPP,
+    get_postprocessor,
+)
+from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
+from .update import (
+    REPOSITORY,
+    _get_system_deprecation,
+    _make_label,
+    current_git_head,
+    detect_variant,
+)
+from .utils import (
+    DEFAULT_OUTTMPL,
+    IDENTITY,
+    LINK_TEMPLATES,
+    MEDIA_EXTENSIONS,
+    NO_DEFAULT,
+    NUMBER_RE,
+    OUTTMPL_TYPES,
+    POSTPROCESS_WHEN,
+    STR_FORMAT_RE_TMPL,
+    STR_FORMAT_TYPES,
+    ContentTooShortError,
+    DateRange,
+    DownloadCancelled,
+    DownloadError,
+    EntryNotInPlaylist,
+    ExistingVideoReached,
+    ExtractorError,
+    FormatSorter,
+    GeoRestrictedError,
+    ISO3166Utils,
+    LazyList,
+    MaxDownloadsReached,
+    Namespace,
+    PagedList,
+    PlaylistEntries,
+    Popen,
+    PostProcessingError,
+    ReExtractInfo,
+    RejectedVideoReached,
+    SameFileError,
+    UnavailableVideoError,
+    UserNotLive,
+    age_restricted,
+    args_to_str,
+    bug_reports_message,
+    date_from_str,
+    deprecation_warning,
+    determine_ext,
+    determine_protocol,
+    encode_compat_str,
+    encodeFilename,
+    error_to_compat_str,
+    escapeHTML,
+    expand_path,
+    extract_basic_auth,
+    filter_dict,
+    float_or_none,
+    format_bytes,
+    format_decimal_suffix,
+    format_field,
+    formatSeconds,
+    get_compatible_ext,
+    get_domain,
+    int_or_none,
+    iri_to_uri,
+    is_path_like,
+    join_nonempty,
+    locked_file,
+    make_archive_id,
+    make_dir,
+    number_of_digits,
+    orderedSet,
+    orderedSet_from_options,
+    parse_filesize,
+    preferredencoding,
+    prepend_extension,
+    remove_terminal_sequences,
+    render_table,
+    replace_extension,
+    sanitize_filename,
+    sanitize_path,
+    sanitize_url,
+    str_or_none,
+    strftime_or_none,
+    subtitles_filename,
+    supports_terminal_sequences,
+    system_identifier,
+    timetuple_from_msec,
+    to_high_limit_path,
+    traverse_obj,
+    try_call,
+    try_get,
+    url_basename,
+    variadic,
+    version_tuple,
+    windows_enable_vt_mode,
+    write_json_file,
+    write_string,
+)
+from .utils._utils import _YDLLogger
+from .utils.networking import (
+    HTTPHeaderDict,
+    clean_headers,
+    clean_proxies,
+    std_headers,
+)
+from .version import CHANNEL, ORIGIN, RELEASE_GIT_HEAD, VARIANT, __version__
+
+if compat_os_name == 'nt':
+    import ctypes
+
+
+class YoutubeDL:
+    """YoutubeDL class.
+
+    YoutubeDL objects are responsible for downloading the actual video file
+    and writing it to disk if the user has requested it, among some other
+    tasks. In most cases there should be one per program. Since, given a
+    video URL, the downloader doesn't know how to extract all the needed
+    information (a task that InfoExtractors perform), it has to pass the URL
+    to one of them.
+
+    For this, YoutubeDL objects have a method that allows InfoExtractors to
+    be registered in a given order. When it is passed a URL, the YoutubeDL
+    object hands it to the first InfoExtractor it finds that reports being
+    able to handle it. The InfoExtractor extracts all the information about
+    the video or videos the URL refers to, and YoutubeDL processes the
+    extracted information, possibly using a File Downloader to download the
+    video.
+
+    YoutubeDL objects accept a lot of parameters. In order not to saturate
+    the object constructor with arguments, it receives a dictionary of
+    options instead. These options are available through the params
+    attribute for the InfoExtractors to use. The YoutubeDL also registers
+    itself as the downloader in charge of the InfoExtractors that are added
+    to it, so this is a "mutual registration".
+
+    Available options:
+
+    username:          Username for authentication purposes.
+    password:          Password for authentication purposes.
+    videopassword:     Password for accessing a video.
+    ap_mso:            Adobe Pass multiple-system operator identifier.
+    ap_username:       Multiple-system operator account username.
+    ap_password:       Multiple-system operator account password.
+    usenetrc:          Use netrc for authentication instead.
+    netrc_location:    Location of the netrc file. Defaults to ~/.netrc.
+    netrc_cmd:         Use a shell command to get credentials
+    verbose:           Print additional info to stdout.
+    quiet:             Do not print messages to stdout.
+    no_warnings:       Do not print out anything for warnings.
+    forceprint:        A dict with keys WHEN mapped to a list of templates to
+                       print to stdout. The allowed keys are video or any of the
+                       items in utils.POSTPROCESS_WHEN.
+                       For compatibility, a single list is also accepted
+    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
+                       a list of tuples with (template, filename)
+    forcejson:         Force printing info_dict as JSON.
+    dump_single_json:  Force printing the info_dict of the whole playlist
+                       (or video) as a single JSON line.
+    force_write_download_archive: Force writing download archive regardless
+                       of 'skip_download' or 'simulate'.
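# A minimal sketch (not part of the patch) of how the params documented above
# are passed in practice, assuming yt-dlp is installed; the URL is a
# placeholder, not a value taken from the source.
from yt_dlp import YoutubeDL

ydl_opts = {
    'quiet': True,        # do not print messages to stdout
    'no_warnings': True,  # suppress warning output
    'forcejson': True,    # print the info_dict as a single JSON line
}
with YoutubeDL(ydl_opts) as ydl:  # YoutubeDL is usable as a context manager
    ydl.download(['https://example.com/watch?v=placeholder'])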
+    simulate:          Do not download the video files. If unset (or None),
+                       simulate only if listsubtitles, listformats or list_thumbnails is used
+    format:            Video format code. See "FORMAT SELECTION" for more details.
+                       You can also pass a function. The function takes 'ctx' as
+                       argument and returns the formats to download.
+                       See "build_format_selector" for an implementation
+    allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded.
+    ignore_no_formats_error: Ignore "No video formats" error. Useful for
+                       extracting metadata even if the video is not actually
+                       available for download (experimental)
+    format_sort:       A list of fields by which to sort the video formats.
+                       See "Sorting Formats" for more details.
+    format_sort_force: Force the given format_sort. See "Sorting Formats"
+                       for more details.
+    prefer_free_formats: Whether to prefer video formats with free containers
+                       over non-free ones of the same quality.
+    allow_multiple_video_streams: Allow multiple video streams to be merged
+                       into a single file
+    allow_multiple_audio_streams: Allow multiple audio streams to be merged
+                       into a single file
+    check_formats:     Whether to test if the formats are downloadable.
+                       Can be True (check all), False (check none),
+                       'selected' (check selected formats),
+                       or None (check only if requested by extractor)
+    paths:             Dictionary of output paths. The allowed keys are 'home',
+                       'temp' and the keys of OUTTMPL_TYPES (in utils/_utils.py)
+    outtmpl:           Dictionary of templates for output names. Allowed keys
+                       are 'default' and the keys of OUTTMPL_TYPES (in utils/_utils.py).
+                       For compatibility with youtube-dl, a single string can also be used
+    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
+    restrictfilenames: Do not allow "&" and spaces in file names
+    trim_file_name:    Limit length of filename (extension excluded)
+    windowsfilenames:  Force the filenames to be Windows compatible
+    ignoreerrors:      Do not stop on download/postprocessing errors.
+                       Can be 'only_download' to ignore only download errors.
+                       Default is 'only_download' for CLI, but False for API
+    skip_playlist_after_errors: Number of allowed failures until the rest of
+                       the playlist is skipped
+    allowed_extractors: List of regexes to match against extractor names that are allowed
+    overwrites:        Overwrite all video and metadata files if True,
+                       overwrite only non-video files if None,
+                       and don't overwrite any file if False
+    playlist_items:    Specific indices of playlist to download.
+    playlistrandom:    Download playlist items in random order.
+    lazy_playlist:     Process playlist entries as they are received.
+    matchtitle:        Download only matching titles.
+    rejecttitle:       Reject downloads for matching titles.
+    logger:            Log messages to a logging.Logger instance.
+    logtostderr:       Print everything to stderr instead of stdout.
+    consoletitle:      Display progress in the console window's titlebar.
+    writedescription:  Write the video description to a .description file
+    writeinfojson:     Write the video metadata to a .info.json file
+    clean_infojson:    Remove internal metadata from the infojson
+    getcomments:       Extract video comments.
+                       This will not be written to disk
+                       unless writeinfojson is also given
+    writeannotations:  Write the video annotations to a .annotations.xml file
+    writethumbnail:    Write the thumbnail image to a file
+    allow_playlist_files: Whether to write playlists' description, infojson etc
+                       also to disk when using the 'write*' options
+    write_all_thumbnails: Write all thumbnail formats to files
+    writelink:         Write an internet shortcut file, depending on the
+                       current platform (.url/.webloc/.desktop)
+    writeurllink:      Write a Windows internet shortcut file (.url)
+    writewebloclink:   Write a macOS internet shortcut file (.webloc)
+    writedesktoplink:  Write a Linux internet shortcut file (.desktop)
+    writesubtitles:    Write the video subtitles to a file
+    writeautomaticsub: Write the automatically generated subtitles to a file
+    listsubtitles:     Lists all available subtitles for the video
+    subtitlesformat:   The format code for subtitles
+    subtitleslangs:    List of languages of the subtitles to download (can be regex).
+                       The list may contain "all" to refer to all the available
+                       subtitles. The language can be prefixed with a "-" to
+                       exclude it from the requested languages, e.g. ['all', '-live_chat']
+    keepvideo:         Keep the video file after post-processing
+    daterange:         A utils.DateRange object, download only if the upload_date is in the range.
+    skip_download:     Skip the actual download of the video file
+    cachedir:          Location of the cache files in the filesystem.
+                       False to disable filesystem cache.
+    noplaylist:        Download single video instead of a playlist if in doubt.
+    age_limit:         An integer representing the user's age in years.
+                       Unsuitable videos for the given age are skipped.
+    min_views:         An integer representing the minimum view count the video
+                       must have in order to not be skipped.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    max_views:         An integer representing the maximum view count.
+                       Videos that are more popular than that are not
+                       downloaded.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    download_archive:  A set, or the name of a file where all downloads are recorded.
+                       Videos already present in the file are not downloaded again.
+    break_on_existing: Stop the download process after attempting to download a
+                       file that is in the archive.
+    break_per_url:     Whether break_on_reject and break_on_existing
+                       should act on each input URL as opposed to for the entire queue
+    cookiefile:        File name or text stream from where cookies should be read and dumped to
+    cookiesfrombrowser: A tuple containing the name of the browser, the profile
+                       name/path from where cookies are loaded, the name of the keyring,
+                       and the container name, e.g. ('chrome', ) or
+                       ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta')
+    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
+                       support RFC 5746 secure renegotiation
+    nocheckcertificate: Do not verify SSL certificates
+    client_certificate: Path to client certificate file in PEM format. May include the private key
+    client_certificate_key: Path to private key file for client certificate
+    client_certificate_password: Password for client certificate private key, if encrypted.
+                       If not provided and the key is encrypted, yt-dlp will ask interactively
+    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
+                       (Only supported by some extractors)
+    enable_file_urls:  Enable file:// URLs. This is disabled by default for security reasons.
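# A sketch combining several of the options documented above (subtitle
# selection, a download archive, cookies loaded from a browser). The file
# name, browser choice, and URL are illustrative assumptions, not values
# taken from the patch.
from yt_dlp import YoutubeDL

ydl_opts = {
    'writesubtitles': True,
    'subtitleslangs': ['all', '-live_chat'],  # everything except live chat
    'download_archive': 'archive.txt',        # IDs recorded here are not re-downloaded
    'cookiesfrombrowser': ('firefox',),       # tuple form shown in the docstring
}
with YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/playlist?list=placeholder'])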
+    http_headers:      A dictionary of custom headers to be used for all requests
+    proxy:             URL of the proxy server to use
+    geo_verification_proxy: URL of the proxy to use for IP address verification
+                       on geo-restricted sites.
+    socket_timeout:    Time to wait for unresponsive hosts, in seconds
+    bidi_workaround:   Work around buggy terminals without bidirectional text
+                       support, using fribidi
+    debug_printtraffic:Print out sent and received HTTP traffic
+    default_search:    Prepend this string if an input URL is not valid.
+                       'auto' for elaborate guessing
+    encoding:          Use this encoding instead of the system-specified.
+    extract_flat:      Whether to resolve and process url_results further
+                       * False: Always process. Default for API
+                       * True: Never process
+                       * 'in_playlist': Do not process inside playlist/multi_video
+                       * 'discard': Always process, but don't return the result
+                         from inside playlist/multi_video
+                       * 'discard_in_playlist': Same as "discard", but only for
+                         playlists (not multi_video). Default for CLI
+    wait_for_video:    If given, wait for scheduled streams to become available.
+                       The value should be a tuple containing the range
+                       (min_secs, max_secs) to wait between retries
+    postprocessors:    A list of dictionaries, each with an entry
+                       * key: The name of the postprocessor. See
+                         yt_dlp/postprocessor/__init__.py for a list.
+                       * when: When to run the postprocessor. Allowed values are
+                         the entries of utils.POSTPROCESS_WHEN
+                         Assumed to be 'post_process' if not given
+    progress_hooks:    A list of functions that get called on download
+                       progress, with a dictionary with the entries
+                       * status: One of "downloading", "error", or "finished".
+                         Check this first and ignore unknown values.
+                       * info_dict: The extracted info_dict
+
+                       If status is one of "downloading" or "finished", the
+                       following properties may also be present:
+                       * filename: The final filename (always present)
+                       * tmpfilename: The filename we're currently writing to
+                       * downloaded_bytes: Bytes on disk
+                       * total_bytes: Size of the whole file, None if unknown
+                       * total_bytes_estimate: Guess of the eventual file size,
+                         None if unavailable.
+                       * elapsed: The number of seconds since download started.
+                       * eta: The estimated time in seconds, None if unknown
+                       * speed: The download speed in bytes/second, None if
+                         unknown
+                       * fragment_index: The counter of the currently
+                         downloaded video fragment.
+                       * fragment_count: The number of fragments (= individual
+                         files that will be merged)
+
+                       Progress hooks are guaranteed to be called at least once
+                       (with status "finished") if the download is successful.
+    postprocessor_hooks: A list of functions that get called on postprocessing
+                       progress, with a dictionary with the entries
+                       * status: One of "started", "processing", or "finished".
+                         Check this first and ignore unknown values.
+                       * postprocessor: Name of the postprocessor
+                       * info_dict: The extracted info_dict
+
+                       Progress hooks are guaranteed to be called at least twice
+                       (with status "started" and "finished") if the processing is successful.
+    merge_output_format: "/" separated list of extensions to use when merging formats.
+    final_ext:         Expected final extension; used to detect when the file was
+                       already downloaded and converted
+    fixup:             Automatically correct known faults of the file.
+                       One of:
+                       - "never": do nothing
+                       - "warn": only emit a warning
+                       - "detect_or_warn": check whether we can do anything
+                         about it, warn otherwise (default)
+    source_address:    Client-side IP address to bind to.
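# A sketch of a progress hook using only the dict entries documented above.
# Keys that are not guaranteed to be present are read with .get(); the URL is
# a placeholder.
from yt_dlp import YoutubeDL

def progress_hook(d):
    # Check 'status' first and ignore unknown values, as the docstring advises
    if d['status'] == 'downloading':
        done, total = d.get('downloaded_bytes'), d.get('total_bytes')
        if done is not None and total:
            print(f'{done / total:.1%} of {total} bytes, ETA {d.get("eta")}s')
    elif d['status'] == 'finished':
        print('Finished downloading:', d['filename'])

with YoutubeDL({'progress_hooks': [progress_hook]}) as ydl:
    ydl.download(['https://example.com/watch?v=placeholder'])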
+    sleep_interval_requests: Number of seconds to sleep between requests
+                       during extraction
+    sleep_interval:    Number of seconds to sleep before each download when
+                       used alone or a lower bound of a range for randomized
+                       sleep before each download (minimum possible number
+                       of seconds to sleep) when used along with
+                       max_sleep_interval.
+    max_sleep_interval:Upper bound of a range for randomized sleep before each
+                       download (maximum possible number of seconds to sleep).
+                       Must only be used along with sleep_interval.
+                       Actual sleep time will be a random float from range
+                       [sleep_interval; max_sleep_interval].
+    sleep_interval_subtitles: Number of seconds to sleep before each subtitle download
+    listformats:       Print an overview of available video formats and exit.
+    list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called for every video with the signature
+                       (info_dict, *, incomplete: bool) -> Optional[str]
+                       For backward compatibility with youtube-dl, the signature
+                       (info_dict) -> Optional[str] is also allowed.
+                       - If it returns a message, the video is ignored.
+                       - If it returns None, the video is downloaded.
+                       - If it returns utils.NO_DEFAULT, the user is interactively
+                         asked whether to download the video.
+                       - Raise utils.DownloadCancelled(msg) to abort remaining
+                         downloads when a video is rejected.
+                       match_filter_func in utils/_utils.py is one example of this.
+    color:             A dictionary with output stream names as keys
+                       and their respective color policy as values.
+                       Can also just be a single color policy,
+                       in which case it applies to all outputs.
+                       Valid stream names are 'stdout' and 'stderr'.
+                       Valid color policies are one of 'always', 'auto', 'no_color' or 'never'.
+    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
+                       HTTP header
+    geo_bypass_country:
+                       Two-letter ISO 3166-2 country code that will be used for
+                       explicit geographic restriction bypassing via faking
+                       X-Forwarded-For HTTP header
+    geo_bypass_ip_block:
+                       IP range in CIDR notation that will be used similarly to
+                       geo_bypass_country
+    external_downloader: A dictionary of protocol keys and the executable of the
+                       external downloader to use for it. The allowed protocols
+                       are default|http|ftp|m3u8|dash|rtsp|rtmp|mms.
+                       Set the value to 'native' to use the native downloader
+    compat_opts:       Compatibility options. See "Differences in default behavior".
+                       The following options do not work when used through the API:
+                       filename, abort-on-error, multistreams, no-live-chat,
+                       format-sort, no-clean-infojson, no-playlist-metafiles,
+                       no-keep-subs, no-attach-info-json.
+                       Refer to __init__.py for their implementation
+    progress_template: Dictionary of templates for progress outputs.
+                       Allowed keys are 'download', 'postprocess',
+                       'download-title' (console title) and 'postprocess-title'.
+                       The template is mapped on a dictionary with keys 'progress' and 'info'
+    retry_sleep_functions: Dictionary of functions that take the number of attempts
+                       as argument and return the time to sleep in seconds.
+                       Allowed keys are 'http', 'fragment', 'file_access'
+    download_ranges:   A callback function that gets called for every video with
+                       the signature (info_dict, ydl) -> Iterable[Section].
+                       Only the returned sections will be downloaded.
+                       Each Section is a dict with the following keys:
+                       * start_time: Start time of the section in seconds
+                       * end_time: End time of the section in seconds
+                       * title: Section title (Optional)
+                       * index: Section number (Optional)
+    force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
+    noprogress:        Do not print the progress bar
+    live_from_start:   Whether to download livestream videos from the start
+
+    The following parameters are not used by YoutubeDL itself, they are used by
+    the downloader (see yt_dlp/downloader/common.py):
+    nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
+    max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
+    continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
+    external_downloader_args, concurrent_fragment_downloads.
+
+    The following options are used by the post processors:
+    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
+                       to the binary or its containing directory.
+    postprocessor_args: A dictionary of postprocessor/executable keys (in lower case)
+                       and a list of additional command-line arguments for the
+                       postprocessor/executable. The dict can also have "PP+EXE" keys
+                       which are used when the given exe is used by the given PP.
+                       Use 'default' as the name for arguments to be passed to all PP
+                       For compatibility with youtube-dl, a single list of args
+                       can also be used
+
+    The following options are used by the extractors:
+    extractor_retries: Number of times to retry for known errors (default: 3)
+    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
+    hls_split_discontinuity: Split HLS playlists to different formats at
+                       discontinuities such as ad breaks (default: False)
+    extractor_args:    A dictionary of arguments to be passed to the extractors.
+                       See "EXTRACTOR ARGUMENTS" for details.
+                       E.g. {'youtube': {'skip': ['dash', 'hls']}}
+    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube
+
+    The following options are deprecated and may be removed in the future:
+
+    break_on_reject:   Stop the download process when encountering a video that
+                       has been filtered out.
+                       - `raise DownloadCancelled(msg)` in match_filter instead
+    force_generic_extractor: Force downloader to use the generic extractor
+                       - Use allowed_extractors = ['generic', 'default']
+    playliststart:     - Use playlist_items
+                       Playlist item to start at.
+    playlistend:       - Use playlist_items
+                       Playlist item to end at.
+    playlistreverse:   - Use playlist_items
+                       Download playlist items in reverse order.
+    forceurl:          - Use forceprint
+                       Force printing final URL.
+    forcetitle:        - Use forceprint
+                       Force printing title.
+    forceid:           - Use forceprint
+                       Force printing ID.
+    forcethumbnail:    - Use forceprint
+                       Force printing thumbnail URL.
+    forcedescription:  - Use forceprint
+                       Force printing description.
+    forcefilename:     - Use forceprint
+                       Force printing final filename.
+    forceduration:     - Use forceprint
+                       Force printing duration.
+    allsubtitles:      - Use subtitleslangs = ['all']
+                       Downloads all the subtitles of the video
+                       (requires writesubtitles or writeautomaticsub)
+    include_ads:       - Doesn't work
+                       Download ads as well
+    call_home:         - Not implemented
+                       Boolean, true iff we are allowed to contact the
+                       yt-dlp servers for debugging.
+    post_hooks:        - Register a custom postprocessor
+                       A list of functions that get called as the final step
+                       for each video file, after all postprocessors have been
+                       called. The filename will be passed as the only argument.
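# A sketch of the match_filter contract documented above: return a message to
# skip a video, return None to download it, or raise utils.DownloadCancelled
# to abort the remaining downloads. The threshold and URL are illustrative.
from yt_dlp import YoutubeDL

def skip_short_videos(info_dict, *, incomplete):
    duration = info_dict.get('duration')
    if incomplete or duration is None:
        return None                                 # metadata incomplete: let it through
    if duration < 60:
        return 'Skipping: shorter than 60 seconds'  # a message means the video is ignored
    return None                                     # None means the video is downloaded

with YoutubeDL({'match_filter': skip_short_videos}) as ydl:
    ydl.download(['https://example.com/playlist?list=placeholder'])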
+    hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}.
+                       Use the native HLS downloader instead of ffmpeg/avconv
+                       if True, otherwise use ffmpeg/avconv if False, otherwise
+                       use downloader suggested by extractor if None.
+    prefer_ffmpeg:     - avconv support is deprecated
+                       If False, use avconv instead of ffmpeg if both are available,
+                       otherwise prefer ffmpeg.
+    youtube_include_dash_manifest: - Use extractor_args
+                       If True (default), DASH manifests and related
+                       data will be downloaded and processed by extractor.
+                       You can reduce network I/O by disabling it if you don't
+                       care about DASH. (only for youtube)
+    youtube_include_hls_manifest: - Use extractor_args
+                       If True (default), HLS manifests and related
+                       data will be downloaded and processed by extractor.
+                       You can reduce network I/O by disabling it if you don't
+                       care about HLS. (only for youtube)
+    no_color:          Same as `color='no_color'`
+    no_overwrites:     Same as `overwrites=False`
+    """
+
+    _NUMERIC_FIELDS = {
+        'width', 'height', 'asr', 'audio_channels', 'fps',
+        'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx',
+        'timestamp', 'release_timestamp',
+        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+        'average_rating', 'comment_count', 'age_limit',
+        'start_time', 'end_time',
+        'chapter_number', 'season_number', 'episode_number',
+        'track_number', 'disc_number', 'release_year',
+    }
+
+    _format_fields = {
+        # NB: Keep in sync with the docstring of extractor/common.py
+        'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
+        'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
+        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
+        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
+        'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
+        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
+        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
+    }
+    _deprecated_multivalue_fields = {
+        'album_artist': 'album_artists',
+        'artist': 'artists',
+        'composer': 'composers',
+        'creator': 'creators',
+        'genre': 'genres',
+    }
+    _format_selection_exts = {
+        'audio': set(MEDIA_EXTENSIONS.common_audio),
+        'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )),
+        'storyboards': set(MEDIA_EXTENSIONS.storyboards),
+    }
+
+    def __init__(self, params=None, auto_init=True):
+        """Create a FileDownloader object with the given options.
+        @param auto_init    Whether to load the default extractors and print header (if verbose).
+                            Set to 'no_verbose_header' to not print the header
+        """
+        if params is None:
+            params = {}
+        self.params = params
+        self._ies = {}
+        self._ies_instances = {}
+        self._pps = {k: [] for k in POSTPROCESS_WHEN}
+        self._printed_messages = set()
+        self._first_webpage_request = True
+        self._post_hooks = []
+        self._progress_hooks = []
+        self._postprocessor_hooks = []
+        self._download_retcode = 0
+        self._num_downloads = 0
+        self._num_videos = 0
+        self._playlist_level = 0
+        self._playlist_urls = set()
+        self.cache = Cache(self)
+        self.__header_cookies = []
+
+        stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
+        self._out_files = Namespace(
+            out=stdout,
+            error=sys.stderr,
+            screen=sys.stderr if self.params.get('quiet') else stdout,
+            console=None if compat_os_name == 'nt' else next(
+                filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
+        )
+
+        try:
+            windows_enable_vt_mode()
+        except Exception as e:
+            self.write_debug(f'Failed to enable VT mode: {e}')
+
+        if self.params.get('no_color'):
+            if self.params.get('color') is not None:
+                self.params.setdefault('_warnings', []).append(
+                    'Overwriting params from "color" with "no_color"')
+            self.params['color'] = 'no_color'
+
+        term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
+        no_color = bool(os.getenv('NO_COLOR'))
+
+        def process_color_policy(stream):
+            stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
+            policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False)
+            if policy in ('auto', None):
+                if term_allow_color and supports_terminal_sequences(stream):
+                    return 'no_color' if no_color else True
+                return False
+            assert policy in ('always', 'never', 'no_color'), policy
+            return {'always': True, 'never': False}.get(policy, policy)
+
+        self._allow_colors = Namespace(**{
+            name: process_color_policy(stream)
+            for name, stream in self._out_files.items_ if name != 'console'
+        })
+
+        system_deprecation = _get_system_deprecation()
+        if system_deprecation:
+            self.deprecated_feature(system_deprecation.replace('\n', '\n '))
+
+        if self.params.get('allow_unplayable_formats'):
+            self.report_warning(
+                f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. '
+                'This is a developer option intended for debugging. \n'
+                ' If you experience any issues while using this option, '
+                f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')
+
+        if self.params.get('bidi_workaround', False):
+            try:
+                import pty
+                master, slave = pty.openpty()
+                width = shutil.get_terminal_size().columns
+                width_args = [] if width is None else ['-w', str(width)]
+                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
+                try:
+                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
+                except OSError:
+                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+                self._output_channel = os.fdopen(master, 'rb')
+            except OSError as ose:
+                if ose.errno == errno.ENOENT:
+                    self.report_warning(
+                        'Could not find fribidi executable, ignoring --bidi-workaround. '
' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise + + self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) + self._load_cookies(self.params['http_headers'].get('Cookie')) # compat + self.params['http_headers'].pop('Cookie', None) + + if auto_init and auto_init != 'no_verbose_header': + self.print_debug_header() + + def check_deprecated(param, option, suggestion): + if self.params.get(param) is not None: + self.report_warning(f'{option} is deprecated. Use {suggestion} instead') + return True + return False + + if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + + check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"') + check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') + check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"') + + for msg in self.params.get('_warnings', []): + self.report_warning(msg) + for msg in self.params.get('_deprecation_warnings', []): + self.deprecated_feature(msg) + + if 'list-formats' in self.params['compat_opts']: + self.params['listformats_table'] = False + + if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: + # nooverwrites was unnecessarily changed to overwrites + # in 0c3d0f51778b153f65c21906031c2e091fcfb641 + # This ensures compatibility with both keys + self.params['overwrites'] = not self.params['nooverwrites'] + elif self.params.get('overwrites') is None: + self.params.pop('overwrites', None) + else: + self.params['nooverwrites'] = not self.params['overwrites'] + + if self.params.get('simulate') is None and any(( + self.params.get('list_thumbnails'), + self.params.get('listformats'), + self.params.get('listsubtitles'), + )): + self.params['simulate'] = 'list_only' + + self.params.setdefault('forceprint', {}) + self.params.setdefault('print_to_file', {}) + + # Compatibility with older syntax + if not isinstance(params['forceprint'], dict): + self.params['forceprint'] = {'video': params['forceprint']} + + if auto_init: + self.add_default_info_extractors() + + if (sys.platform != 'win32' + and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not self.params.get('restrictfilenames', False)): + # Unicode filesystem API will throw errors (#1474, #13027) + self.report_warning( + 'Assuming --restrict-filenames since file system encoding ' + 'cannot encode all characters. 
' + 'Set the LC_ALL environment variable to fix this.') + self.params['restrictfilenames'] = True + + self._parse_outtmpl() + + # Creating format selector here allows us to catch syntax errors before the extraction + self.format_selector = ( + self.params.get('format') if self.params.get('format') in (None, '-') + else self.params['format'] if callable(self.params['format']) + else self.build_format_selector(self.params['format'])) + + hooks = { + 'post_hooks': self.add_post_hook, + 'progress_hooks': self.add_progress_hook, + 'postprocessor_hooks': self.add_postprocessor_hook, + } + for opt, fn in hooks.items(): + for ph in self.params.get(opt, []): + fn(ph) + + for pp_def_raw in self.params.get('postprocessors', []): + pp_def = dict(pp_def_raw) + when = pp_def.pop('when', 'post_process') + self.add_post_processor( + get_postprocessor(pp_def.pop('key'))(self, **pp_def), + when=when) + + def preload_download_archive(fn): + """Preload the archive, if any is specified""" + archive = set() + if fn is None: + return archive + elif not is_path_like(fn): + return fn + + self.write_debug(f'Loading archive file {fn!r}') + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + archive.add(line.strip()) + except OSError as ioe: + if ioe.errno != errno.ENOENT: + raise + return archive + + self.archive = preload_download_archive(self.params.get('download_archive')) + + def warn_if_short_id(self, argv): + # short YouTube ID starting with dash? + idxs = [ + i for i, a in enumerate(argv) + if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] + if idxs: + correct_argv = ( + ['yt-dlp'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] + ) + self.report_warning( + 'Long argument string detected. ' + 'Use -- to separate parameters and URLs, like this:\n%s' % + args_to_str(correct_argv)) + + def add_info_extractor(self, ie): + """Add an InfoExtractor object to the end of the list.""" + ie_key = ie.ie_key() + self._ies[ie_key] = ie + if not isinstance(ie, type): + self._ies_instances[ie_key] = ie + ie.set_downloader(self) + + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. 
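+
+ For example (illustrative): get_info_extractor('Youtube') returns the
+ cached YoutubeIE instance, instantiating and registering it first if
+ none exists yet.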
+ """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + + def add_default_info_extractors(self): + """ + Add the InfoExtractors returned by gen_extractors to the end of the list + """ + all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()} + all_ies['end'] = UnsupportedURLIE() + try: + ie_names = orderedSet_from_options( + self.params.get('allowed_extractors', ['default']), { + 'all': list(all_ies), + 'default': [name for name, ie in all_ies.items() if ie._ENABLED], + }, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}') + for name in ie_names: + self.add_info_extractor(all_ies[name]) + self.write_debug(f'Loaded {len(ie_names)} extractors') + + def add_post_processor(self, pp, when='post_process'): + """Add a PostProcessor object to the end of the chain.""" + assert when in POSTPROCESS_WHEN, f'Invalid when={when}' + self._pps[when].append(pp) + pp.set_downloader(self) + + def add_post_hook(self, ph): + """Add the post hook""" + self._post_hooks.append(ph) + + def add_progress_hook(self, ph): + """Add the download progress hook""" + self._progress_hooks.append(ph) + + def add_postprocessor_hook(self, ph): + """Add the postprocessing progress hook""" + self._postprocessor_hooks.append(ph) + for pps in self._pps.values(): + for pp in pps: + pp.add_progress_hook(ph) + + def _bidi_workaround(self, message): + if not hasattr(self, '_output_channel'): + return message + + assert hasattr(self, '_output_process') + assert isinstance(message, str) + line_count = message.count('\n') + 1 + self._output_process.stdin.write((message + '\n').encode()) + self._output_process.stdin.flush() + res = ''.join(self._output_channel.readline().decode() + for _ in range(line_count)) + return res[:-len('\n')] + + def _write_string(self, message, out=None, only_once=False): + if only_once: + if message in self._printed_messages: + return + self._printed_messages.add(message) + write_string(message, out=out, encoding=self.params.get('encoding')) + + def to_stdout(self, message, skip_eol=False, quiet=None): + """Print message to stdout""" + if quiet is not None: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. ' + 'Use "YoutubeDL.to_screen" instead') + if skip_eol is not False: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. 
' + 'Use "YoutubeDL.to_screen" instead') + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) + + def to_screen(self, message, skip_eol=False, quiet=None, only_once=False): + """Print message to screen if not in quiet mode""" + if self.params.get('logger'): + self.params['logger'].debug(message) + return + if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'): + return + self._write_string( + '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), + self._out_files.screen, only_once=only_once) + + def to_stderr(self, message, only_once=False): + """Print message to stderr""" + assert isinstance(message, str) + if self.params.get('logger'): + self.params['logger'].error(message) + else: + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) + + def _send_console_code(self, code): + if compat_os_name == 'nt' or not self._out_files.console: + return + self._write_string(code, self._out_files.console) + + def to_console_title(self, message): + if not self.params.get('consoletitle', False): + return + message = remove_terminal_sequences(message) + if compat_os_name == 'nt': + if ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + else: + self._send_console_code(f'\033]0;{message}\007') + + def save_console_title(self): + if not self.params.get('consoletitle') or self.params.get('simulate'): + return + self._send_console_code('\033[22;0t') # Save the title on stack + + def restore_console_title(self): + if not self.params.get('consoletitle') or self.params.get('simulate'): + return + self._send_console_code('\033[23;0t') # Restore the title from stack + + def __enter__(self): + self.save_console_title() + return self + + def save_cookies(self): + if self.params.get('cookiefile') is not None: + self.cookiejar.save() + + def __exit__(self, *args): + self.restore_console_title() + self.close() + + def close(self): + self.save_cookies() + if '_request_director' in self.__dict__: + self._request_director.close() + del self._request_director + + def trouble(self, message=None, tb=None, is_error=True): + """Determine action to take when a download problem appears. + + Depending on if the downloader has been configured to ignore + download errors or not, this method may throw an exception or + not when errors are found, after printing the message. 
+ + @param tb If given, is additional traceback information + @param is_error Whether to raise error according to ignorerrors + """ + if message is not None: + self.to_stderr(message) + if self.params.get('verbose'): + if tb is None: + if sys.exc_info()[0]: # if .trouble has been called from an except block + tb = '' + if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) + tb += encode_compat_str(traceback.format_exc()) + else: + tb_data = traceback.format_list(traceback.extract_stack()) + tb = ''.join(tb_data) + if tb: + self.to_stderr(tb) + if not is_error: + return + if not self.params.get('ignoreerrors'): + if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + exc_info = sys.exc_info()[1].exc_info + else: + exc_info = sys.exc_info() + raise DownloadError(message, exc_info) + self._download_retcode = 1 + + Styles = Namespace( + HEADERS='yellow', + EMPHASIS='light blue', + FILENAME='green', + ID='green', + DELIM='blue', + ERROR='red', + BAD_FORMAT='light red', + WARNING='yellow', + SUPPRESS='light black', + ) + + def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False): + text = str(text) + if test_encoding: + original_text = text + # handle.encoding can be None. See https://github.com/yt-dlp/yt-dlp/issues/2711 + encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii' + text = text.encode(encoding, 'ignore').decode(encoding) + if fallback is not None and text != original_text: + text = fallback + return format_text(text, f) if allow_colors is True else text if fallback is None else fallback + + def _format_out(self, *args, **kwargs): + return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) + + def _format_screen(self, *args, **kwargs): + return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs) + + def _format_err(self, *args, **kwargs): + return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs) + + def report_warning(self, message, only_once=False): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if self.params.get('logger') is not None: + self.params['logger'].warning(message) + else: + if self.params.get('no_warnings'): + return + self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) + + def deprecation_warning(self, message, *, stacklevel=0): + deprecation_warning( + message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False) + + def deprecated_feature(self, message): + if self.params.get('logger') is not None: + self.params['logger'].warning(f'Deprecated Feature: {message}') + self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True) + + def report_error(self, message, *args, **kwargs): + ''' + Do the same as trouble, but prefixes the message with 'ERROR:', colored + in red if stderr is a tty file. 
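+
+ E.g. (illustrative): report_error('no suitable formats') emits
+ "ERROR: no suitable formats" and then defers to trouble(), so the
+ ignoreerrors semantics above still apply.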
+ ''' + self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs) + + def write_debug(self, message, only_once=False): + '''Log debug message or Print message to stderr''' + if not self.params.get('verbose', False): + return + message = f'[debug] {message}' + if self.params.get('logger'): + self.params['logger'].debug(message) + else: + self.to_stderr(message, only_once) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen('[download] %s has already been downloaded' % file_name) + except UnicodeEncodeError: + self.to_screen('[download] The file has already been downloaded') + + def report_file_delete(self, file_name): + """Report that existing file will be deleted.""" + try: + self.to_screen('Deleting existing file %s' % file_name) + except UnicodeEncodeError: + self.to_screen('Deleting existing file') + + def raise_no_formats(self, info, forced=False, *, msg=None): + has_drm = info.get('_has_drm') + ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg) + msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!' + if forced or not ignored: + raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'], + expected=has_drm or ignored or expected) + else: + self.report_warning(msg) + + def parse_outtmpl(self): + self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version') + self._parse_outtmpl() + return self.params['outtmpl'] + + def _parse_outtmpl(self): + sanitize = IDENTITY + if self.params.get('restrictfilenames'): # Remove spaces in the default template + sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') + + outtmpl = self.params.setdefault('outtmpl', {}) + if not isinstance(outtmpl, dict): + self.params['outtmpl'] = outtmpl = {'default': outtmpl} + outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None}) + + def get_output_path(self, dir_type='', filename=None): + paths = self.params.get('paths', {}) + assert isinstance(paths, dict), '"paths" parameter must be a dictionary' + path = os.path.join( + expand_path(paths.get('home', '').strip()), + expand_path(paths.get(dir_type, '').strip()) if dir_type else '', + filename or '') + return sanitize_path(path, force=self.params.get('windowsfilenames')) + + @staticmethod + def _outtmpl_expandpath(outtmpl): + # expand_path translates '%%' into '%' and '$$' into '$' + # correspondingly that is not what we want since we need to keep + # '%%' intact for template dict substitution step. Working around + # with boundary-alike separator hack. + sep = ''.join(random.choices(string.ascii_letters, k=32)) + outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') + + # outtmpl should be expand_path'ed before template dict substitution + # because meta fields may contain env variables we don't want to + # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and + # title "Hello $PATH", we don't want `$PATH` to be expanded. + return expand_path(outtmpl).replace(sep, '') + + @staticmethod + def escape_outtmpl(outtmpl): + ''' Escape any remaining strings like %s, %abc% etc. 
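+ E.g. (illustrative): '%(title)s 100% done' -> '%(title)s 100%% done';
+ already-valid template keys are left untouched.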
'''
+ return re.sub(
+ STR_FORMAT_RE_TMPL.format('', '(?![%(\0])'),
+ lambda mobj: ('' if mobj.group('has_key') else '%') + mobj.group(0),
+ outtmpl)
+
+ @classmethod
+ def validate_outtmpl(cls, outtmpl):
+ ''' @return None or Exception object '''
+ outtmpl = re.sub(
+ STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'),
+ lambda mobj: f'{mobj.group(0)[:-1]}s',
+ cls._outtmpl_expandpath(outtmpl))
+ try:
+ cls.escape_outtmpl(outtmpl) % collections.defaultdict(int)
+ return None
+ except ValueError as err:
+ return err
+
+ @staticmethod
+ def _copy_infodict(info_dict):
+ info_dict = dict(info_dict)
+ info_dict.pop('__postprocessors', None)
+ info_dict.pop('__pending_error', None)
+ return info_dict
+
+ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
+ """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
+ @param sanitize Whether to sanitize the output as a filename.
+ For backward compatibility, a function can also be passed
+ """
+
+ info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set
+
+ info_dict = self._copy_infodict(info_dict)
+ info_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
+ formatSeconds(info_dict['duration'], '-' if sanitize else ':')
+ if info_dict.get('duration', None) is not None
+ else None)
+ info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads)
+ info_dict['video_autonumber'] = self._num_videos
+ if info_dict.get('resolution') is None:
+ info_dict['resolution'] = self.format_resolution(info_dict, default=None)
+
+ # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
+ # of %(field)s to %(field)0Nd for backward compatibility
+ field_size_compat_map = {
+ 'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0),
+ 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
+ 'autonumber': self.params.get('autonumber_size') or 5,
+ }
+
+ TMPL_DICT = {}
+ EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]'))
+ MATH_FUNCTIONS = {
+ '+': float.__add__,
+ '-': float.__sub__,
+ '*': float.__mul__,
+ }
+ # Field is of the form key1.key2...
+ # where keys (except first) can be string, int, slice or "{field, ...}"
+ FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'}
+ FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % {
+ 'inner': FIELD_INNER_RE,
+ 'field': rf'\w*(?:\.{FIELD_INNER_RE})*'
+ }
+ MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
+ MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
+ INTERNAL_FORMAT_RE = re.compile(rf'''(?xs)
+ (?P<negate>-)?
+ (?P<fields>{FIELD_RE})
+ (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
+ (?:>(?P<strf_format>.+?))?
+ (?P<remaining>
+ (?P<alternate>(?<!\\),[^|&)]+)?
+ (?:&(?P<replacement>.*?))?
+ (?:\|(?P<default>.*?))?
+ )$''')
+
+ def _from_user_input(field):
+ if field == ':':
+ return ...
+ elif ':' in field: + return slice(*map(int_or_none, field.split(':'))) + elif int_or_none(field) is not None: + return int(field) + return field + + def _traverse_infodict(fields): + fields = [f for x in re.split(r'\.({.+?})\.?', fields) + for f in ([x] if x.startswith('{') else x.split('.'))] + for i in (0, -1): + if fields and not fields[i]: + fields.pop(i) + + for i, f in enumerate(fields): + if not f.startswith('{'): + fields[i] = _from_user_input(f) + continue + assert f.endswith('}'), f'No closing brace for {f} in {fields}' + fields[i] = {k: list(map(_from_user_input, k.split('.'))) for k in f[1:-1].split(',')} + + return traverse_obj(info_dict, fields, traverse_string=True) + + def get_value(mdict): + # Object traversal + value = _traverse_infodict(mdict['fields']) + # Negative + if mdict['negate']: + value = float_or_none(value) + if value is not None: + value *= -1 + # Do maths + offset_key = mdict['maths'] + if offset_key: + value = float_or_none(value) + operator = None + while offset_key: + item = re.match( + MATH_FIELD_RE if operator else MATH_OPERATORS_RE, + offset_key).group(0) + offset_key = offset_key[len(item):] + if operator is None: + operator = MATH_FUNCTIONS[item] + continue + item, multiplier = (item[1:], -1) if item[0] == '-' else (item, 1) + offset = float_or_none(item) + if offset is None: + offset = float_or_none(_traverse_infodict(item)) + try: + value = operator(value, multiplier * offset) + except (TypeError, ZeroDivisionError): + return None + operator = None + # Datetime formatting + if mdict['strf_format']: + value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ',')) + + # XXX: Workaround for https://github.com/yt-dlp/yt-dlp/issues/4485 + if sanitize and value == '': + value = None + return value + + na = self.params.get('outtmpl_na_placeholder', 'NA') + + def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + return sanitize_filename(str(value), restricted=restricted, is_id=( + bool(re.search(r'(^|[_.])id(\.|$)', key)) + if 'filename-sanitization' in self.params['compat_opts'] + else NO_DEFAULT)) + + sanitizer = sanitize if callable(sanitize) else filename_sanitizer + sanitize = bool(sanitize) + + def _dumpjson_default(obj): + if isinstance(obj, (set, LazyList)): + return list(obj) + return repr(obj) + + class _ReplacementFormatter(string.Formatter): + def get_field(self, field_name, args, kwargs): + if field_name.isdigit(): + return args[0], -1 + raise ValueError('Unsupported field') + + replacement_formatter = _ReplacementFormatter() + + def create_key(outer_mobj): + if not outer_mobj.group('has_key'): + return outer_mobj.group(0) + key = outer_mobj.group('key') + mobj = re.match(INTERNAL_FORMAT_RE, key) + value, replacement, default, last_field = None, None, na, '' + while mobj: + mobj = mobj.groupdict() + default = mobj['default'] if mobj['default'] is not None else default + value = get_value(mobj) + last_field, replacement = mobj['fields'], mobj['replacement'] + if value is None and mobj['alternate']: + mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:]) + else: + break + + if None not in (value, replacement): + try: + value = replacement_formatter.format(replacement, value) + except ValueError: + value, default = None, na + + fmt = outer_mobj.group('format') + if fmt == 's' and last_field in field_size_compat_map.keys() and isinstance(value, int): + fmt = f'0{field_size_compat_map[last_field]:d}d' + + flags = outer_mobj.group('conversion') or '' + str_fmt = f'{fmt[:-1]}s' + if value is None: + 
value, fmt = default, 's' + elif fmt[-1] == 'l': # list + delim = '\n' if '#' in flags else ', ' + value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt + elif fmt[-1] == 'j': # json + value, fmt = json.dumps( + value, default=_dumpjson_default, + indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt + elif fmt[-1] == 'h': # html + value, fmt = escapeHTML(str(value)), str_fmt + elif fmt[-1] == 'q': # quoted + value = map(str, variadic(value) if '#' in flags else [value]) + value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt + elif fmt[-1] == 'B': # bytes + value = f'%{str_fmt}'.encode() % str(value).encode() + value, fmt = value.decode('utf-8', 'ignore'), 's' + elif fmt[-1] == 'U': # unicode normalized + value, fmt = unicodedata.normalize( + # "+" = compatibility equivalence, "#" = NFD + 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), + value), str_fmt + elif fmt[-1] == 'D': # decimal suffix + num_fmt, fmt = fmt[:-1].replace('#', ''), 's' + value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s', + factor=1024 if '#' in flags else 1000) + elif fmt[-1] == 'S': # filename sanitization + value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt + elif fmt[-1] == 'c': + if value: + value = str(value)[0] + else: + fmt = str_fmt + elif fmt[-1] not in 'rsa': # numeric + value = float_or_none(value) + if value is None: + value, fmt = default, 's' + + if sanitize: + # If value is an object, sanitize might convert it to a string + # So we convert it to repr first + if fmt[-1] == 'r': + value, fmt = repr(value), str_fmt + elif fmt[-1] == 'a': + value, fmt = ascii(value), str_fmt + if fmt[-1] in 'csra': + value = sanitizer(last_field, value) + + key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) + TMPL_DICT[key] = value + return '{prefix}%({key}){fmt}'.format(key=key, fmt=fmt, prefix=outer_mobj.group('prefix')) + + return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT + + def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs): + outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) + return self.escape_outtmpl(outtmpl) % info_dict + + def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): + assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' + if outtmpl is None: + outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default']) + try: + outtmpl = self._outtmpl_expandpath(outtmpl) + filename = self.evaluate_outtmpl(outtmpl, info_dict, True) + if not filename: + return None + + if tmpl_type in ('', 'temp'): + final_ext, ext = self.params.get('final_ext'), info_dict.get('ext') + if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'): + filename = replace_extension(filename, ext, final_ext) + elif tmpl_type: + force_ext = OUTTMPL_TYPES[tmpl_type] + if force_ext: + filename = replace_extension(filename, force_ext, info_dict.get('ext')) + + # https://github.com/blackjack4494/youtube-dlc/issues/85 + trim_file_name = self.params.get('trim_file_name', False) + if trim_file_name: + no_ext, *ext = filename.rsplit('.', 2) + filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.') + + return filename + except ValueError as err: + self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') + return None + + def prepare_filename(self, info_dict, 
dir_type='', *, outtmpl=None, warn=False):
+ """Generate the output filename"""
+ if outtmpl:
+ assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
+ dir_type = None
+ filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
+ if not filename and dir_type not in ('', 'temp'):
+ return ''
+
+ if warn:
+ if not self.params.get('paths'):
+ pass
+ elif filename == '-':
+ self.report_warning('--paths is ignored when outputting to stdout', only_once=True)
+ elif os.path.isabs(filename):
+ self.report_warning('--paths is ignored since an absolute path is given in output template', only_once=True)
+ if filename == '-' or not filename:
+ return filename
+
+ return self.get_output_path(dir_type, filename)
+
+ def _match_entry(self, info_dict, incomplete=False, silent=False):
+ """Returns None if the file should be downloaded"""
+ _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video')
+ assert incomplete or _type == 'video', 'Only video result can be considered complete'
+
+ video_title = info_dict.get('title', info_dict.get('id', 'entry'))
+
+ def check_filter():
+ if _type in ('playlist', 'multi_video'):
+ return
+ elif _type in ('url', 'url_transparent') and not try_call(
+ lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])):
+ return
+
+ if 'title' in info_dict:
+ # This can happen when we're just evaluating the playlist
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle:
+ if not re.search(matchtitle, title, re.IGNORECASE):
+ return '"' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle:
+ if re.search(rejecttitle, title, re.IGNORECASE):
+ return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+
+ date = info_dict.get('upload_date')
+ if date is not None:
+ dateRange = self.params.get('daterange', DateRange())
+ if date not in dateRange:
+ return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}'
+ view_count = info_dict.get('view_count')
+ if view_count is not None:
+ min_views = self.params.get('min_views')
+ if min_views is not None and view_count < min_views:
+ return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+ max_views = self.params.get('max_views')
+ if max_views is not None and view_count > max_views:
+ return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
+ if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
+ return 'Skipping "%s" because it is age restricted' % video_title
+
+ match_filter = self.params.get('match_filter')
+ if match_filter is None:
+ return None
+
+ cancelled = None
+ try:
+ try:
+ ret = match_filter(info_dict, incomplete=incomplete)
+ except TypeError:
+ # For backward compatibility
+ ret = None if incomplete else match_filter(info_dict)
+ except DownloadCancelled as err:
+ if err.msg is not NO_DEFAULT:
+ raise
+ ret, cancelled = err.msg, err
+
+ if ret is NO_DEFAULT:
+ while True:
+ filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
+ reply = input(self._format_screen(
+ f'Download "{filename}"? 
(Y/n): ', self.Styles.EMPHASIS)).lower().strip() + if reply in {'y', ''}: + return None + elif reply == 'n': + if cancelled: + raise type(cancelled)(f'Skipping {video_title}') + return f'Skipping {video_title}' + return ret + + if self.in_download_archive(info_dict): + reason = ''.join(( + format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '), + format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '), + 'has already been recorded in the archive')) + break_opt, break_err = 'break_on_existing', ExistingVideoReached + else: + try: + reason = check_filter() + except DownloadCancelled as e: + reason, break_opt, break_err = e.msg, 'match_filter', type(e) + else: + break_opt, break_err = 'break_on_reject', RejectedVideoReached + if reason is not None: + if not silent: + self.to_screen('[download] ' + reason) + if self.params.get(break_opt, False): + raise break_err() + return reason + + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value in extra_info.items(): + info_dict.setdefault(key, value) + + def extract_info(self, url, download=True, ie_key=None, extra_info=None, + process=True, force_generic_extractor=False): + """ + Extract and return the information dictionary of the URL + + Arguments: + @param url URL to extract + + Keyword arguments: + @param download Whether to download videos + @param process Whether to resolve all unresolved references (URLs, playlist items). + Must be True for download to work + @param ie_key Use only the extractor with this key + + @param extra_info Dictionary containing the extra values to add to the info (For internal use only) + @force_generic_extractor Force using the generic extractor (Deprecated; use ie_key='Generic') + """ + + if extra_info is None: + extra_info = {} + + if not ie_key and force_generic_extractor: + ie_key = 'Generic' + + if ie_key: + ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {} + else: + ies = self._ies + + for key, ie in ies.items(): + if not ie.suitable(url): + continue + + if not ie.working(): + self.report_warning('The program functionality for this site has been marked as broken, ' + 'and will probably not work.') + + temp_id = ie.get_temp_id(url) + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): + self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: ' + 'has already been recorded in the archive') + if self.params.get('break_on_existing', False): + raise ExistingVideoReached() + break + return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process) + else: + extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default']) + self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}', + tb=False if extractors_restricted else None) + + def _handle_extraction_exceptions(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + while True: + try: + return func(self, *args, **kwargs) + except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): + raise + except ReExtractInfo as e: + if e.expected: + self.to_screen(f'{e}; Re-extracting data') + else: + self.to_stderr('\r') + self.report_warning(f'{e}; Re-extracting data') + continue + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' 
% ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + except ExtractorError as e: # An error we somewhat expected + self.report_error(str(e), e.format_traceback()) + except Exception as e: + if self.params.get('ignoreerrors'): + self.report_error(str(e), tb=encode_compat_str(traceback.format_exc())) + else: + raise + break + return wrapper + + def _wait_for_video(self, ie_result={}): + if (not self.params.get('wait_for_video') + or ie_result.get('_type', 'video') != 'video' + or ie_result.get('formats') or ie_result.get('url')): + return + + format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1] + last_msg = '' + + def progress(msg): + nonlocal last_msg + full_msg = f'{msg}\n' + if not self.params.get('noprogress'): + full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r' + elif last_msg: + return + self.to_screen(full_msg, skip_eol=True) + last_msg = msg + + min_wait, max_wait = self.params.get('wait_for_video') + diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time()) + if diff is None and ie_result.get('live_status') == 'is_upcoming': + diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0) + self.report_warning('Release time of video is not known') + elif ie_result and (diff or 0) <= 0: + self.report_warning('Video should already be available according to extracted info') + diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf')) + self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now') + + wait_till = time.time() + diff + try: + while True: + diff = wait_till - time.time() + if diff <= 0: + progress('') + raise ReExtractInfo('[wait] Wait period ended', expected=True) + progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}') + time.sleep(1) + except KeyboardInterrupt: + progress('') + raise ReExtractInfo('[wait] Interrupted by user', expected=True) + except BaseException as e: + if not isinstance(e, ReExtractInfo): + self.to_screen('') + raise + + def _load_cookies(self, data, *, autoscope=True): + """Loads cookies from a `Cookie` header + + This tries to work around the security vulnerability of passing cookies to every domain. + See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + + @param data The Cookie header as string to load the cookies from + @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains + If `True`, save cookies for later to be stored in the jar with a limited scope + If a URL, save cookies in the jar with the domain of the URL + """ + for cookie in LenientSimpleCookie(data).values(): + if autoscope and any(cookie.values()): + raise ValueError('Invalid syntax in Cookie Header') + + domain = cookie.get('domain') or '' + expiry = cookie.get('expires') + if expiry == '': # 0 is valid + expiry = None + prepared_cookie = http.cookiejar.Cookie( + cookie.get('version') or 0, cookie.key, cookie.value, None, False, + domain, True, True, cookie.get('path') or '', bool(cookie.get('path')), + cookie.get('secure') or False, expiry, False, None, None, {}) + + if domain: + self.cookiejar.set_cookie(prepared_cookie) + elif autoscope is True: + self.deprecated_feature( + 'Passing cookies as a header is a potential security risk; ' + 'they will be scoped to the domain of the downloaded urls. 
' + 'Please consider loading cookies from a file or browser instead.') + self.__header_cookies.append(prepared_cookie) + elif autoscope: + self.report_warning( + 'The extractor result contains an unscoped cookie as an HTTP header. ' + f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}', + only_once=True) + self._apply_header_cookies(autoscope, [prepared_cookie]) + else: + self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping', + tb=False, is_error=False) + + def _apply_header_cookies(self, url, cookies=None): + """Applies stray header cookies to the provided url + + This loads header cookies and scopes them to the domain provided in `url`. + While this is not ideal, it helps reduce the risk of them being sent + to an unintended destination while mostly maintaining compatibility. + """ + parsed = urllib.parse.urlparse(url) + if not parsed.hostname: + return + + for cookie in map(copy.copy, cookies or self.__header_cookies): + cookie.domain = f'.{parsed.hostname}' + self.cookiejar.set_cookie(cookie) + + @_handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + self._apply_header_cookies(url) + + try: + ie_result = ie.extract(url) + except UserNotLive as e: + if process: + if self.params.get('wait_for_video'): + self.report_warning(e) + self._wait_for_video() + raise + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}') + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + if extra_info.get('original_url'): + ie_result.setdefault('original_url', extra_info['original_url']) + self.add_default_extra_info(ie_result, ie, url) + if process: + self._wait_for_video(ie_result) + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result + + def add_default_extra_info(self, ie_result, ie, url): + if url is not None: + self.add_extra_info(ie_result, { + 'webpage_url': url, + 'original_url': url, + }) + webpage_url = ie_result.get('webpage_url') + if webpage_url: + self.add_extra_info(ie_result, { + 'webpage_url_basename': url_basename(webpage_url), + 'webpage_url_domain': get_domain(webpage_url), + }) + if ie is not None: + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'extractor_key': ie.ie_key(), + }) + + def process_ie_result(self, ie_result, download=True, extra_info=None): + """ + Take the result of the ie(may be modified) and resolve all unresolved + references (URLs, playlist items). + + It will also download the videos if 'download'. + Returns the resolved ie_result. 
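+
+ For example (illustrative): a {'_type': 'url'} result is re-extracted
+ through extract_info(), while 'playlist'/'multi_video' results are
+ expanded entry by entry via __process_playlist().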
+ """ + if extra_info is None: + extra_info = {} + result_type = ie_result.get('_type', 'video') + + if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url( + ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https') + if ie_result.get('original_url') and not extra_info.get('original_url'): + extra_info = {'original_url': ie_result['original_url'], **extra_info} + + extract_flat = self.params.get('extract_flat', False) + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) + or extract_flat is True): + info_copy = ie_result.copy() + ie = try_get(ie_result.get('ie_key'), self.get_info_extractor) + if ie and not ie_result.get('id'): + info_copy['id'] = ie.get_temp_id(ie_result['url']) + self.add_default_extra_info(info_copy, ie, ie_result['url']) + self.add_extra_info(info_copy, extra_info) + info_copy, _ = self.pre_process(info_copy) + self._fill_common_fields(info_copy, False) + self.__forced_printings(info_copy) + self._raise_pending_errors(info_copy) + if self.params.get('force_write_download_archive', False): + self.record_download_archive(info_copy) + return ie_result + + if result_type == 'video': + self.add_extra_info(ie_result, extra_info) + ie_result = self.process_video_result(ie_result, download=download) + self._raise_pending_errors(ie_result) + additional_urls = (ie_result or {}).get('additional_urls') + if additional_urls: + # TODO: Improve MetadataParserPP to allow setting a list + if isinstance(additional_urls, str): + additional_urls = [additional_urls] + self.to_screen( + '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls))) + self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls)) + ie_result['additional_entries'] = [ + self.extract_info( + url, download, extra_info=extra_info, + force_generic_extractor=self.params.get('force_generic_extractor')) + for url in additional_urls + ] + return ie_result + elif result_type == 'url': + # We have to add extra_info to the results because it may be + # contained in a playlist + return self.extract_info( + ie_result['url'], download, + ie_key=ie_result.get('ie_key'), + extra_info=extra_info) + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + + # extract_info may return None when ignoreerrors is enabled and + # extraction failed with an error, don't crash and return early + # in this case + if not info: + return info + + exempted_fields = {'_type', 'url', 'ie_key'} + if not ie_result.get('section_end') and ie_result.get('section_start') is None: + # For video clips, the id etc of the clip extractor should be used + exempted_fields |= {'id', 'extractor', 'extractor_key'} + + new_result = info.copy() + new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields)) + + # Extracted info may not be a video result (i.e. + # info.get('_type', 'video') != video) but rather an url or + # url_transparent. In such cases outer metadata (from ie_result) + # should be propagated to inner one (info). For this to happen + # _type of info should be overridden with url_transparent. This + # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163. 
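+ # Illustrative sketch: if the embedding page supplied {'title': 'Outer'}
+ # and the inner extraction returned another url/url_transparent result,
+ # re-marking new_result as 'url_transparent' below keeps that outer
+ # title attached through the next round of resolution.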
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
+
+ return self.process_ie_result(
+ new_result, download=download, extra_info=extra_info)
+ elif result_type in ('playlist', 'multi_video'):
+ # Protect from infinite recursion due to recursively nested playlists
+ # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
+ webpage_url = ie_result.get('webpage_url') # Playlists may not have webpage_url
+ if webpage_url and webpage_url in self._playlist_urls:
+ self.to_screen(
+ '[download] Skipping already downloaded playlist: %s'
+ % (ie_result.get('title') or ie_result.get('id')))
+ return
+
+ self._playlist_level += 1
+ self._playlist_urls.add(webpage_url)
+ self._fill_common_fields(ie_result, False)
+ self._sanitize_thumbnails(ie_result)
+ try:
+ return self.__process_playlist(ie_result, download)
+ finally:
+ self._playlist_level -= 1
+ if not self._playlist_level:
+ self._playlist_urls.clear()
+ elif result_type == 'compat_list':
+ self.report_warning(
+ 'Extractor %s returned a compat_list result. '
+ 'It needs to be updated.' % ie_result.get('extractor'))
+
+ def _fixup(r):
+ self.add_extra_info(r, {
+ 'extractor': ie_result['extractor'],
+ 'webpage_url': ie_result['webpage_url'],
+ 'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'webpage_url_domain': get_domain(ie_result['webpage_url']),
+ 'extractor_key': ie_result['extractor_key'],
+ })
+ return r
+ ie_result['entries'] = [
+ self.process_ie_result(_fixup(r), download, extra_info)
+ for r in ie_result['entries']
+ ]
+ return ie_result
+ else:
+ raise Exception('Invalid result type: %s' % result_type)
+
+ def _ensure_dir_exists(self, path):
+ return make_dir(path, self.report_error)
+
+ @staticmethod
+ def _playlist_infodict(ie_result, strict=False, **kwargs):
+ info = {
+ 'playlist_count': ie_result.get('playlist_count'),
+ 'playlist': ie_result.get('title') or ie_result.get('id'),
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
+ 'playlist_uploader': ie_result.get('uploader'),
+ 'playlist_uploader_id': ie_result.get('uploader_id'),
+ **kwargs,
+ }
+ if strict:
+ return info
+ if ie_result.get('webpage_url'):
+ info.update({
+ 'webpage_url': ie_result['webpage_url'],
+ 'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'webpage_url_domain': get_domain(ie_result['webpage_url']),
+ })
+ return {
+ **info,
+ 'playlist_index': 0,
+ '__last_playlist_index': max(ie_result.get('requested_entries') or (0, 0)),
+ 'extractor': ie_result['extractor'],
+ 'extractor_key': ie_result['extractor_key'],
+ }
+
+ def __process_playlist(self, ie_result, download):
+ """Process each entry in the playlist"""
+ assert ie_result['_type'] in ('playlist', 'multi_video')
+
+ common_info = self._playlist_infodict(ie_result, strict=True)
+ title = common_info.get('playlist') or ''
+ if self._match_entry(common_info, incomplete=True) is not None:
+ return
+ self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
+
+ all_entries = PlaylistEntries(self, ie_result)
+ entries = orderedSet(all_entries.get_requested_items(), lazy=True)
+
+ lazy = self.params.get('lazy_playlist')
+ if lazy:
+ resolved_entries, n_entries = [], 'N/A'
+ ie_result['requested_entries'], ie_result['entries'] = None, None
+ else:
+ entries = resolved_entries = list(entries)
+ n_entries = len(resolved_entries)
+ ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
+ if not ie_result.get('playlist_count'):
+ # Better to do this after potentially exhausting entries
+ ie_result['playlist_count'] = all_entries.get_full_count()
+
+ extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
+ ie_copy = collections.ChainMap(ie_result, extra)
+
+ _infojson_written = False
+ write_playlist_files = self.params.get('allow_playlist_files', True)
+ if write_playlist_files and self.params.get('list_thumbnails'):
+ self.list_thumbnails(ie_result)
+ if write_playlist_files and not self.params.get('simulate'):
+ _infojson_written = self._write_info_json(
+ 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
+ if _infojson_written is None:
+ return
+ if self._write_description('playlist', ie_result,
+ self.prepare_filename(ie_copy, 'pl_description')) is None:
+ return
+ # TODO: This should be passed to ThumbnailsConvertor if necessary
+ self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
+
+ if lazy:
+ if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
+ self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
+ elif self.params.get('playlistreverse'):
+ entries.reverse()
+ elif self.params.get('playlistrandom'):
+ random.shuffle(entries)
+
+ self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items'
+ f'{format_field(ie_result, "playlist_count", " of %s")}')
+
+ keep_resolved_entries = self.params.get('extract_flat') != 'discard'
+ if self.params.get('extract_flat') == 'discard_in_playlist':
+ keep_resolved_entries = ie_result['_type'] != 'playlist'
+ if keep_resolved_entries:
+ self.write_debug('The information of all playlist entries will be held in memory')
+
+ failures = 0
+ max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
+ for i, (playlist_index, entry) in enumerate(entries):
+ if lazy:
+ resolved_entries.append((playlist_index, entry))
+ if not entry:
+ continue
+
+ entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
+ if not lazy and 'playlist-index' in self.params['compat_opts']:
+ playlist_index = ie_result['requested_entries'][i]
+
+ entry_copy = collections.ChainMap(entry, {
+ **common_info,
+ 'n_entries': int_or_none(n_entries),
+ 'playlist_index': playlist_index,
+ 'playlist_autonumber': i + 1,
+ })
+
+ if self._match_entry(entry_copy, incomplete=True) is not None:
+ # For compatibility with youtube-dl. See https://github.com/yt-dlp/yt-dlp/issues/4369
+ resolved_entries[i] = (playlist_index, NO_DEFAULT)
+ continue
+
+ self.to_screen('[download] Downloading item %s of %s' % (
+ self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
+
+ entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({
+ 'playlist_index': playlist_index,
+ 'playlist_autonumber': i + 1,
+ }, extra))
+ if not entry_result:
+ failures += 1
+ if failures >= max_failures:
+ self.report_error(
+ f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
+ break
+ if keep_resolved_entries:
+ resolved_entries[i] = (playlist_index, entry_result)
+
+ # Update with processed data
+ ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT]
+ ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
+ if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))):
+ # Do not set for full playlist
+ ie_result.pop('requested_entries')
+
+ # Write the updated info to json
+ if _infojson_written is True and self._write_info_json(
+ 'updated playlist', ie_result,
+ self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
+ return
+
+ ie_result = self.run_all_pps('playlist', ie_result)
+ self.to_screen(f'[download] Finished downloading playlist: {title}')
+ return ie_result
+
+ @_handle_extraction_exceptions
+ def __process_iterable_entry(self, entry, download, extra_info):
+ return self.process_ie_result(
+ entry, download=download, extra_info=extra_info)
+
+ def _build_format_filter(self, filter_spec):
+ " Returns a function to filter the formats according to the filter_spec "
+
+ OPERATORS = {
+ '<': operator.lt,
+ '<=': operator.le,
+ '>': operator.gt,
+ '>=': operator.ge,
+ '=': operator.eq,
+ '!=': operator.ne,
+ }
+ operator_rex = re.compile(r'''(?x)\s*
+ (?P<key>[\w.-]+)\s*
+ (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+ (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
+ ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+ m = operator_rex.fullmatch(filter_spec)
+ if m:
+ try:
+ comparison_value = int(m.group('value'))
+ except ValueError:
+ comparison_value = parse_filesize(m.group('value'))
+ if comparison_value is None:
+ comparison_value = parse_filesize(m.group('value') + 'B')
+ if comparison_value is None:
+ raise ValueError(
+ 'Invalid value %r in format specification %r' % (
+ m.group('value'), filter_spec))
+ op = OPERATORS[m.group('op')]
+
+ if not m:
+ STR_OPERATORS = {
+ '=': operator.eq,
+ '^=': lambda attr, value: attr.startswith(value),
+ '$=': lambda attr, value: attr.endswith(value),
+ '*=': lambda attr, value: value in attr,
+ '~=': lambda attr, value: value.search(attr) is not None
+ }
+ str_operator_rex = re.compile(r'''(?x)\s*
+ (?P<key>[a-zA-Z0-9._-]+)\s*
+ (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
+ (?P<quote>["'])?
+ (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
+ (?(quote)(?P=quote))\s*
+ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
+ m = str_operator_rex.fullmatch(filter_spec)
+ if m:
+ if m.group('op') == '~=':
+ comparison_value = re.compile(m.group('value'))
+ else:
+ comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
+ str_op = STR_OPERATORS[m.group('op')]
+ if m.group('negation'):
+ op = lambda attr, value: not str_op(attr, value)
+ else:
+ op = str_op
+
+ if not m:
+ raise SyntaxError('Invalid filter specification %r' % filter_spec)
+
+ def _filter(f):
+ actual_value = f.get(m.group('key'))
+ if actual_value is None:
+ return m.group('none_inclusive')
+ return op(actual_value, comparison_value)
+ return _filter
+
+ def _check_formats(self, formats):
+ for f in formats:
+ self.to_screen('[info] Testing format %s' % f['format_id'])
+ path = self.get_output_path('temp')
+ if not self._ensure_dir_exists(f'{path}/'):
+ continue
+ temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
+ temp_file.close()
+ try:
+ success, _ = self.dl(temp_file.name, f, test=True)
+ except (DownloadError, OSError, ValueError) + network_exceptions:
+ success = False
+ finally:
+ if os.path.exists(temp_file.name):
+ try:
+ os.remove(temp_file.name)
+ except OSError:
+ self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
+ if success:
+ yield f
+ else:
+ self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
+
+ def _default_format_spec(self, info_dict, download=True):
+
+ def can_merge():
+ merger = FFmpegMergerPP(self)
+ return merger.available and merger.can_merge()
+
+ prefer_best = (
+ not self.params.get('simulate')
+ and download
+ and (
+ not can_merge()
+ or info_dict.get('is_live') and not self.params.get('live_from_start')
+ or self.params['outtmpl']['default'] == '-'))
+ compat = (
+ prefer_best
+ or self.params.get('allow_multiple_audio_streams', False)
+ or 'format-spec' in self.params['compat_opts'])
+
+ return (
+ 'best/bestvideo+bestaudio' if prefer_best
+ else 'bestvideo*+bestaudio/best' if not compat
+ else 'bestvideo+bestaudio/best')
+
+ def build_format_selector(self, format_spec):
+ def syntax_error(note, start):
+ message = (
+ 'Invalid format specification: '
+ '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1]))
+ return SyntaxError(message)
+
+ PICKFIRST = 'PICKFIRST'
+ MERGE = 'MERGE'
+ SINGLE = 'SINGLE'
+ GROUP = 'GROUP'
+ FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
+
+ allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
+ 'video': self.params.get('allow_multiple_video_streams', False)}
+
+ def _parse_filter(tokens):
+ filter_parts = []
+ for type, string_, start, _, _ in tokens:
+ if type == tokenize.OP and string_ == ']':
+ return ''.join(filter_parts)
+ else:
+ filter_parts.append(string_)
+
+ def _remove_unused_ops(tokens):
+ # Remove operators that we don't use and join them with the surrounding strings.
+ # E.g. 
'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string_, start, end, line in tokens: + if type == tokenize.OP and string_ == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string_, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string_, start, end, line in tokens: + yield type, string_, start, end, line + if type == tokenize.OP and string_ == ']': + break + elif type == tokenize.OP and string_ in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string_, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string_ + last_start = start + last_end = end + else: + last_string += string_ + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string_, start, _, _ in tokens: + # ENCODING is only defined in Python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string_, []) + elif type == tokenize.OP: + if string_ == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string_ in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string_ == ',': + tokens.restore_last_token() + break + elif string_ == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) + selectors.append(current_selector) + current_selector = None + elif string_ == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string_ == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string_ == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string_ == '+': + if not current_selector: + raise syntax_error('Unexpected "+"', start) + selector_1 = current_selector + selector_2 = _parse_format_selection(tokens, inside_merge=True) + if not selector_2: + raise syntax_error('Expected a selector', start) + current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) + else: + raise syntax_error(f'Operator not recognized: "{string_}"', start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _merge(formats_pair): + format_1, format_2 = formats_pair + + formats_info = [] + formats_info.extend(format_1.get('requested_formats', (format_1,))) + formats_info.extend(format_2.get('requested_formats', (format_2,))) + + if not allow_multiple_streams['video'] or not allow_multiple_streams['audio']: + 
get_no_more = {'video': False, 'audio': False} + for (i, fmt_info) in enumerate(formats_info): + if fmt_info.get('acodec') == fmt_info.get('vcodec') == 'none': + formats_info.pop(i) + continue + for aud_vid in ['audio', 'video']: + if not allow_multiple_streams[aud_vid] and fmt_info.get(aud_vid[0] + 'codec') != 'none': + if get_no_more[aud_vid]: + formats_info.pop(i) + break + get_no_more[aud_vid] = True + + if len(formats_info) == 1: + return formats_info[0] + + video_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('vcodec') != 'none'] + audio_fmts = [fmt_info for fmt_info in formats_info if fmt_info.get('acodec') != 'none'] + + the_only_video = video_fmts[0] if len(video_fmts) == 1 else None + the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None + + output_ext = get_compatible_ext( + vcodecs=[f.get('vcodec') for f in video_fmts], + acodecs=[f.get('acodec') for f in audio_fmts], + vexts=[f['ext'] for f in video_fmts], + aexts=[f['ext'] for f in audio_fmts], + preferences=(try_call(lambda: self.params['merge_output_format'].split('/')) + or self.params.get('prefer_free_formats') and ('webm', 'mkv'))) + + filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) + + new_dict = { + 'requested_formats': formats_info, + 'format': '+'.join(filtered('format')), + 'format_id': '+'.join(filtered('format_id')), + 'ext': output_ext, + 'protocol': '+'.join(map(determine_protocol, formats_info)), + 'language': '+'.join(orderedSet(filtered('language'))) or None, + 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None, + 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None, + 'tbr': sum(filtered('tbr', 'vbr', 'abr')), + } + + if the_only_video: + new_dict.update({ + 'width': the_only_video.get('width'), + 'height': the_only_video.get('height'), + 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video), + 'fps': the_only_video.get('fps'), + 'dynamic_range': the_only_video.get('dynamic_range'), + 'vcodec': the_only_video.get('vcodec'), + 'vbr': the_only_video.get('vbr'), + 'stretched_ratio': the_only_video.get('stretched_ratio'), + 'aspect_ratio': the_only_video.get('aspect_ratio'), + }) + + if the_only_audio: + new_dict.update({ + 'acodec': the_only_audio.get('acodec'), + 'abr': the_only_audio.get('abr'), + 'asr': the_only_audio.get('asr'), + 'audio_channels': the_only_audio.get('audio_channels') + }) + + return new_dict + + def _check_formats(formats): + if self.params.get('check_formats') == 'selected': + yield from self._check_formats(formats) + return + elif (self.params.get('check_formats') is not None + or self.params.get('allow_unplayable_formats')): + yield from formats + return + + for f in formats: + if f.get('has_drm') or f.get('__needs_testing'): + yield from self._check_formats([f]) + else: + yield f + + def _build_selector_function(selector): + if isinstance(selector, list): # , + fs = [_build_selector_function(s) for s in selector] + + def selector_function(ctx): + for f in fs: + yield from f(ctx) + return selector_function + + elif selector.type == GROUP: # () + selector_function = _build_selector_function(selector.selector) + + elif selector.type == PICKFIRST: # / + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(ctx): + for f in fs: + picked_formats = list(f(ctx)) + if picked_formats: + return picked_formats + return [] + + elif selector.type == MERGE: # + + selector_1, selector_2 = map(_build_selector_function, 
selector.selector) + + def selector_function(ctx): + for pair in itertools.product(selector_1(ctx), selector_2(ctx)): + yield _merge(pair) + + elif selector.type == SINGLE: # atom + format_spec = selector.selector or 'best' + + # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector + if format_spec == 'all': + def selector_function(ctx): + yield from _check_formats(ctx['formats'][::-1]) + elif format_spec == 'mergeall': + def selector_function(ctx): + formats = list(_check_formats( + f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none')) + if not formats: + return + merged_format = formats[-1] + for f in formats[-2::-1]: + merged_format = _merge((merged_format, f)) + yield merged_format + + else: + format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1 + mobj = re.match( + r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$', + format_spec) + if mobj is not None: + format_idx = int_or_none(mobj.group('n'), default=1) + format_reverse = mobj.group('bw')[0] == 'b' + format_type = (mobj.group('type') or [None])[0] + not_format_type = {'v': 'a', 'a': 'v'}.get(format_type) + format_modified = mobj.group('mod') is not None + + format_fallback = not format_type and not format_modified # for b, w + _filter_f = ( + (lambda f: f.get('%scodec' % format_type) != 'none') + if format_type and format_modified # bv*, ba*, wv*, wa* + else (lambda f: f.get('%scodec' % not_format_type) == 'none') + if format_type # bv, ba, wv, wa + else (lambda f: f.get('vcodec') != 'none' and f.get('acodec') != 'none') + if not format_modified # b, w + else lambda f: True) # b*, w* + filter_f = lambda f: _filter_f(f) and ( + f.get('vcodec') != 'none' or f.get('acodec') != 'none') + else: + if format_spec in self._format_selection_exts['audio']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' + elif format_spec in self._format_selection_exts['video']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none' + seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none' + elif format_spec in self._format_selection_exts['storyboards']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none' + else: + filter_f = lambda f: f.get('format_id') == format_spec # id + + def selector_function(ctx): + formats = list(ctx['formats']) + matches = list(filter(filter_f, formats)) if filter_f is not None else formats + if not matches: + if format_fallback and ctx['incomplete_formats']: + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) best/worst will fallback to + # best/worst {video,audio}-only format + matches = list(filter(lambda f: f.get('vcodec') != 'none' or f.get('acodec') != 'none', formats)) + elif seperate_fallback and not ctx['has_merged_format']: + # for compatibility with youtube-dl when there is no pre-merged format + matches = list(filter(seperate_fallback, formats)) + matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) + try: + yield matches[format_idx - 1] + except LazyList.IndexError: + return + + filters = [self._build_format_filter(f) for f in selector.filters] + + def final_selector(ctx): + ctx_copy = dict(ctx) + for _filter in filters: + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) + return final_selector + + # HACK: Python
3.12 changed the underlying parser, rendering '7_a' invalid + # Prefix numbers with random letters to avoid it being classified as a number + # See: https://github.com/yt-dlp/yt-dlp/pulls/8797 + # TODO: Implement parser not reliant on tokenize.tokenize + prefix = ''.join(random.choices(string.ascii_letters, k=32)) + stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode()) + try: + tokens = list(_remove_unused_ops( + token._replace(string=token.string.replace(prefix, '')) + for token in tokenize.tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator: + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) + + def _calc_headers(self, info_dict, load_cookies=False): + res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers')) + clean_headers(res) + + if load_cookies: # For --load-info-json + self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat + self._load_cookies(info_dict.get('cookies'), autoscope=False) + # The `Cookie` header is removed to prevent leaks and unscoped cookies. + # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + res.pop('Cookie', None) + cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) + if cookies: + encoder = LenientSimpleCookie() + values = [] + for cookie in cookies: + _, value = encoder.value_encode(cookie.value) + values.append(f'{cookie.name}={value}') + if cookie.domain: + values.append(f'Domain={cookie.domain}') + if cookie.path: + values.append(f'Path={cookie.path}') + if cookie.secure: + values.append('Secure') + if cookie.expires: + values.append(f'Expires={cookie.expires}') + if cookie.version: + values.append(f'Version={cookie.version}') + info_dict['cookies'] = '; '.join(values) + + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + + return res + + def _calc_cookies(self, url): + self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version') + return self.cookiejar.get_cookie_header(url) + + def _sort_thumbnails(self, thumbnails): + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', + t.get('url'))) + + def _sanitize_thumbnails(self, info_dict): + thumbnails = info_dict.get('thumbnails') + if thumbnails is None: + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] + if not thumbnails: + return + + def check_thumbnails(thumbnails): + for t in thumbnails: + self.to_screen(f'[info] Testing thumbnail {t["id"]}') + try: + self.urlopen(HEADRequest(t['url'])) + except network_exceptions as err: + self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. 
Skipping...') + continue + yield t + + self._sort_thumbnails(thumbnails) + for i, t in enumerate(thumbnails): + if t.get('id') is None: + t['id'] = '%d' % i + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + t['url'] = sanitize_url(t['url']) + + if self.params.get('check_formats') is True: + info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True) + else: + info_dict['thumbnails'] = thumbnails + + def _fill_common_fields(self, info_dict, final=True): + # TODO: move sanitization here + if final: + title = info_dict['fulltitle'] = info_dict.get('title') + if not title: + if title == '': + self.write_debug('Extractor gave empty title. Creating a generic title') + else: + self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') + info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}' + + if info_dict.get('duration') is not None: + info_dict['duration_string'] = formatSeconds(info_dict['duration']) + + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ('modified_timestamp', 'modified_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + with contextlib.suppress(ValueError, OverflowError, OSError): + upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + + if not info_dict.get('release_year'): + info_dict['release_year'] = traverse_obj(info_dict, ('release_date', {lambda x: int(x[:4])})) + + live_keys = ('is_live', 'was_live') + live_status = info_dict.get('live_status') + if live_status is None: + for key in live_keys: + if info_dict.get(key) is False: + continue + if info_dict.get(key): + live_status = key + break + if all(info_dict.get(key) is False for key in live_keys): + live_status = 'not_live' + if live_status: + info_dict['live_status'] = live_status + for key in live_keys: + if info_dict.get(key) is None: + info_dict[key] = (live_status == key) + if live_status == 'post_live': + info_dict['was_live'] = True + + # Auto generate title fields corresponding to the *_number fields when missing + # in order to always have clean titles. This is very common for TV series. 
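+ # e.g. {'episode_number': 3} without 'episode' gives 'episode': 'Episode 3'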
+ for field in ('chapter', 'season', 'episode'): + if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + + for old_key, new_key in self._deprecated_multivalue_fields.items(): + if new_key in info_dict and old_key in info_dict: + if '_version' not in info_dict: # HACK: Do not warn when using --load-info-json + self.deprecation_warning(f'Do not return {old_key!r} when {new_key!r} is present') + elif old_value := info_dict.get(old_key): + info_dict[new_key] = old_value.split(', ') + elif new_value := info_dict.get(new_key): + info_dict[old_key] = ', '.join(v.replace(',', '\N{FULLWIDTH COMMA}') for v in new_value) + + def _raise_pending_errors(self, info): + err = info.pop('__pending_error', None) + if err: + self.report_error(err, tb=False) + + def sort_formats(self, info_dict): + formats = self._get_formats(info_dict) + formats.sort(key=FormatSorter( + self, info_dict.get('_format_sort_fields') or []).calculate_preference) + + def process_video_result(self, info_dict, download=True): + assert info_dict.get('_type', 'video') == 'video' + self._num_videos += 1 + + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor']) + elif not info_dict.get('id'): + raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor']) + + def report_force_conversion(field, field_not, conversion): + self.report_warning( + '"%s" field is not %s - forcing %s conversion, there is an error in extractor' + % (field, field_not, conversion)) + + def sanitize_string_field(info, string_field): + field = info.get(string_field) + if field is None or isinstance(field, str): + return + report_force_conversion(string_field, 'a string', 'string') + info[string_field] = str(field) + + def sanitize_numeric_fields(info): + for numeric_field in self._NUMERIC_FIELDS: + field = info.get(numeric_field) + if field is None or isinstance(field, (int, float)): + continue + report_force_conversion(numeric_field, 'numeric', 'int') + info[numeric_field] = int_or_none(field) + + sanitize_string_field(info_dict, 'id') + sanitize_numeric_fields(info_dict) + if info_dict.get('section_end') and info_dict.get('section_start') is not None: + info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3) + if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None): + self.report_warning('"duration" field is negative, there is an error in extractor') + + chapters = info_dict.get('chapters') or [] + if chapters and chapters[0].get('start_time'): + chapters.insert(0, {'start_time': 0}) + + dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')} + for idx, (prev, current, next_) in enumerate(zip( + (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1): + if current.get('start_time') is None: + current['start_time'] = prev.get('end_time') + if not current.get('end_time'): + current['end_time'] = next_.get('start_time') + if not current.get('title'): + current['title'] = f'<Untitled Chapter {idx}>' + + if 'playlist' not in info_dict: + # It isn't part of a playlist + info_dict['playlist'] = None + info_dict['playlist_index'] = None + + self._sanitize_thumbnails(info_dict) + + thumbnail = info_dict.get('thumbnail') + thumbnails = info_dict.get('thumbnails') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: + info_dict['thumbnail'] = thumbnails[-1]['url'] + + if
info_dict.get('display_id') is None and 'id' in info_dict: + info_dict['display_id'] = info_dict['id'] + + self._fill_common_fields(info_dict) + + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') + subtitles = info_dict.get('subtitles') + + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], subtitles, automatic_captions) + + formats = self._get_formats(info_dict) + + # Backward compatibility with InfoExtractor._sort_formats + field_preference = (formats or [{}])[0].pop('__sort_fields', None) + if field_preference: + info_dict['_format_sort_fields'] = field_preference + + info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it + f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None + if not self.params.get('allow_unplayable_formats'): + formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe'] + + if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats): + self.report_warning( + f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}' + 'only images are available for download. Use --list-formats to see them'.capitalize()) + + get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) + if not get_from_start: + info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + if info_dict.get('is_live') and formats: + formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] + if get_from_start and not formats: + self.raise_no_formats(info_dict, msg=( + '--live-from-start is passed, but there are no formats that can be downloaded from the start. 
' + 'If you want to download from the current time, use --no-live-from-start')) + + def is_wellformed(f): + url = f.get('url') + if not url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + self.raise_no_formats(info_dict) + + for format in formats: + sanitize_string_field(format, 'format_id') + sanitize_numeric_fields(format) + format['url'] = sanitize_url(format['url']) + if format.get('ext') is None: + format['ext'] = determine_ext(format['url']).lower() + if format.get('protocol') is None: + format['protocol'] = determine_protocol(format) + if format.get('resolution') is None: + format['resolution'] = self.format_resolution(format, default=None) + if format.get('dynamic_range') is None and format.get('vcodec') != 'none': + format['dynamic_range'] = 'SDR' + if format.get('aspect_ratio') is None: + format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) + # For fragmented formats, "tbr" is often max bitrate and not average + if (('manifest-filesize-approx' in self.params['compat_opts'] or not format.get('manifest_url')) + and info_dict.get('duration') and format.get('tbr') + and not format.get('filesize') and not format.get('filesize_approx')): + format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) + format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True) + + # Safeguard against old/insecure infojson when using --load-info-json + if info_dict.get('http_headers'): + info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers']) + info_dict['http_headers'].pop('Cookie', None) + + # This is copied to http_headers by the above _calc_headers and can now be removed + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] + + self.sort_formats({ + 'formats': formats, + '_format_sort_fields': info_dict.get('_format_sort_fields') + }) + + # Sanitize and group by format_id + formats_dict = {} + for i, format in enumerate(formats): + if not format.get('format_id'): + format['format_id'] = str(i) + else: + # Sanitize format_id from characters used in format selector expression + format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) + formats_dict.setdefault(format['format_id'], []).append(format) + + # Make sure all formats have unique format_id + common_exts = set(itertools.chain(*self._format_selection_exts.values())) + for format_id, ambiguous_formats in formats_dict.items(): + ambigious_id = len(ambiguous_formats) > 1 + for i, format in enumerate(ambiguous_formats): + if ambigious_id: + format['format_id'] = '%s-%d' % (format_id, i) + # Ensure there is no conflict between id and ext in format selection + # See https://github.com/yt-dlp/yt-dlp/issues/1282 + if format['format_id'] != format['ext'] and format['format_id'] in common_exts: + format['format_id'] = 'f%s' % format['format_id'] + + if format.get('format') is None: + format['format'] = '{id} - {res}{note}'.format( + id=format['format_id'], + res=self.format_resolution(format), + note=format_field(format, 'format_note', ' (%s)'), + ) + + if self.params.get('check_formats') is True: + formats = LazyList(self._check_formats(formats[::-1]), reverse=True) + + if not formats or formats[0] 
is not info_dict: + # only set the 'formats' fields if the original info_dict list them + # otherwise we end up with a circular reference, the first (and unique) + # element in the 'formats' field in info_dict is info_dict itself, + # which can't be exported to json + info_dict['formats'] = formats + + info_dict, _ = self.pre_process(info_dict) + + if self._match_entry(info_dict, incomplete=self._format_fields) is not None: + return info_dict + + self.post_extract(info_dict) + info_dict, _ = self.pre_process(info_dict, 'after_filter') + + # The pre-processors may have modified the formats + formats = self._get_formats(info_dict) + + list_only = self.params.get('simulate') == 'list_only' + interactive_format_selection = not list_only and self.format_selector == '-' + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + if self.params.get('listsubtitles'): + if 'automatic_captions' in info_dict: + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') + if self.params.get('listformats') or interactive_format_selection: + self.list_formats(info_dict) + if list_only: + # Without this printing, -F --print-json will not work + self.__forced_printings(info_dict) + return info_dict + + format_selector = self.format_selector + while True: + if interactive_format_selection: + req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS) + + '(Press ENTER for default, or Ctrl+C to quit)' + + self._format_screen(': ', self.Styles.EMPHASIS)) + try: + format_selector = self.build_format_selector(req_format) if req_format else None + except SyntaxError as err: + self.report_error(err, tb=False, is_error=False) + continue + + if format_selector is None: + req_format = self._default_format_spec(info_dict, download=download) + self.write_debug(f'Default format spec: {req_format}') + format_selector = self.build_format_selector(req_format) + + formats_to_download = list(format_selector({ + 'formats': formats, + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video + or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio + })) + if interactive_format_selection and not formats_to_download: + self.report_error('Requested format is not available', tb=False, is_error=False) + continue + break + + if not formats_to_download: + if not self.params.get('ignore_no_formats_error'): + raise ExtractorError( + 'Requested format is not available. Use --list-formats for a list of available formats', + expected=True, video_id=info_dict['id'], ie=info_dict['extractor']) + self.report_warning('Requested format is not available') + # Process what we can, even without any available formats. 
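+ # The single empty dict below lets process_info still perform its
+ # metadata-only side effects (e.g. writing the info-json)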
+ formats_to_download = [{}] + + requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self)) + best_format, downloaded_formats = formats_to_download[-1], [] + if download: + if best_format and requested_ranges: + def to_screen(*msg): + self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') + + to_screen(f'Downloading {len(formats_to_download)} format(s):', + (f['format_id'] for f in formats_to_download)) + if requested_ranges != ({}, ): + to_screen(f'Downloading {len(requested_ranges)} time ranges:', + (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges)) + max_downloads_reached = False + + for fmt, chapter in itertools.product(formats_to_download, requested_ranges): + new_info = self._copy_infodict(info_dict) + new_info.update(fmt) + offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') + end_time = offset + min(chapter.get('end_time', duration), duration) + # duration may not be accurate. So allow deviations <1sec + if end_time == float('inf') or end_time > offset + duration + 1: + end_time = None + if chapter or offset: + new_info.update({ + 'section_start': offset + chapter.get('start_time', 0), + 'section_end': end_time, + 'section_title': chapter.get('title'), + 'section_number': chapter.get('index'), + }) + downloaded_formats.append(new_info) + try: + self.process_info(new_info) + except MaxDownloadsReached: + max_downloads_reached = True + self._raise_pending_errors(new_info) + # Remove copied info + for key, val in tuple(new_info.items()): + if info_dict.get(key) == val: + new_info.pop(key) + if max_downloads_reached: + break + + write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats} + assert write_archive.issubset({True, False, 'ignore'}) + if True in write_archive and False not in write_archive: + self.record_download_archive(info_dict) + + info_dict['requested_downloads'] = downloaded_formats + info_dict = self.run_all_pps('after_video', info_dict) + if max_downloads_reached: + raise MaxDownloadsReached() + + # We update the info dict with the selected best quality format (backwards compatibility) + info_dict.update(best_format) + return info_dict + + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs, normal_sub_langs = {}, [] + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + normal_sub_langs = tuple(normal_subtitles.keys()) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if not available_subs or ( + not self.params.get('writesubtitles') + and not self.params.get('writeautomaticsub')): + return None + + all_sub_langs = tuple(available_subs.keys()) + if self.params.get('allsubtitles', False): + requested_langs = all_sub_langs + elif self.params.get('subtitleslangs', False): + try: + requested_langs = orderedSet_from_options( + self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}') + else: + requested_langs = LazyList(itertools.chain( + ['en'] if 'en' in normal_sub_langs else [], + filter(lambda f: f.startswith('en'), normal_sub_langs), + ['en'] if 'en' in all_sub_langs else [], + filter(lambda f: 
f.startswith('en'), all_sub_langs), + normal_sub_langs, all_sub_langs, + ))[:1] + if requested_langs: + self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}') + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning(f'{lang} subtitles not available for {video_id}') + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + + def _forceprint(self, key, info_dict): + if info_dict is None: + return + info_copy = info_dict.copy() + info_copy.setdefault('filename', self.prepare_filename(info_dict)) + if info_dict.get('requested_formats') is not None: + # For RTMP URLs, also include the playpath + info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) + elif info_dict.get('url'): + info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '') + info_copy['formats_table'] = self.render_formats_table(info_dict) + info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict) + info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles')) + info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions')) + + def format_tmpl(tmpl): + mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl) + if not mobj: + return tmpl + + fmt = '%({})s' + if tmpl.startswith('{'): + tmpl, fmt = f'.{tmpl}', '%({})j' + if tmpl.endswith('='): + tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' + return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(','))) + + for tmpl in self.params['forceprint'].get(key, []): + self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy)) + + for tmpl, file_tmpl in self.params['print_to_file'].get(key, []): + filename = self.prepare_filename(info_dict, outtmpl=file_tmpl) + tmpl = format_tmpl(tmpl) + self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') + if self._ensure_dir_exists(filename): + with open(filename, 'a', encoding='utf-8', newline='') as f: + f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep) + + return info_copy + + def __forced_printings(self, info_dict, filename=None, incomplete=True): + if (self.params.get('forcejson') + or self.params['forceprint'].get('video') + or self.params['print_to_file'].get('video')): + self.post_extract(info_dict) + if filename: + info_dict['filename'] = filename + info_copy = self._forceprint('video', info_dict) + + def print_field(field, actual_field=None, optional=False): + if actual_field is None: + actual_field = field + if self.params.get(f'force{field}') and ( + info_copy.get(field) is not None or (not optional and not incomplete)): + self.to_stdout(info_copy[actual_field]) + + print_field('title') + print_field('id') + print_field('url', 'urls') + print_field('thumbnail', optional=True) + print_field('description', optional=True) + print_field('filename') + if self.params.get('forceduration') and info_copy.get('duration') is not None: +
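+ # duration is special-cased since it is printed via formatSeconds
+ # (e.g. 123 -> '2:03') rather than as the raw field value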
self.to_stdout(formatSeconds(info_copy['duration'])) + print_field('format') + + if self.params.get('forcejson'): + self.to_stdout(json.dumps(self.sanitize_info(info_dict))) + + def dl(self, name, info, subtitle=False, test=False): + if not info.get('url'): + self.raise_no_formats(info, True) + + if test: + verbose = self.params.get('verbose') + params = { + 'test': True, + 'quiet': self.params.get('quiet') or not verbose, + 'verbose': verbose, + 'noprogress': not verbose, + 'nopart': True, + 'skip_unavailable_fragments': False, + 'keep_fragments': False, + 'overwrites': True, + '_no_ytdl_file': True, + } + else: + params = self.params + fd = get_suitable_downloader(info, params, to_stdout=(name == '-'))(self, params) + if not test: + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + urls = '", "'.join( + (f['url'].split(',')[0] + ',' if f['url'].startswith('data:') else f['url']) + for f in info.get('requested_formats', []) or [info]) + self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"') + + # Note: Ideally info should be a deep-copied so that hooks cannot modify it. + # But it may contain objects that are not deep-copyable + new_info = self._copy_infodict(info) + if new_info.get('http_headers') is None: + new_info['http_headers'] = self._calc_headers(new_info) + return fd.download(name, new_info, subtitle) + + def existing_file(self, filepaths, *, default_overwrite=True): + existing_files = list(filter(os.path.exists, orderedSet(filepaths))) + if existing_files and not self.params.get('overwrites', default_overwrite): + return existing_files[0] + + for file in existing_files: + self.report_file_delete(file) + os.remove(file) + return None + + def process_info(self, info_dict): + """Process a single resolved IE result. (Modifies it in-place)""" + + assert info_dict.get('_type', 'video') == 'video' + original_infodict = info_dict + + if 'format' not in info_dict and 'ext' in info_dict: + info_dict['format'] = info_dict['ext'] + + if self._match_entry(info_dict) is not None: + info_dict['__write_download_archive'] = 'ignore' + return + + # Does nothing under normal operation - for backward compatibility of process_info + self.post_extract(info_dict) + + def replace_info_dict(new_info): + nonlocal info_dict + if new_info == info_dict: + return + info_dict.clear() + info_dict.update(new_info) + + new_info, _ = self.pre_process(info_dict, 'video') + replace_info_dict(new_info) + self._num_downloads += 1 + + # info_dict['_filename'] needs to be set for backward compatibility + info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True) + temp_filename = self.prepare_filename(info_dict, 'temp') + files_to_move = {} + + # Forced printings + self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) + + def check_max_downloads(): + if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'): + raise MaxDownloadsReached() + + if self.params.get('simulate'): + info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') + check_max_downloads() + return + + if full_filename is None: + return + if not self._ensure_dir_exists(encodeFilename(full_filename)): + return + if not self._ensure_dir_exists(encodeFilename(temp_filename)): + return + + if self._write_description('video', info_dict, + self.prepare_filename(info_dict, 'description')) is None: + return + + sub_files = self._write_subtitles(info_dict, temp_filename) + if sub_files is None: + return + 
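+ # The subtitle files written above are queued for the final move
+ # performed later by MoveFilesAfterDownloadPP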
files_to_move.update(dict(sub_files)) + + thumb_files = self._write_thumbnails( + 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail')) + if thumb_files is None: + return + files_to_move.update(dict(thumb_files)) + + infofn = self.prepare_filename(info_dict, 'infojson') + _infojson_written = self._write_info_json('video', info_dict, infofn) + if _infojson_written: + info_dict['infojson_filename'] = infofn + # For backward compatibility, even though it was a private field + info_dict['__infojson_filename'] = infofn + elif _infojson_written is None: + return + + # Note: Annotations are deprecated + annofn = None + if self.params.get('writeannotations', False): + annofn = self.prepare_filename(info_dict, 'annotation') + if annofn: + if not self._ensure_dir_exists(encodeFilename(annofn)): + return + if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)): + self.to_screen('[info] Video annotations are already present') + elif not info_dict.get('annotations'): + self.report_warning('There are no annotations to write.') + else: + try: + self.to_screen('[info] Writing video annotations to: ' + annofn) + with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning('There are no annotations to write.') + except OSError: + self.report_error('Cannot write annotations file: ' + annofn) + return + + # Write internet shortcut files + def _write_link_file(link_type): + url = try_get(info_dict['webpage_url'], iri_to_uri) + if not url: + self.report_warning( + f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown') + return True + linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) + if not self._ensure_dir_exists(encodeFilename(linkfn)): + return False + if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): + self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') + return True + try: + self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') + with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: + template_vars = {'url': url} + if link_type == 'desktop': + template_vars['filename'] = linkfn[:-(len(link_type) + 1)] + linkfile.write(LINK_TEMPLATES[link_type] % template_vars) + except OSError: + self.report_error(f'Cannot write internet shortcut {linkfn}') + return False + return True + + write_links = { + 'url': self.params.get('writeurllink'), + 'webloc': self.params.get('writewebloclink'), + 'desktop': self.params.get('writedesktoplink'), + } + if self.params.get('writelink'): + link_type = ('webloc' if sys.platform == 'darwin' + else 'desktop' if sys.platform.startswith('linux') + else 'url') + write_links[link_type] = True + + if any(should_write and not _write_link_file(link_type) + for link_type, should_write in write_links.items()): + return + + new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) + replace_info_dict(new_info) + + if self.params.get('skip_download'): + info_dict['filepath'] = temp_filename + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__files_to_move'] = files_to_move + replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)) + 
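+ # With --skip-download, only the side files written above (info-json,
+ # thumbnails, subtitles, ...) are moved to the final directory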
info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') + else: + # Download + info_dict.setdefault('__postprocessors', []) + try: + + def existing_video_file(*filepaths): + ext = info_dict.get('ext') + converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext) + file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)), + default_overwrite=False) + if file: + info_dict['ext'] = os.path.splitext(file)[1][1:] + return file + + fd, success = None, True + if info_dict.get('protocol') or info_dict.get('url'): + fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') + if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( + info_dict.get('section_start') or info_dict.get('section_end')): + msg = ('This format cannot be partially downloaded' if FFmpegFD.available() + else 'You have requested downloading the video partially, but ffmpeg is not installed') + self.report_error(f'{msg}. Aborting') + return + + if info_dict.get('requested_formats') is not None: + old_ext = info_dict['ext'] + if self.params.get('merge_output_format') is None: + if (info_dict['ext'] == 'webm' + and info_dict.get('thumbnails') + # check with type instead of pp_key, __name__, or isinstance + # since we dont want any custom PPs to trigger this + and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721 + info_dict['ext'] = 'mkv' + self.report_warning( + 'webm doesn\'t support embedding a thumbnail, mkv will be used') + new_ext = info_dict['ext'] + + def correct_ext(filename, ext=new_ext): + if filename == '-': + return filename + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext in (old_ext, new_ext) + else filename) + return f'{filename_wo_ext}.{ext}' + + # Ensure filename always has a correct extension for successful merge + full_filename = correct_ext(full_filename) + temp_filename = correct_ext(temp_filename) + dl_filename = existing_video_file(full_filename, temp_filename) + + info_dict['__real_download'] = False + # NOTE: Copy so that original format dicts are not modified + info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats'])) + + merger = FFmpegMergerPP(self) + downloaded = [] + if dl_filename is not None: + self.report_file_already_downloaded(dl_filename) + elif fd: + for f in info_dict['requested_formats'] if fd != FFmpegFD else []: + f['filepath'] = fname = prepend_extension( + correct_ext(temp_filename, info_dict['ext']), + 'f%s' % f['format_id'], info_dict['ext']) + downloaded.append(fname) + info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats']) + success, real_download = self.dl(temp_filename, info_dict) + info_dict['__real_download'] = real_download + else: + if self.params.get('allow_unplayable_formats'): + self.report_warning( + 'You have requested merging of multiple formats ' + 'while also allowing unplayable formats to be downloaded. ' + 'The formats won\'t be merged to prevent data corruption.') + elif not merger.available: + msg = 'You have requested merging of multiple formats but ffmpeg is not installed' + if not self.params.get('ignoreerrors'): + self.report_error(f'{msg}. Aborting due to --abort-on-error') + return + self.report_warning(f'{msg}. 
The formats won\'t be merged') + + if temp_filename == '-': + reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params) + else 'but the formats are incompatible for simultaneous download' if merger.available + else 'but ffmpeg is not installed') + self.report_warning( + f'You have requested downloading multiple formats to stdout {reason}. ' + 'The formats will be streamed one after the other') + fname = temp_filename + for f in info_dict['requested_formats']: + new_info = dict(info_dict) + del new_info['requested_formats'] + new_info.update(f) + if temp_filename != '-': + fname = prepend_extension( + correct_ext(temp_filename, new_info['ext']), + 'f%s' % f['format_id'], new_info['ext']) + if not self._ensure_dir_exists(fname): + return + f['filepath'] = fname + downloaded.append(fname) + partial_success, real_download = self.dl(fname, new_info) + info_dict['__real_download'] = info_dict['__real_download'] or real_download + success = success and partial_success + + if downloaded and merger.available and not self.params.get('allow_unplayable_formats'): + info_dict['__postprocessors'].append(merger) + info_dict['__files_to_merge'] = downloaded + # Even if there were no downloads, it is being merged only now + info_dict['__real_download'] = True + else: + for file in downloaded: + files_to_move[file] = None + else: + # Just a single file + dl_filename = existing_video_file(full_filename, temp_filename) + if dl_filename is None or dl_filename == temp_filename: + # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part. + # So we should try to resume the download + success, real_download = self.dl(temp_filename, info_dict) + info_dict['__real_download'] = real_download + else: + self.report_file_already_downloaded(dl_filename) + + dl_filename = dl_filename or temp_filename + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + + except network_exceptions as err: + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) + return + except OSError as err: + raise UnavailableVideoError(err) + except (ContentTooShortError, ) as err: + self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})') + return + + self._raise_pending_errors(info_dict) + if success and full_filename != '-': + + def fixup(): + do_fixup = True + fixup_policy = self.params.get('fixup') + vid = info_dict['id'] + + if fixup_policy in ('ignore', 'never'): + return + elif fixup_policy == 'warn': + do_fixup = 'warn' + elif fixup_policy != 'force': + assert fixup_policy in ('detect_or_warn', None) + if not info_dict.get('__real_download'): + do_fixup = False + + def ffmpeg_fixup(cndn, msg, cls): + if not (do_fixup and cndn): + return + elif do_fixup == 'warn': + self.report_warning(f'{vid}: {msg}') + return + pp = cls(self) + if pp.available: + info_dict['__postprocessors'].append(pp) + else: + self.report_warning(f'{vid}: {msg}. 
Install ffmpeg to fix this automatically') + + stretched_ratio = info_dict.get('stretched_ratio') + ffmpeg_fixup(stretched_ratio not in (1, None), + f'Non-uniform pixel ratio {stretched_ratio}', + FFmpegFixupStretchedPP) + + downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None + downloader = downloader.FD_NAME if downloader else None + + ext = info_dict.get('ext') + postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any(( + isinstance(pp, FFmpegVideoConvertorPP) + and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None) + ) for pp in self._pps['post_process']) + + if not postprocessed_by_ffmpeg: + ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a' + and info_dict.get('container') == 'm4a_dash', + 'writing DASH m4a. Only some players support this container', + FFmpegFixupM4aPP) + ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') + or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, + 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', + FFmpegFixupM3u8PP) + ffmpeg_fixup(downloader == 'dashsegments' + and (info_dict.get('is_live') or info_dict.get('is_dash_periods')), + 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) + + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP) + + fixup() + try: + replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move)) + except PostProcessingError as err: + self.report_error('Postprocessing: %s' % str(err)) + return + try: + for ph in self._post_hooks: + ph(info_dict['filepath']) + except Exception as err: + self.report_error('post hooks: %s' % str(err)) + return + info_dict['__write_download_archive'] = True + + assert info_dict is original_infodict # Make sure the info_dict was modified in-place + if self.params.get('force_write_download_archive'): + info_dict['__write_download_archive'] = True + check_max_downloads() + + def __download_wrapper(self, func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + res = func(*args, **kwargs) + except UnavailableVideoError as e: + self.report_error(e) + except DownloadCancelled as e: + self.to_screen(f'[info] {e}') + if not self.params.get('break_per_url'): + raise + self._num_downloads = 0 + else: + if self.params.get('dump_single_json', False): + self.post_extract(res) + self.to_stdout(json.dumps(self.sanitize_info(res))) + return wrapper + + def download(self, url_list): + """Download a given list of URLs.""" + url_list = variadic(url_list) # Passing a single URL is a common mistake + outtmpl = self.params['outtmpl']['default'] + if (len(url_list) > 1 + and outtmpl != '-' + and '%' not in outtmpl + and self.params.get('max_downloads') != 1): + raise SameFileError(outtmpl) + + for url in url_list: + self.__download_wrapper(self.extract_info)( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) + + return self._download_retcode + + def download_with_info_file(self, info_filename): + with contextlib.closing(fileinput.FileInput( + [info_filename], mode='r', + openhook=fileinput.hook_encoded('utf-8'))) as f: + # FileInput doesn't have a read method, we can't call json.load + infos = [self.sanitize_info(info, self.params.get('clean_infojson', True)) + for info in variadic(json.loads('\n'.join(f)))] + for info in infos: + try: + 
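+ # Feed the loaded info dict back through the normal processing
+ # pipeline; the except clauses below fall back to re-extracting
+ # from webpage_url when the stored info can no longer be processed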
self.__download_wrapper(self.process_ie_result)(info, download=True) + except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: + if not isinstance(e, EntryNotInPlaylist): + self.to_stderr('\r') + webpage_url = info.get('webpage_url') + if webpage_url is None: + raise + self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}') + self.download([webpage_url]) + except ExtractorError as e: + self.report_error(e) + return self._download_retcode + + @staticmethod + def sanitize_info(info_dict, remove_private_keys=False): + ''' Sanitize the infodict for converting to json ''' + if info_dict is None: + return info_dict + info_dict.setdefault('epoch', int(time.time())) + info_dict.setdefault('_type', 'video') + info_dict.setdefault('_version', { + 'version': __version__, + 'current_git_head': current_git_head(), + 'release_git_head': RELEASE_GIT_HEAD, + 'repository': ORIGIN, + }) + + if remove_private_keys: + reject = lambda k, v: v is None or k.startswith('__') or k in { + 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', + 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url', + 'playlist_autonumber', + } + else: + reject = lambda k, v: False + + def filter_fn(obj): + if isinstance(obj, dict): + return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)} + elif isinstance(obj, (list, tuple, set, LazyList)): + return list(map(filter_fn, obj)) + elif obj is None or isinstance(obj, (str, int, float, bool)): + return obj + else: + return repr(obj) + + return filter_fn(info_dict) + + @staticmethod + def filter_requested_info(info_dict, actually_filter=True): + ''' Alias of sanitize_info for backward compatibility ''' + return YoutubeDL.sanitize_info(info_dict, actually_filter) + + def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None): + for filename in set(filter(None, files_to_delete)): + if msg: + self.to_screen(msg % filename) + try: + os.remove(filename) + except OSError: + self.report_warning(f'Unable to delete file {filename}') + if filename in info.get('__files_to_move', []): # NB: Delete even if None + del info['__files_to_move'][filename] + + @staticmethod + def post_extract(info_dict): + def actual_post_extract(info_dict): + if info_dict.get('_type') in ('playlist', 'multi_video'): + for video_dict in info_dict.get('entries', {}): + actual_post_extract(video_dict or {}) + return + + post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {}) + info_dict.update(post_extractor()) + + actual_post_extract(info_dict or {}) + + def run_pp(self, pp, infodict): + files_to_delete = [] + if '__files_to_move' not in infodict: + infodict['__files_to_move'] = {} + try: + files_to_delete, infodict = pp.run(infodict) + except PostProcessingError as e: + # Must be True and not 'only_download' + if self.params.get('ignoreerrors') is True: + self.report_error(e) + return infodict + raise + + if not files_to_delete: + return infodict + if self.params.get('keepvideo', False): + for f in files_to_delete: + infodict['__files_to_move'].setdefault(f, '') + else: + self._delete_downloaded_files( + *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') + return infodict + + def run_all_pps(self, key, info, *, additional_pps=None): + if key != 'video': + self._forceprint(key, info) + for pp in (additional_pps or []) + self._pps[key]: + info = self.run_pp(pp, info) + return info + + def pre_process(self, ie_info, key='pre_process', 
files_to_move=None): + info = dict(ie_info) + info['__files_to_move'] = files_to_move or {} + try: + info = self.run_all_pps(key, info) + except PostProcessingError as err: + msg = f'Preprocessing: {err}' + info.setdefault('__pending_error', msg) + self.report_error(msg, is_error=False) + return info, info.pop('__files_to_move', None) + + def post_process(self, filename, info, files_to_move=None): + """Run all the postprocessors on the given file.""" + info['filepath'] = filename + info['__files_to_move'] = files_to_move or {} + info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors')) + info = self.run_pp(MoveFilesAfterDownloadPP(self), info) + del info['__files_to_move'] + return self.run_all_pps('after_move', info) + + def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist + if extractor is None: + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie_key, ie in self._ies.items(): + if ie.suitable(url): + extractor = ie_key + break + else: + return + return make_archive_id(extractor, video_id) + + def in_download_archive(self, info_dict): + if not self.archive: + return False + + vid_ids = [self._make_archive_id(info_dict)] + vid_ids.extend(info_dict.get('_old_archive_ids') or []) + return any(id_ in self.archive for id_ in vid_ids) + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = self._make_archive_id(info_dict) + assert vid_id + + self.write_debug(f'Adding to archive: {vid_id}') + if is_path_like(fn): + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') + self.archive.add(vid_id) + + @staticmethod + def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none' and format.get('acodec') != 'none': + return 'audio only' + if format.get('resolution') is not None: + return format['resolution'] + if format.get('width') and format.get('height'): + return '%dx%d' % (format['width'], format['height']) + elif format.get('height'): + return '%sp' % format['height'] + elif format.get('width'): + return '%dx?' 
% format['width'] + return default + + def _list_format_headers(self, *headers): + if self.params.get('listformats_table', True) is not False: + return [self._format_out(header, self.Styles.HEADERS) for header in headers] + return headers + + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported)' + if fdict.get('language'): + if res: + res += ' ' + res += '[%s]' % fdict['language'] + if fdict.get('format_note') is not None: + if res: + res += ' ' + res += fdict['format_note'] + if fdict.get('tbr') is not None: + if res: + res += ', ' + res += '%4dk' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None + and fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] + if fdict.get('vbr') is not None: + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('fps') is not None: + if res: + res += ', ' + res += '%sfps' % fdict['fps'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) + return res + + def _get_formats(self, info_dict): + if info_dict.get('formats') is None: + if info_dict.get('url') and info_dict.get('_type', 'video') == 'video': + return [info_dict] + return [] + return info_dict['formats'] + + def render_formats_table(self, info_dict): + formats = self._get_formats(info_dict) + if not formats: + return + if not self.params.get('listformats_table', True) is not False: + table = [ + [ + format_field(f, 'format_id'), + format_field(f, 'ext'), + self.format_resolution(f), + self._format_note(f) + ] for f in formats if (f.get('preference') or 0) >= -1000] + return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1) + + def simplified_codec(f, field): + assert field in ('acodec', 'vcodec') + codec = f.get(field) + if not codec: + return 'unknown' + elif codec != 'none': + return '.'.join(codec.split('.')[:4]) + + if field == 'vcodec' and f.get('acodec') == 'none': + return 'images' + elif field == 'acodec' and f.get('vcodec') == 'none': + return '' + return self._format_out('audio only' if field == 'vcodec' else 'video only', + self.Styles.SUPPRESS) + + delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True) + table = [ + [ + self._format_out(format_field(f, 'format_id'), self.Styles.ID), + format_field(f, 'ext'), + format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), + format_field(f, 'fps', '\t%d', func=round), + format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + format_field(f, 'audio_channels', '\t%s'), + delim, ( + format_field(f, 'filesize', ' \t%s', func=format_bytes) + or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes) + or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * 
f['tbr'] * (1024 / 8)))), + None, self._format_out('~\t%s', self.Styles.SUPPRESS))), + format_field(f, 'tbr', '\t%dk', func=round), + shorten_protocol_name(f.get('protocol', '')), + delim, + simplified_codec(f, 'vcodec'), + format_field(f, 'vbr', '\t%dk', func=round), + simplified_codec(f, 'acodec'), + format_field(f, 'abr', '\t%dk', func=round), + format_field(f, 'asr', '\t%s', func=format_decimal_suffix), + join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty( + self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None, + (self._format_out('Maybe DRM', self.Styles.WARNING) if f.get('has_drm') == 'maybe' + else self._format_out('DRM', self.Styles.BAD_FORMAT) if f.get('has_drm') else None), + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), delim=' '), + ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + header_line = self._list_format_headers( + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', + delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') + + return render_table( + header_line, table, hide_empty=True, + delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True)) + + def render_thumbnails_table(self, info_dict): + thumbnails = list(info_dict.get('thumbnails') or []) + if not thumbnails: + return None + return render_table( + self._list_format_headers('ID', 'Width', 'Height', 'URL'), + [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails]) + + def render_subtitles_table(self, video_id, subtitles): + def _row(lang, formats): + exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats))) + if len(set(names)) == 1: + names = [] if names[0] == 'unknown' else names[:1] + return [lang, ', '.join(names), ', '.join(exts)] + + if not subtitles: + return None + return render_table( + self._list_format_headers('Language', 'Name', 'Formats'), + [_row(lang, formats) for lang, formats in subtitles.items()], + hide_empty=True) + + def __list_table(self, video_id, name, func, *args): + table = func(*args) + if not table: + self.to_screen(f'{video_id} has no {name}') + return + self.to_screen(f'[info] Available {name} for {video_id}:') + self.to_stdout(table) + + def list_formats(self, info_dict): + self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict) + + def list_thumbnails(self, info_dict): + self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict) + + def list_subtitles(self, video_id, subtitles, name='subtitles'): + self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + + from . import _IN_CLI # Must be delayed import + + # These imports can be slow. 
So import them only as needed + from .extractor.extractors import _LAZY_LOADER + from .extractor.extractors import ( + _PLUGIN_CLASSES as plugin_ies, + _PLUGIN_OVERRIDES as plugin_ie_overrides + ) + + def get_encoding(stream): + ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + additional_info = [] + if os.environ.get('TERM', '').lower() == 'dumb': + additional_info.append('dumb') + if not supports_terminal_sequences(stream): + from .utils import WINDOWS_VT_MODE # Must be imported locally + additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI') + if additional_info: + ret = f'{ret} ({",".join(additional_info)})' + return ret + + encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + self.get_encoding(), + ', '.join( + f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_ + if stream is not None and key != 'console') + ) + + logger = self.params.get('logger') + if logger: + write_debug = lambda msg: logger.debug(f'[debug] {msg}') + write_debug(encoding_str) + else: + write_string(f'[debug] {encoding_str}\n', encoding=None) + write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') + + source = detect_variant() + if VARIANT not in (None, 'pip'): + source += '*' + klass = type(self) + write_debug(join_nonempty( + f'{REPOSITORY.rpartition("/")[2]} version', + _make_label(ORIGIN, CHANNEL.partition('@')[2] or __version__, __version__), + f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '', + '' if source == 'unknown' else f'({source})', + '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', + delim=' ')) + + if not _IN_CLI: + write_debug(f'params: {self.params}') + + if not _LAZY_LOADER: + if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + write_debug('Lazy loading extractors is forcibly disabled') + else: + write_debug('Lazy loading extractors is disabled') + if self.params['compat_opts']: + write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) + + if current_git_head(): + write_debug(f'Git HEAD: {current_git_head()}') + write_debug(system_identifier()) + + exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) + ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} + if ffmpeg_features: + exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features)) + + exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() + exe_str = ', '.join( + f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v + ) or 'none' + write_debug('exe versions: %s' % exe_str) + + from .compat.compat_utils import get_package_info + from .dependencies import available_dependencies + + write_debug('Optional libraries: %s' % (', '.join(sorted({ + join_nonempty(*get_package_info(m)) for m in available_dependencies.values() + })) or 'none')) + + write_debug(f'Proxy map: {self.proxies}') + write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): + display_list = ['%s%s' % ( + klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in plugins.items()] + if plugin_type == 'Extractor': + display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' + for parent, plugins in plugin_ie_overrides.items()) + if not 
display_list: + continue + write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}') + + plugin_dirs = plugin_directories() + if plugin_dirs: + write_debug(f'Plugin directories: {plugin_dirs}') + + # Not implemented + if False and self.params.get('call_home'): + ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() + write_debug('Public IP address: %s' % ipaddr) + latest_version = self.urlopen( + 'https://yt-dl.org/latest/version').read().decode() + if version_tuple(latest_version) > version_tuple(__version__): + self.report_warning( + 'You are using an outdated version (newest version: %s)! ' + 'See https://yt-dl.org/update if you need help updating.' % + latest_version) + + @functools.cached_property + def proxies(self): + """Global proxy configuration""" + opts_proxy = self.params.get('proxy') + if opts_proxy is not None: + if opts_proxy == '': + opts_proxy = '__noproxy__' + proxies = {'all': opts_proxy} + else: + proxies = urllib.request.getproxies() + # compat. Set HTTPS_PROXY to __noproxy__ to revert + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + + return proxies + + @functools.cached_property + def cookiejar(self): + """Global cookiejar instance""" + return load_cookies( + self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + + @property + def _opener(self): + """ + Get a urllib OpenerDirector from the Urllib handler (deprecated). + """ + self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()') + handler = self._request_director.handlers['Urllib'] + return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies) + + def urlopen(self, req): + """ Start an HTTP download """ + if isinstance(req, str): + req = Request(req) + elif isinstance(req, urllib.request.Request): + self.deprecation_warning( + 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') + req = urllib_req_to_req(req) + assert isinstance(req, Request) + + # compat: Assume user:pass url params are basic auth + url, basic_auth_header = extract_basic_auth(req.url) + if basic_auth_header: + req.headers['Authorization'] = basic_auth_header + req.url = sanitize_url(url) + + clean_proxies(proxies=req.proxies, headers=req.headers) + clean_headers(req.headers) + + try: + return self._request_director.send(req) + except NoSupportingHandlers as e: + for ue in e.unsupported_errors: + # FIXME: This depends on the order of errors. + if not (ue.handler and ue.msg): + continue + if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower(): + raise RequestError( + 'file:// URLs are disabled by default in yt-dlp for security reasons. ' + 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue + if 'unsupported proxy type: "https"' in ue.msg.lower(): + raise RequestError( + 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests') + + elif ( + re.match(r'unsupported url scheme: "wss?"', ue.msg.lower()) + and 'websockets' not in self._request_director.handlers + ): + raise RequestError( + 'This request requires WebSocket support. 
'
+                        'Ensure one of the following dependencies is installed: websockets',
+                        cause=ue) from ue
+            raise
+        except SSLError as e:
+            if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):
+                raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e
+            elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e):
+                raise RequestError(
+                    'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. '
+                    'Try using --legacy-server-connect', cause=e) from e
+            raise
+
+    def build_request_director(self, handlers, preferences=None):
+        logger = _YDLLogger(self)
+        headers = self.params['http_headers'].copy()
+        proxies = self.proxies.copy()
+        clean_headers(headers)
+        clean_proxies(proxies, headers)
+
+        director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic'))
+        for handler in handlers:
+            director.add_handler(handler(
+                logger=logger,
+                headers=headers,
+                cookiejar=self.cookiejar,
+                proxies=proxies,
+                prefer_system_certs='no-certifi' in self.params['compat_opts'],
+                verify=not self.params.get('nocheckcertificate'),
+                **traverse_obj(self.params, {
+                    'verbose': 'debug_printtraffic',
+                    'source_address': 'source_address',
+                    'timeout': 'socket_timeout',
+                    'legacy_ssl_support': 'legacyserverconnect',
+                    'enable_file_urls': 'enable_file_urls',
+                    'client_cert': {
+                        'client_certificate': 'client_certificate',
+                        'client_certificate_key': 'client_certificate_key',
+                        'client_certificate_password': 'client_certificate_password',
+                    },
+                }),
+            ))
+        director.preferences.update(preferences or [])
+        if 'prefer-legacy-http-handler' in self.params['compat_opts']:
+            director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0)
+        return director
+
+    @functools.cached_property
+    def _request_director(self):
+        return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
+
+    def encode(self, s):
+        if isinstance(s, bytes):
+            return s  # Already encoded
+
+        try:
+            return s.encode(self.get_encoding())
+        except UnicodeEncodeError as err:
+            err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
+            raise
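+
+    # Illustrative sketch only (not upstream code), assuming `ydl` is a
+    # YoutubeDL instance whose get_encoding() returns 'utf-8':
+    # >>> ydl.encode('naïve')  # str is encoded with get_encoding()
+    # b'na\xc3\xafve'
+    # >>> ydl.encode(b'raw')  # bytes pass through unchanged
+    # b'raw'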
+
+    def get_encoding(self):
+        encoding = self.params.get('encoding')
+        if encoding is None:
+            encoding = preferredencoding()
+        return encoding
+
+    def _write_info_json(self, label, ie_result, infofn, overwrite=None):
+        ''' Write infojson and return True = written, 'exists' = already exists, False = skip, None = error '''
+        if overwrite is None:
+            overwrite = self.params.get('overwrites', True)
+        if not self.params.get('writeinfojson'):
+            return False
+        elif not infofn:
+            self.write_debug(f'Skipping writing {label} infojson')
+            return False
+        elif not self._ensure_dir_exists(infofn):
+            return None
+        elif not overwrite and os.path.exists(infofn):
+            self.to_screen(f'[info] {label.title()} metadata is already present')
+            return 'exists'
+
+        self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
+        try:
+            write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
+            return True
+        except OSError:
+            self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
+            return None
+
+    def _write_description(self, label, ie_result, descfn):
+        ''' Write description and return True = written, False = skip, None = error '''
+        if not self.params.get('writedescription'):
+            return False
+        elif not descfn:
+            self.write_debug(f'Skipping writing {label} description')
+            return False
+        elif not self._ensure_dir_exists(descfn):
+            return None
+        elif not self.params.get('overwrites', True) and os.path.exists(descfn):
+            self.to_screen(f'[info] {label.title()} description is already present')
+        elif ie_result.get('description') is None:
+            self.to_screen(f'[info] There\'s no {label} description to write')
+            return False
+        else:
+            try:
+                self.to_screen(f'[info] Writing {label} description to: {descfn}')
+                with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
+                    descfile.write(ie_result['description'])
+            except OSError:
+                self.report_error(f'Cannot write {label} description file {descfn}')
+                return None
+        return True
+
+    def _write_subtitles(self, info_dict, filename):
+        ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error '''
+        ret = []
+        subtitles = info_dict.get('requested_subtitles')
+        if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
+            # Subtitle download errors are already managed as troubles in the relevant IE;
+            # that way it will silently go on when used with an IE that has no subtitle support
+            return ret
+        elif not subtitles:
+            self.to_screen('[info] There are no subtitles for the requested languages')
+            return ret
+        sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
+        if not sub_filename_base:
+            self.to_screen('[info] Skipping writing video subtitles')
+            return ret
+
+        for sub_lang, sub_info in subtitles.items():
+            sub_format = sub_info['ext']
+            sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
+            sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
+            existing_sub = self.existing_file((sub_filename_final, sub_filename))
+            if existing_sub:
+                self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
+                sub_info['filepath'] = existing_sub
+                ret.append((existing_sub, sub_filename_final))
+                continue
+
+            self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
+            if sub_info.get('data') is not None:
+                try:
+                    # Use newline='' to prevent conversion of newline characters
+                    # See https://github.com/ytdl-org/youtube-dl/issues/10268
+                    with
open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + except OSError: + self.report_error(f'Cannot write video subtitles file {sub_filename}') + return None + + try: + sub_copy = sub_info.copy() + sub_copy.setdefault('http_headers', info_dict.get('http_headers')) + self.dl(sub_filename, sub_copy, subtitle=True) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: + msg = f'Unable to download video subtitles for {sub_lang!r}: {err}' + if self.params.get('ignoreerrors') is not True: # False or 'only_download' + if not self.params.get('ignoreerrors'): + self.report_error(msg) + raise DownloadError(msg) + self.report_warning(msg) + return ret + + def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename); or None if error ''' + write_all = self.params.get('write_all_thumbnails', False) + thumbnails, ret = [], [] + if write_all or self.params.get('writethumbnail', False): + thumbnails = info_dict.get('thumbnails') or [] + if not thumbnails: + self.to_screen(f'[info] There are no {label} thumbnails to download') + return ret + multiple = write_all and len(thumbnails) > 1 + + if thumb_filename_base is None: + thumb_filename_base = filename + if thumbnails and not thumb_filename_base: + self.write_debug(f'Skipping writing {label} thumbnail') + return ret + + if thumbnails and not self._ensure_dir_exists(filename): + return None + + for idx, t in list(enumerate(thumbnails))[::-1]: + thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_display_id = f'{label} thumbnail {t["id"]}' + thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) + thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) + + existing_thumb = self.existing_file((thumb_filename_final, thumb_filename)) + if existing_thumb: + self.to_screen('[info] %s is already present' % ( + thumb_display_id if multiple else f'{label} thumbnail').capitalize()) + t['filepath'] = existing_thumb + ret.append((existing_thumb, thumb_filename_final)) + else: + self.to_screen(f'[info] Downloading {thumb_display_id} ...') + try: + uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {}))) + self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') + with open(encodeFilename(thumb_filename), 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + ret.append((thumb_filename, thumb_filename_final)) + t['filepath'] = thumb_filename + except network_exceptions as err: + if isinstance(err, HTTPError) and err.status == 404: + self.to_screen(f'[info] {thumb_display_id.title()} does not exist') + else: + self.report_warning(f'Unable to download {thumb_display_id}: {err}') + thumbnails.pop(idx) + if ret and not write_all: + break + return ret diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py new file mode 100644 index 0000000..aeea262 --- /dev/null +++ b/yt_dlp/__init__.py @@ -0,0 +1,1054 @@ +import sys + +if sys.version_info < (3, 8): + raise ImportError( + f'You are using an unsupported version of Python. 
Only Python versions 3.8 and above are supported by yt-dlp') # noqa: F541 + +__license__ = 'The Unlicense' + +import collections +import getpass +import itertools +import optparse +import os +import re +import traceback + +from .compat import compat_os_name, compat_shlex_quote +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS +from .downloader.external import get_external_downloader +from .extractor import list_extractor_classes +from .extractor.adobepass import MSO_INFO +from .options import parseOpts +from .postprocessor import ( + FFmpegExtractAudioPP, + FFmpegMergerPP, + FFmpegPostProcessor, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegVideoConvertorPP, + FFmpegVideoRemuxerPP, + MetadataFromFieldPP, + MetadataParserPP, +) +from .update import Updater +from .utils import ( + NO_DEFAULT, + POSTPROCESS_WHEN, + DateRange, + DownloadCancelled, + DownloadError, + FormatSorter, + GeoUtils, + PlaylistEntries, + SameFileError, + decodeOption, + download_range_func, + expand_path, + float_or_none, + format_field, + int_or_none, + match_filter_func, + parse_bytes, + parse_duration, + preferredencoding, + read_batch_urls, + read_stdin, + render_table, + setproctitle, + traverse_obj, + variadic, + write_string, +) +from .utils.networking import std_headers +from .YoutubeDL import YoutubeDL + +_IN_CLI = False + + +def _exit(status=0, *args): + for msg in args: + sys.stderr.write(msg) + raise SystemExit(status) + + +def get_urls(urls, batchfile, verbose): + """ + @param verbose -1: quiet, 0: normal, 1: verbose + """ + batch_urls = [] + if batchfile is not None: + try: + batch_urls = read_batch_urls( + read_stdin(None if verbose == -1 else 'URLs') if batchfile == '-' + else open(expand_path(batchfile), encoding='utf-8', errors='ignore')) + if verbose == 1: + write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') + except OSError: + _exit(f'ERROR: batch file {batchfile} could not be read') + _enc = preferredencoding() + return [ + url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip() + for url in batch_urls + urls] + + +def print_extractor_information(opts, urls): + out = '' + if opts.list_extractors: + # Importing GenericIE is currently slow since it imports YoutubeIE + from .extractor.generic import GenericIE + + urls = dict.fromkeys(urls, False) + for ie in list_extractor_classes(opts.age_limit): + out += ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n' + if ie == GenericIE: + matched_urls = [url for url, matched in urls.items() if not matched] + else: + matched_urls = tuple(filter(ie.suitable, urls.keys())) + urls.update(dict.fromkeys(matched_urls, True)) + out += ''.join(f' {url}\n' for url in matched_urls) + elif opts.list_extractor_descriptions: + _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') + out = '\n'.join( + ie.description(markdown=False, search_examples=_SEARCHES) + for ie in list_extractor_classes(opts.age_limit) if ie.working() and ie.IE_DESC is not False) + elif opts.ap_list_mso: + out = 'Supported TV Providers:\n%s\n' % render_table( + ['mso', 'mso name'], + [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()]) + else: + return False + write_string(out, out=sys.stdout) + return True + + +def set_compat_opts(opts): + def _unused_compat_opt(name): + if name not in opts.compat_opts: + return False + opts.compat_opts.discard(name) + opts.compat_opts.update(['*%s' % name]) + 
return True
+
+    def set_default_compat(compat_name, opt_name, default=True, remove_compat=True):
+        attr = getattr(opts, opt_name)
+        if compat_name in opts.compat_opts:
+            if attr is None:
+                setattr(opts, opt_name, not default)
+                return True
+            else:
+                if remove_compat:
+                    _unused_compat_opt(compat_name)
+                return False
+        elif attr is None:
+            setattr(opts, opt_name, default)
+        return None
+
+    set_default_compat('abort-on-error', 'ignoreerrors', 'only_download')
+    set_default_compat('no-playlist-metafiles', 'allow_playlist_files')
+    set_default_compat('no-clean-infojson', 'clean_infojson')
+    if 'no-attach-info-json' in opts.compat_opts:
+        if opts.embed_infojson:
+            _unused_compat_opt('no-attach-info-json')
+        else:
+            opts.embed_infojson = False
+    if 'format-sort' in opts.compat_opts:
+        opts.format_sort.extend(FormatSorter.ytdl_default)
+    _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False)
+    _audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False)
+    if _video_multistreams_set is False and _audio_multistreams_set is False:
+        _unused_compat_opt('multistreams')
+    if 'filename' in opts.compat_opts:
+        if opts.outtmpl.get('default') is None:
+            opts.outtmpl.update({'default': '%(title)s-%(id)s.%(ext)s'})
+        else:
+            _unused_compat_opt('filename')
+
+
+def validate_options(opts):
+    def validate(cndn, name, value=None, msg=None):
+        if cndn:
+            return True
+        raise ValueError((msg or 'invalid {name} "{value}" given').format(name=name, value=value))
+
+    def validate_in(name, value, items, msg=None):
+        return validate(value is None or value in items, name, value, msg)
+
+    def validate_regex(name, value, regex):
+        return validate(value is None or re.match(regex, value), name, value)
+
+    def validate_positive(name, value, strict=False):
+        return validate(value is None or value > 0 or (not strict and value == 0),
+                        name, value, '{name} "{value}" must be positive' + ('' if strict else ' or 0'))
+
+    def validate_minmax(min_val, max_val, min_name, max_name=None):
+        if max_val is None or min_val is None or max_val >= min_val:
+            return
+        if not max_name:
+            min_name, max_name = f'min {min_name}', f'max {min_name}'
+        raise ValueError(f'{max_name} "{max_val}" must be greater than or equal to {min_name} "{min_val}"')
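+
+    # Illustrative only (not upstream code): with the helpers above,
+    # validate_minmax(2, 1, 'sleep interval') raises
+    # ValueError('max sleep interval "1" must be greater than or equal to min sleep interval "2"'),
+    # while validate_minmax(1, 2, 'sleep interval') and validate_minmax(None, 2, 'sleep interval')
+    # return without raising.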
+
+    # Usernames and passwords
+    validate(sum(map(bool, (opts.usenetrc, opts.netrc_cmd, opts.username))) <= 1, '.netrc',
+             msg='{name}, netrc command and username/password are mutually exclusive options')
+    validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing')
+    validate(opts.ap_password is None or opts.ap_username is not None,
+             'TV Provider account username', msg='{name} missing')
+    validate_in('TV Provider', opts.ap_mso, MSO_INFO,
+                'Unsupported {name} "{value}", use --ap-list-mso to get a list of supported TV Providers')
+
+    # Numbers
+    validate_positive('autonumber start', opts.autonumber_start)
+    validate_positive('autonumber size', opts.autonumber_size, True)
+    validate_positive('concurrent fragments', opts.concurrent_fragment_downloads, True)
+    validate_positive('playlist start', opts.playliststart, True)
+    if opts.playlistend != -1:
+        validate_minmax(opts.playliststart, opts.playlistend, 'playlist start', 'playlist end')
+
+    # Time ranges
+    validate_positive('subtitles sleep interval', opts.sleep_interval_subtitles)
+    validate_positive('requests sleep interval', opts.sleep_interval_requests)
+    validate_positive('sleep interval', opts.sleep_interval)
+    validate_positive('max sleep interval', opts.max_sleep_interval)
+    if opts.sleep_interval is None:
+        validate(
+            opts.max_sleep_interval is None, 'min sleep interval',
+            msg='{name} must be specified; use --min-sleep-interval')
+    elif opts.max_sleep_interval is None:
+        opts.max_sleep_interval = opts.sleep_interval
+    else:
+        validate_minmax(opts.sleep_interval, opts.max_sleep_interval, 'sleep interval')
+
+    if opts.wait_for_video is not None:
+        min_wait, max_wait, *_ = map(parse_duration, opts.wait_for_video.split('-', 1) + [None])
+        validate(min_wait is not None and not (max_wait is None and '-' in opts.wait_for_video),
+                 'time range to wait for video', opts.wait_for_video)
+        validate_minmax(min_wait, max_wait, 'time range to wait for video')
+        opts.wait_for_video = (min_wait, max_wait)
+
+    # Format sort
+    for f in opts.format_sort:
+        validate_regex('format sorting', f, FormatSorter.regex)
+
+    # Postprocessor formats
+    validate_regex('merge output format', opts.merge_output_format,
+                   r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS))))
+    validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE)
+    validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)
+    validate_regex('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.FORMAT_RE)
+    validate_regex('recode video format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE)
+    validate_regex('remux video format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE)
+    if opts.audioquality:
+        opts.audioquality = opts.audioquality.strip('k').strip('K')
+        # int_or_none prevents inf, nan
+        validate_positive('audio quality', int_or_none(float_or_none(opts.audioquality), default=0))
+
+    # Retries
+    def parse_retries(name, value):
+        if value is None:
+            return None
+        elif value in ('inf', 'infinite'):
+            return float('inf')
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            validate(False, f'{name} retry count', value)
+
+    opts.retries = parse_retries('download', opts.retries)
+    opts.fragment_retries = parse_retries('fragment', opts.fragment_retries)
+    opts.extractor_retries = parse_retries('extractor', opts.extractor_retries)
+    opts.file_access_retries = parse_retries('file access', opts.file_access_retries)
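+
+    # Illustrative only (not upstream code): parse_retries('download', '3') -> 3;
+    # parse_retries('download', 'infinite') -> float('inf');
+    # parse_retries('download', 'x') raises ValueError('invalid download retry count "x" given').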
+
+    # Retry sleep function
+    def parse_sleep_func(expr):
+        NUMBER_RE = r'\d+(?:\.\d+)?'
+        op, start, limit, step, *_ = tuple(re.fullmatch(
+            rf'(?:(linear|exp)=)?({NUMBER_RE})(?::({NUMBER_RE})?)?(?::({NUMBER_RE}))?',
+            expr.strip()).groups()) + (None, None)
+
+        if op == 'exp':
+            return lambda n: min(float(start) * (float(step or 2) ** n), float(limit or 'inf'))
+        else:
+            default_step = start if op or limit else 0
+            return lambda n: min(float(start) + float(step or default_step) * n, float(limit or 'inf'))
+
+    for key, expr in list(opts.retry_sleep.items()):  # list() since entries may be deleted while iterating
+        if not expr:
+            del opts.retry_sleep[key]
+            continue
+        try:
+            opts.retry_sleep[key] = parse_sleep_func(expr)
+        except AttributeError:
+            raise ValueError(f'invalid {key} retry sleep expression {expr!r}')
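+
+    # Illustrative only (not upstream code): '--retry-sleep linear=1::2' parses to
+    # sleep(n) = min(1 + 2*n, inf), i.e. 1, 3, 5, ... seconds, while
+    # '--retry-sleep exp=1:20' parses to sleep(n) = min(1 * 2**n, 20),
+    # i.e. 1, 2, 4, 8, 16, 20, 20, ... seconds.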
+
+    # Bytes
+    def validate_bytes(name, value):
+        if value is None:
+            return None
+        numeric_limit = parse_bytes(value)
+        validate(numeric_limit is not None, name, value)
+        return numeric_limit
+
+    opts.ratelimit = validate_bytes('rate limit', opts.ratelimit)
+    opts.throttledratelimit = validate_bytes('throttled rate limit', opts.throttledratelimit)
+    opts.min_filesize = validate_bytes('min filesize', opts.min_filesize)
+    opts.max_filesize = validate_bytes('max filesize', opts.max_filesize)
+    opts.buffersize = validate_bytes('buffer size', opts.buffersize)
+    opts.http_chunk_size = validate_bytes('http chunk size', opts.http_chunk_size)
+
+    # Output templates
+    def validate_outtmpl(tmpl, msg):
+        err = YoutubeDL.validate_outtmpl(tmpl)
+        if err:
+            raise ValueError(f'invalid {msg} "{tmpl}": {err}')
+
+    for k, tmpl in opts.outtmpl.items():
+        validate_outtmpl(tmpl, f'{k} output template')
+    for type_, tmpl_list in opts.forceprint.items():
+        for tmpl in tmpl_list:
+            validate_outtmpl(tmpl, f'{type_} print template')
+    for type_, tmpl_list in opts.print_to_file.items():
+        for tmpl, file in tmpl_list:
+            validate_outtmpl(tmpl, f'{type_} print to file template')
+            validate_outtmpl(file, f'{type_} print to file filename')
+    validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title')
+    for k, tmpl in opts.progress_template.items():
+        k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress'
+        validate_outtmpl(tmpl, f'{k} template')
+
+    outtmpl_default = opts.outtmpl.get('default')
+    if outtmpl_default == '':
+        opts.skip_download = None
+        del opts.outtmpl['default']
+
+    def parse_chapters(name, value, advanced=False):
+        parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x)
+        TIMESTAMP_RE = r'''(?x)(?:
+            (?P<start_sign>-?)(?P<start>[^-]+)
+        )?\s*-\s*(?:
+            (?P<end_sign>-?)(?P<end>[^-]+)
+        )?'''
+
+        chapters, ranges, from_url = [], [], False
+        for regex in value or []:
+            if advanced and regex == '*from-url':
+                from_url = True
+                continue
+            elif not regex.startswith('*'):
+                try:
+                    chapters.append(re.compile(regex))
+                except re.error as err:
+                    raise ValueError(f'invalid {name} regex "{regex}" - {err}')
+                continue
+
+            for range_ in map(str.strip, regex[1:].split(',')):
+                mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_)
+                dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')]
+                signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign'))
+
+                err = None
+                if None in (dur or [None]):
+                    err = 'Must be of the form "*start-end"'
+                elif not advanced and any(signs):
+                    err = 'Negative timestamps are not allowed'
+                else:
+                    dur[0] *= -1 if signs[0] else 1
+                    dur[1] *= -1 if signs[1] else 1
+                    if dur[1] == float('-inf'):
+                        err = '"-inf" is not a valid end'
+                if err:
+                    raise ValueError(f'invalid {name} time range "{regex}". {err}')
+                ranges.append(dur)
+
+        return chapters, ranges, from_url
+
+    opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters)
+    opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True))
+
+    # Cookies from browser
+    if opts.cookiesfrombrowser:
+        container = None
+        mobj = re.fullmatch(r'''(?x)
+            (?P<name>[^+:]+)
+            (?:\s*\+\s*(?P<keyring>[^:]+))?
+            (?:\s*:\s*(?!:)(?P<profile>.+?))?
+            (?:\s*::\s*(?P<container>.+))?
+        ''', opts.cookiesfrombrowser)
+        if mobj is None:
+            raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}')
+        browser_name, keyring, profile, container = mobj.group('name', 'keyring', 'profile', 'container')
+        browser_name = browser_name.lower()
+        if browser_name not in SUPPORTED_BROWSERS:
+            raise ValueError(f'unsupported browser specified for cookies: "{browser_name}". '
+                             f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}')
+        if keyring is not None:
+            keyring = keyring.upper()
+            if keyring not in SUPPORTED_KEYRINGS:
+                raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". '
+                                 f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}')
+        opts.cookiesfrombrowser = (browser_name, profile, keyring, container)
+
+    # MetadataParser
+    def metadataparser_actions(f):
+        if isinstance(f, str):
+            cmd = '--parse-metadata %s' % compat_shlex_quote(f)
+            try:
+                actions = [MetadataFromFieldPP.to_action(f)]
+            except Exception as err:
+                raise ValueError(f'{cmd} is invalid; {err}')
+        else:
+            cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
+            actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(','))
+
+        for action in actions:
+            try:
+                MetadataParserPP.validate_action(*action)
+            except Exception as err:
+                raise ValueError(f'{cmd} is invalid; {err}')
+            yield action
+
+    if opts.metafromtitle is not None:
+        opts.parse_metadata.setdefault('pre_process', []).append('title:%s' % opts.metafromtitle)
+    opts.parse_metadata = {
+        k: list(itertools.chain(*map(metadataparser_actions, v)))
+        for k, v in opts.parse_metadata.items()
+    }
+
+    # Other options
+    if opts.playlist_items is not None:
+        try:
+            tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items))
+        except Exception as err:
+            raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}')
+
+    opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None
+    if opts.geo_bypass.lower() not in ('default', 'never'):
+        try:
+            GeoUtils.random_ipv4(opts.geo_bypass)
+        except Exception:
+            raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"')
+        if len(opts.geo_bypass) == 2:
+            opts.geo_bypass_country = opts.geo_bypass
+        else:
+            opts.geo_bypass_ip_block = opts.geo_bypass
+    opts.geo_bypass = opts.geo_bypass.lower() != 'never'
+
+    opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter)
+
+    if opts.download_archive is not None:
+        opts.download_archive = expand_path(opts.download_archive)
+
+    if opts.ffmpeg_location is not None:
+        opts.ffmpeg_location = expand_path(opts.ffmpeg_location)
+
+    if opts.user_agent is not None:
+        opts.headers.setdefault('User-Agent', opts.user_agent)
+    if opts.referer is not None:
+        opts.headers.setdefault('Referer', opts.referer)
+
+    if opts.no_sponsorblock:
+        opts.sponsorblock_mark = opts.sponsorblock_remove = set()
+
+    default_downloader = None
+    for proto, path in opts.external_downloader.items():
+        if path == 'native':
+            continue
+        ed = get_external_downloader(path)
+        if ed is None:
+            raise ValueError(
+ f'No such {format_field(proto, None, "%s ", ignore="default")}external downloader "{path}"') + elif ed and proto == 'default': + default_downloader = ed.get_basename() + + for policy in opts.color.values(): + if policy not in ('always', 'auto', 'no_color', 'never'): + raise ValueError(f'"{policy}" is not a valid color policy') + + warnings, deprecation_warnings = [], [] + + # Common mistake: -f best + if opts.format == 'best': + warnings.append('.\n '.join(( + '"-f best" selects the best pre-merged format which is often not the best option', + 'To let yt-dlp download and merge the best available formats, simply do not pass any format selection', + 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) + + # --(postprocessor/downloader)-args without name + def report_args_compat(name, value, key1, key2=None, where=None): + if key1 in value and key2 not in value: + warnings.append(f'{name.title()} arguments given without specifying name. ' + f'The arguments will be given to {where or f"all {name}s"}') + return True + return False + + if report_args_compat('external downloader', opts.external_downloader_args, + 'default', where=default_downloader) and default_downloader: + # Compat with youtube-dl's behavior. See https://github.com/ytdl-org/youtube-dl/commit/49c5293014bc11ec8c009856cd63cffa6296c1e1 + opts.external_downloader_args.setdefault(default_downloader, opts.external_downloader_args.pop('default')) + + if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'): + opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat') + opts.postprocessor_args.setdefault('sponskrub', []) + + def report_conflict(arg1, opt1, arg2='--allow-unplayable-formats', opt2='allow_unplayable_formats', + val1=NO_DEFAULT, val2=NO_DEFAULT, default=False): + if val2 is NO_DEFAULT: + val2 = getattr(opts, opt2) + if not val2: + return + + if val1 is NO_DEFAULT: + val1 = getattr(opts, opt1) + if val1: + warnings.append(f'{arg1} is ignored since {arg2} was given') + setattr(opts, opt1, default) + + # Conflicting options + report_conflict('--playlist-reverse', 'playlist_reverse', '--playlist-random', 'playlist_random') + report_conflict('--playlist-reverse', 'playlist_reverse', '--lazy-playlist', 'lazy_playlist') + report_conflict('--playlist-random', 'playlist_random', '--lazy-playlist', 'lazy_playlist') + report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) + report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) + report_conflict('--exec-before-download', 'exec_before_dl_cmd', + '"--exec before_dl:"', 'exec_cmd', val2=opts.exec_cmd.get('before_dl')) + report_conflict('--id', 'useid', '--output', 'outtmpl', val2=opts.outtmpl.get('default')) + report_conflict('--remux-video', 'remuxvideo', '--recode-video', 'recodevideo') + report_conflict('--sponskrub', 'sponskrub', '--remove-chapters', 'remove_chapters') + report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-mark', 'sponsorblock_mark') + report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-remove', 'sponsorblock_remove') + report_conflict('--sponskrub-cut', 'sponskrub_cut', '--split-chapter', 'split_chapters', + val1=opts.sponskrub and opts.sponskrub_cut) + + # Conflicts with --allow-unplayable-formats + report_conflict('--embed-metadata', 'addmetadata') + report_conflict('--embed-chapters', 'addchapters') + report_conflict('--embed-info-json', 'embed_infojson') + 
report_conflict('--embed-subs', 'embedsubtitles') + report_conflict('--embed-thumbnail', 'embedthumbnail') + report_conflict('--extract-audio', 'extractaudio') + report_conflict('--fixup', 'fixup', val1=opts.fixup not in (None, 'never', 'ignore'), default='never') + report_conflict('--recode-video', 'recodevideo') + report_conflict('--remove-chapters', 'remove_chapters', default=[]) + report_conflict('--remux-video', 'remuxvideo') + report_conflict('--sponskrub', 'sponskrub') + report_conflict('--sponsorblock-remove', 'sponsorblock_remove', default=set()) + report_conflict('--xattrs', 'xattrs') + + # Fully deprecated options + def report_deprecation(val, old, new=None): + if not val: + return + deprecation_warnings.append( + f'{old} is deprecated and may be removed in a future version. Use {new} instead' if new + else f'{old} is deprecated and may not work as expected') + + report_deprecation(opts.sponskrub, '--sponskrub', '--sponsorblock-mark or --sponsorblock-remove') + report_deprecation(not opts.prefer_ffmpeg, '--prefer-avconv', 'ffmpeg') + # report_deprecation(opts.include_ads, '--include-ads') # We may re-implement this in future + # report_deprecation(opts.call_home, '--call-home') # We may re-implement this in future + # report_deprecation(opts.writeannotations, '--write-annotations') # It's just that no website has it + + # Dependent options + opts.date = DateRange.day(opts.date) if opts.date else DateRange(opts.dateafter, opts.datebefore) + + if opts.exec_before_dl_cmd: + opts.exec_cmd['before_dl'] = opts.exec_before_dl_cmd + + if opts.useid: # --id is not deprecated in youtube-dl + opts.outtmpl['default'] = '%(id)s.%(ext)s' + + if opts.overwrites: # --force-overwrites implies --no-continue + opts.continue_dl = False + + if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + # Add chapters when adding metadata or marking sponsors + opts.addchapters = True + + if opts.extractaudio and not opts.keepvideo and opts.format is None: + # Do not unnecessarily download audio + opts.format = 'bestaudio/best' + + if opts.getcomments and opts.writeinfojson is None and not opts.embed_infojson: + # If JSON is not printed anywhere, but comments are requested, save it to file + if not opts.dumpjson or opts.print_json or opts.dump_single_json: + opts.writeinfojson = True + + if opts.allsubtitles and not (opts.embedsubtitles or opts.writeautomaticsub): + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + opts.writesubtitles = True + + if opts.addmetadata and opts.embed_infojson is None: + # If embedding metadata and infojson is present, embed it + opts.embed_infojson = 'if_exists' + + # Ask for passwords + if opts.username is not None and opts.password is None: + opts.password = getpass.getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ') + + return warnings, deprecation_warnings + + +def get_postprocessors(opts): + yield from opts.add_postprocessors + + for when, actions in opts.parse_metadata.items(): + yield { + 'key': 'MetadataParser', + 'actions': actions, + 'when': when + } + sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove + if sponsorblock_query: + yield { + 'key': 'SponsorBlock', + 'categories': sponsorblock_query, + 'api': opts.sponsorblock_api, + 'when': 'after_filter' + } + if opts.convertsubtitles: + yield { + 'key': 'FFmpegSubtitlesConvertor', + 
'format': opts.convertsubtitles, + 'when': 'before_dl' + } + if opts.convertthumbnails: + yield { + 'key': 'FFmpegThumbnailsConvertor', + 'format': opts.convertthumbnails, + 'when': 'before_dl' + } + if opts.extractaudio: + yield { + 'key': 'FFmpegExtractAudio', + 'preferredcodec': opts.audioformat, + 'preferredquality': opts.audioquality, + 'nopostoverwrites': opts.nopostoverwrites, + } + if opts.remuxvideo: + yield { + 'key': 'FFmpegVideoRemuxer', + 'preferedformat': opts.remuxvideo, + } + if opts.recodevideo: + yield { + 'key': 'FFmpegVideoConvertor', + 'preferedformat': opts.recodevideo, + } + # If ModifyChapters is going to remove chapters, subtitles must already be in the container. + if opts.embedsubtitles: + keep_subs = 'no-keep-subs' not in opts.compat_opts + yield { + 'key': 'FFmpegEmbedSubtitle', + # already_have_subtitle = True prevents the file from being deleted after embedding + 'already_have_subtitle': opts.writesubtitles and keep_subs + } + if not opts.writeautomaticsub and keep_subs: + opts.writesubtitles = True + + # ModifyChapters must run before FFmpegMetadataPP + if opts.remove_chapters or sponsorblock_query: + yield { + 'key': 'ModifyChapters', + 'remove_chapters_patterns': opts.remove_chapters, + 'remove_sponsor_segments': opts.sponsorblock_remove, + 'remove_ranges': opts.remove_ranges, + 'sponsorblock_chapter_title': opts.sponsorblock_chapter_title, + 'force_keyframes': opts.force_keyframes_at_cuts + } + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. 
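+    # Illustrative only (not upstream code): with `--sponsorblock-remove all
+    # --embed-metadata --embed-thumbnail`, this generator yields (among others)
+    # SponsorBlock (when='after_filter'), then ModifyChapters, then FFmpegMetadata,
+    # then EmbedThumbnail, in that relative order, per the constraints above.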
+ if opts.addmetadata or opts.addchapters or opts.embed_infojson: + yield { + 'key': 'FFmpegMetadata', + 'add_chapters': opts.addchapters, + 'add_metadata': opts.addmetadata, + 'add_infojson': opts.embed_infojson, + } + # Deprecated + # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment + # but must be below EmbedSubtitle and FFmpegMetadata + # See https://github.com/yt-dlp/yt-dlp/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29 + # If opts.sponskrub is None, sponskrub is used, but it silently fails if the executable can't be found + if opts.sponskrub is not False: + yield { + 'key': 'SponSkrub', + 'path': opts.sponskrub_path, + 'args': opts.sponskrub_args, + 'cut': opts.sponskrub_cut, + 'force': opts.sponskrub_force, + 'ignoreerror': opts.sponskrub is None, + '_from_cli': True, + } + if opts.embedthumbnail: + yield { + 'key': 'EmbedThumbnail', + # already_have_thumbnail = True prevents the file from being deleted after embedding + 'already_have_thumbnail': opts.writethumbnail + } + if not opts.writethumbnail: + opts.writethumbnail = True + opts.outtmpl['pl_thumbnail'] = '' + if opts.split_chapters: + yield { + 'key': 'FFmpegSplitChapters', + 'force_keyframes': opts.force_keyframes_at_cuts, + } + # XAttrMetadataPP should be run after post-processors that may change file contents + if opts.xattrs: + yield {'key': 'XAttrMetadata'} + if opts.concat_playlist != 'never': + yield { + 'key': 'FFmpegConcat', + 'only_multi_video': opts.concat_playlist != 'always', + 'when': 'playlist', + } + # Exec must be the last PP of each category + for when, exec_cmd in opts.exec_cmd.items(): + yield { + 'key': 'Exec', + 'exec_cmd': exec_cmd, + 'when': when, + } + + +ParsedOptions = collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts')) + + +def parse_options(argv=None): + """@returns ParsedOptions(parser, opts, urls, ydl_opts)""" + parser, opts, urls = parseOpts(argv) + urls = get_urls(urls, opts.batchfile, -1 if opts.quiet and not opts.verbose else opts.verbose) + + set_compat_opts(opts) + try: + warnings, deprecation_warnings = validate_options(opts) + except ValueError as err: + parser.error(f'{err}\n') + + postprocessors = list(get_postprocessors(opts)) + + print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[3:]) + any_getting = any(getattr(opts, k) for k in ( + 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', + 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' + )) + if opts.quiet is None: + opts.quiet = any_getting or opts.print_json or bool(opts.forceprint) + + playlist_pps = [pp for pp in postprocessors if pp.get('when') == 'playlist'] + write_playlist_infojson = (opts.writeinfojson and not opts.clean_infojson + and opts.allow_playlist_files and opts.outtmpl.get('pl_infojson') != '') + if not any(( + opts.extract_flat, + opts.dump_single_json, + opts.forceprint.get('playlist'), + opts.print_to_file.get('playlist'), + write_playlist_infojson, + )): + if not playlist_pps: + opts.extract_flat = 'discard' + elif playlist_pps == [{'key': 'FFmpegConcat', 'only_multi_video': True, 'when': 'playlist'}]: + opts.extract_flat = 'discard_in_playlist' + + final_ext = ( + opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS + else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS + else opts.audioformat if (opts.extractaudio and opts.audioformat in FFmpegExtractAudioPP.SUPPORTED_EXTS) + else None) + + return 
ParsedOptions(parser, opts, urls, { + 'usenetrc': opts.usenetrc, + 'netrc_location': opts.netrc_location, + 'netrc_cmd': opts.netrc_cmd, + 'username': opts.username, + 'password': opts.password, + 'twofactor': opts.twofactor, + 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, + 'client_certificate': opts.client_certificate, + 'client_certificate_key': opts.client_certificate_key, + 'client_certificate_password': opts.client_certificate_password, + 'quiet': opts.quiet, + 'no_warnings': opts.no_warnings, + 'forceurl': opts.geturl, + 'forcetitle': opts.gettitle, + 'forceid': opts.getid, + 'forcethumbnail': opts.getthumbnail, + 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, + 'forcefilename': opts.getfilename, + 'forceformat': opts.getformat, + 'forceprint': opts.forceprint, + 'print_to_file': opts.print_to_file, + 'forcejson': opts.dumpjson or opts.print_json, + 'dump_single_json': opts.dump_single_json, + 'force_write_download_archive': opts.force_write_download_archive, + 'simulate': (print_only or any_getting or None) if opts.simulate is None else opts.simulate, + 'skip_download': opts.skip_download, + 'format': opts.format, + 'allow_unplayable_formats': opts.allow_unplayable_formats, + 'ignore_no_formats_error': opts.ignore_no_formats_error, + 'format_sort': opts.format_sort, + 'format_sort_force': opts.format_sort_force, + 'allow_multiple_video_streams': opts.allow_multiple_video_streams, + 'allow_multiple_audio_streams': opts.allow_multiple_audio_streams, + 'check_formats': opts.check_formats, + 'listformats': opts.listformats, + 'listformats_table': opts.listformats_table, + 'outtmpl': opts.outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, + 'paths': opts.paths, + 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, + 'restrictfilenames': opts.restrictfilenames, + 'windowsfilenames': opts.windowsfilenames, + 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, + 'allowed_extractors': opts.allowed_extractors or ['default'], + 'ratelimit': opts.ratelimit, + 'throttledratelimit': opts.throttledratelimit, + 'overwrites': opts.overwrites, + 'retries': opts.retries, + 'file_access_retries': opts.file_access_retries, + 'fragment_retries': opts.fragment_retries, + 'extractor_retries': opts.extractor_retries, + 'retry_sleep_functions': opts.retry_sleep, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, + 'keep_fragments': opts.keep_fragments, + 'concurrent_fragment_downloads': opts.concurrent_fragment_downloads, + 'buffersize': opts.buffersize, + 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, + 'continuedl': opts.continue_dl, + 'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress, + 'progress_with_newline': opts.progress_with_newline, + 'progress_template': opts.progress_template, + 'playliststart': opts.playliststart, + 'playlistend': opts.playlistend, + 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, + 'lazy_playlist': opts.lazy_playlist, + 'noplaylist': opts.noplaylist, + 'logtostderr': opts.outtmpl.get('default') == '-', + 'consoletitle': opts.consoletitle, + 'nopart': opts.nopart, + 'updatetime': opts.updatetime, + 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, + 'writeinfojson': opts.writeinfojson, + 'allow_playlist_files': 
opts.allow_playlist_files, + 'clean_infojson': opts.clean_infojson, + 'getcomments': opts.getcomments, + 'writethumbnail': opts.writethumbnail is True, + 'write_all_thumbnails': opts.writethumbnail == 'all', + 'writelink': opts.writelink, + 'writeurllink': opts.writeurllink, + 'writewebloclink': opts.writewebloclink, + 'writedesktoplink': opts.writedesktoplink, + 'writesubtitles': opts.writesubtitles, + 'writeautomaticsub': opts.writeautomaticsub, + 'allsubtitles': opts.allsubtitles, + 'listsubtitles': opts.listsubtitles, + 'subtitlesformat': opts.subtitlesformat, + 'subtitleslangs': opts.subtitleslangs, + 'matchtitle': decodeOption(opts.matchtitle), + 'rejecttitle': decodeOption(opts.rejecttitle), + 'max_downloads': opts.max_downloads, + 'prefer_free_formats': opts.prefer_free_formats, + 'trim_file_name': opts.trim_file_name, + 'verbose': opts.verbose, + 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, + 'load_pages': opts.load_pages, + 'test': opts.test, + 'keepvideo': opts.keepvideo, + 'min_filesize': opts.min_filesize, + 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, + 'daterange': opts.date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': opts.download_archive, + 'break_on_existing': opts.break_on_existing, + 'break_on_reject': opts.break_on_reject, + 'break_per_url': opts.break_per_url, + 'skip_playlist_after_errors': opts.skip_playlist_after_errors, + 'cookiefile': opts.cookiefile, + 'cookiesfrombrowser': opts.cookiesfrombrowser, + 'legacyserverconnect': opts.legacy_server_connect, + 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, + 'enable_file_urls': opts.enable_file_urls, + 'http_headers': opts.headers, + 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, + 'bidi_workaround': opts.bidi_workaround, + 'debug_printtraffic': opts.debug_printtraffic, + 'prefer_ffmpeg': opts.prefer_ffmpeg, + 'include_ads': opts.include_ads, + 'default_search': opts.default_search, + 'dynamic_mpd': opts.dynamic_mpd, + 'extractor_args': opts.extractor_args, + 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'youtube_include_hls_manifest': opts.youtube_include_hls_manifest, + 'encoding': opts.encoding, + 'extract_flat': opts.extract_flat, + 'live_from_start': opts.live_from_start, + 'wait_for_video': opts.wait_for_video, + 'mark_watched': opts.mark_watched, + 'merge_output_format': opts.merge_output_format, + 'final_ext': final_ext, + 'postprocessors': postprocessors, + 'fixup': opts.fixup, + 'source_address': opts.source_address, + 'call_home': opts.call_home, + 'sleep_interval_requests': opts.sleep_interval_requests, + 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, + 'sleep_interval_subtitles': opts.sleep_interval_subtitles, + 'external_downloader': opts.external_downloader, + 'download_ranges': opts.download_ranges, + 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, + 'list_thumbnails': opts.list_thumbnails, + 'playlist_items': opts.playlist_items, + 'xattr_set_filesize': opts.xattr_set_filesize, + 'match_filter': opts.match_filter, + 'color': opts.color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, + 'hls_use_mpegts': opts.hls_use_mpegts, + 'hls_split_discontinuity': opts.hls_split_discontinuity, + 'external_downloader_args': 
opts.external_downloader_args, + 'postprocessor_args': opts.postprocessor_args, + 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + 'geo_bypass': opts.geo_bypass, + 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, + '_warnings': warnings, + '_deprecation_warnings': deprecation_warnings, + 'compat_opts': opts.compat_opts, + }) + + +def _real_main(argv=None): + setproctitle('yt-dlp') + + parser, opts, all_urls, ydl_opts = parse_options(argv) + + # Dump user agent + if opts.dump_user_agent: + ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) + write_string(f'{ua}\n', out=sys.stdout) + return + + if print_extractor_information(opts, all_urls): + return + + # We may need ffmpeg_location without having access to the YoutubeDL instance + # See https://github.com/yt-dlp/yt-dlp/issues/2191 + if opts.ffmpeg_location: + FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) + + with YoutubeDL(ydl_opts) as ydl: + pre_process = opts.update_self or opts.rm_cachedir + actual_use = all_urls or opts.load_info_filename + + if opts.rm_cachedir: + ydl.cache.remove() + + try: + updater = Updater(ydl, opts.update_self) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + except Exception: + traceback.print_exc() + ydl._download_retcode = 100 + + if not actual_use: + if pre_process: + return ydl._download_retcode + + args = sys.argv[1:] if argv is None else argv + ydl.warn_if_short_id(args) + + # Show a useful error message and wait for keypress if not launched from shell on Windows + if not args and compat_os_name == 'nt' and getattr(sys, 'frozen', False): + import ctypes.wintypes + import msvcrt + + kernel32 = ctypes.WinDLL('Kernel32') + + buffer = (1 * ctypes.wintypes.DWORD)() + attached_processes = kernel32.GetConsoleProcessList(buffer, 1) + # If we only have a single process attached, then the executable was double clicked + # When using `pyinstaller` with `--onefile`, two processes get attached + is_onefile = hasattr(sys, '_MEIPASS') and os.path.basename(sys._MEIPASS).startswith('_MEI') + if attached_processes == 1 or is_onefile and attached_processes == 2: + print(parser._generate_error_message( + 'Do not double-click the executable, instead call it from a command line.\n' + 'Please read the README for further information on how to use yt-dlp: ' + 'https://github.com/yt-dlp/yt-dlp#readme')) + msvcrt.getch() + _exit(2) + parser.error( + 'You must provide at least one URL.\n' + 'Type yt-dlp --help to see a list of all options.') + + parser.destroy() + try: + if opts.load_info_filename is not None: + if all_urls: + ydl.report_warning('URLs are ignored due to --load-info-json') + return ydl.download_with_info_file(expand_path(opts.load_info_filename)) + else: + return ydl.download(all_urls) + except DownloadCancelled: + ydl.to_screen('Aborting remaining downloads') + return 101 + + +def main(argv=None): + global _IN_CLI + _IN_CLI = True + try: + _exit(*variadic(_real_main(argv))) + except DownloadError: + _exit(1) + except SameFileError as e: + _exit(f'ERROR: {e}') + except KeyboardInterrupt: + _exit('\nERROR: Interrupted by 
user') + except BrokenPipeError as e: + # https://docs.python.org/3/library/signal.html#note-on-sigpipe + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + _exit(f'\nERROR: {e}') + except optparse.OptParseError as e: + _exit(2, f'\n{e}') + + +from .extractor import gen_extractors, list_extractors + +__all__ = [ + 'main', + 'YoutubeDL', + 'parse_options', + 'gen_extractors', + 'list_extractors', +] diff --git a/yt_dlp/__main__.py b/yt_dlp/__main__.py new file mode 100644 index 0000000..06c3920 --- /dev/null +++ b/yt_dlp/__main__.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Execute with +# $ python3 -m yt_dlp + +import sys + +if __package__ is None and not getattr(sys, 'frozen', False): + # direct call of __main__.py + import os.path + path = os.path.realpath(os.path.abspath(__file__)) + sys.path.insert(0, os.path.dirname(os.path.dirname(path))) + +import yt_dlp + +if __name__ == '__main__': + yt_dlp.main() diff --git a/yt_dlp/__pyinstaller/__init__.py b/yt_dlp/__pyinstaller/__init__.py new file mode 100644 index 0000000..1c52aad --- /dev/null +++ b/yt_dlp/__pyinstaller/__init__.py @@ -0,0 +1,5 @@ +import os + + +def get_hook_dirs(): + return [os.path.dirname(__file__)] diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py new file mode 100644 index 0000000..7c3dbfb --- /dev/null +++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -0,0 +1,34 @@ +import sys + +from PyInstaller.utils.hooks import collect_submodules + + +def pycryptodome_module(): + try: + import Cryptodome # noqa: F401 + except ImportError: + try: + import Crypto # noqa: F401 + print('WARNING: Using Crypto since Cryptodome is not available. ' + 'Install with: python3 -m pip install pycryptodomex', file=sys.stderr) + return 'Crypto' + except ImportError: + pass + return 'Cryptodome' + + +def get_hidden_imports(): + yield from ('yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated') + yield from ('yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated') + yield pycryptodome_module() + # Only `websockets` is required, others are collected just in case + for module in ('websockets', 'requests', 'urllib3'): + yield from collect_submodules(module) + # These are auto-detected, but explicitly add them just in case + yield from ('mutagen', 'brotli', 'certifi', 'secretstorage') + + +hiddenimports = list(get_hidden_imports()) +print(f'Adding imports: {hiddenimports}') + +excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts', 'bundle'] diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py new file mode 100644 index 0000000..b3a383c --- /dev/null +++ b/yt_dlp/aes.py @@ -0,0 +1,567 @@ +import base64 +from math import ceil + +from .compat import compat_ord +from .dependencies import Cryptodome +from .utils import bytes_to_intlist, intlist_to_bytes + +if Cryptodome.AES: + def aes_cbc_decrypt_bytes(data, key, iv): + """ Decrypt bytes with AES-CBC using pycryptodome """ + return Cryptodome.AES.new(key, Cryptodome.AES.MODE_CBC, iv).decrypt(data) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """ Decrypt bytes with AES-GCM using pycryptodome """ + return Cryptodome.AES.new(key, Cryptodome.AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) + +else: + def aes_cbc_decrypt_bytes(data, key, iv): + """ Decrypt bytes with AES-CBC using native implementation since pycryptodome is unavailable """ + return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv)))) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """ Decrypt 
bytes with AES-GCM using native implementation since pycryptodome is unavailable """ + return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) + + +def aes_cbc_encrypt_bytes(data, key, iv, **kwargs): + return intlist_to_bytes(aes_cbc_encrypt(*map(bytes_to_intlist, (data, key, iv)), **kwargs)) + + +BLOCK_SIZE_BYTES = 16 + + +def unpad_pkcs7(data): + return data[:-compat_ord(data[-1])] + + +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padded data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + +def pad_block(block, padding_mode): + """ + Pad a block with the given padding mode + @param {int[]} block block to pad + @param padding_mode padding mode + @returns {int[]} padded block + """ + padding_size = BLOCK_SIZE_BYTES - len(block) + + PADDING_BYTE = { + 'pkcs7': padding_size, + 'iso7816': 0x0, + 'whitespace': 0x20, + 'zero': 0x0, + } + + if padding_size < 0: + raise ValueError('Block size exceeded') + elif padding_mode not in PADDING_BYTE: + raise NotImplementedError(f'Padding mode {padding_mode} is not implemented') + + if padding_mode == 'iso7816' and padding_size: + block = block + [0x80] # NB: += mutates list + padding_size -= 1 + + return block + [PADDING_BYTE[padding_mode]] * padding_size
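For illustration, a minimal round-trip through the padding and CBC helpers above (a hedged sketch; the key and IV are arbitrary 16-byte placeholders, not values yt-dlp actually uses):

    # pkcs7_padding() appends N bytes of value N up to the 16-byte block size
    assert pkcs7_padding(list(b'hello world')) == list(b'hello world') + [5] * 5

    # aes_cbc_encrypt_bytes() applies PKCS#7 padding by default; unpad_pkcs7() undoes it
    key = bytes(range(16))  # illustrative 128-bit key
    iv = bytes(16)          # illustrative all-zero IV
    ciphertext = aes_cbc_encrypt_bytes(b'hello world', key, iv)
    assert unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, iv)) == b'hello world'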
+def aes_ecb_encrypt(data, key, iv=None): + """ + Encrypt with aes in ECB mode. Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_encrypt(pkcs7_padding(block), expanded_key) + + return encrypted_data + + +def aes_ecb_decrypt(data, key, iv=None): + """ + Decrypt with aes in ECB mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + decrypted_data += aes_decrypt(block, expanded_key) + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_ctr_decrypt(data, key, iv): + """ + Decrypt with aes in counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} decrypted data + """ + return aes_ctr_encrypt(data, key, iv) + + +def aes_ctr_encrypt(data, key, iv): + """ + Encrypt with aes in counter mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + counter = iter_vector(iv) + + encrypted_data = [] + for i in range(block_count): + counter_block = next(counter) + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + cipher_counter_block = aes_encrypt(counter_block, expanded_key) + encrypted_data += xor(block, cipher_counter_block) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_encrypt(data, key, iv, *, padding_mode='pkcs7'): + """ + Encrypt with aes in CBC mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @param padding_mode Padding mode to use + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block = pad_block(block, padding_mode) + + mixed_block = xor(block, previous_cipher_block) + + encrypted_block = aes_encrypt(mixed_block, expanded_key) + encrypted_data += encrypted_block + + previous_cipher_block = encrypted_block + + return encrypted_data + + +def aes_gcm_decrypt_and_verify(data, key, tag, nonce): + """ + Decrypt with aes in GCM mode and check authenticity using tag + + @param {int[]} data cipher + @param {int[]} key 16-Byte cipher key + @param {int[]} tag authentication tag + @param {int[]} nonce IV (recommended 12-Byte) + @returns {int[]} decrypted data + """ + + # XXX: check aes, gcm param + + hash_subkey = aes_encrypt([0] * BLOCK_SIZE_BYTES, key_expansion(key)) + + if len(nonce) == 12: + j0 = nonce + [0, 0, 0, 1] + else: + fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES + 8 + ghash_in = nonce + [0] * fill + bytes_to_intlist((8 * len(nonce)).to_bytes(8, 'big')) + j0 = ghash(hash_subkey, ghash_in) + + # TODO: add nonce support to aes_ctr_decrypt + + # nonce_ctr = j0[:12] + iv_ctr = inc(j0) + + decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr))) + pad_len = (BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES) % BLOCK_SIZE_BYTES # no padding when the cipher is already block-aligned + s_tag = ghash( + hash_subkey, + data + + [0] * pad_len # pad cipher to a whole number of blocks + + bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data + + ((len(data) * 8).to_bytes(8, 'big'))) # length of data + ) + + if tag != aes_ctr_encrypt(s_tag, key, j0): + raise ValueError('Mismatching authentication tag') + + return decrypted_data + + +def aes_encrypt(data, expanded_key): + """ + Encrypt one block with aes + + @param {int[]} data 16-Byte state + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte cipher + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + for i in range(1, rounds + 1): + data = sub_bytes(data) + data = shift_rows(data) + if i != rounds: + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX)) + data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + + return data
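As a quick sanity check of the schedule sizes referenced above (a hedged sketch, not part of the patch itself):

    # key_expansion() yields the 176/208/240-byte schedules that the one-block
    # aes_encrypt()/aes_decrypt() primitives expect, i.e. 10/12/14 AES rounds
    for key_size, expanded_size in ((16, 176), (24, 208), (32, 240)):
        schedule = key_expansion(list(range(key_size)))
        assert len(schedule) == expanded_size
        assert len(schedule) // 16 - 1 == {16: 10, 24: 12, 32: 14}[key_size]

+ + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + 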
@param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + if i != rounds: + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX_INV)) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + + return data + + +def aes_decrypt_text(data, password, key_size_bytes): + """ + Decrypt text + - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter + - The cipher key is retrieved by encrypting the first 16 Byte of 'password' + with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) + - Mode of operation is 'counter' + + @param {str} data Base64 encoded string + @param {str,unicode} password Password (will be encoded with utf-8) + @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit + @returns {str} Decrypted data + """ + NONCE_LENGTH_BYTES = 8 + + data = bytes_to_intlist(base64.b64decode(data)) + password = bytes_to_intlist(password.encode()) + + key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) + + nonce = data[:NONCE_LENGTH_BYTES] + cipher = data[NONCE_LENGTH_BYTES:] + + decrypted_data = aes_ctr_decrypt(cipher, key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)) + plaintext = intlist_to_bytes(decrypted_data) + + return plaintext + + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 
0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) +MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1), + (0x1, 0x2, 0x3, 0x1), + (0x1, 0x1, 0x2, 0x3), + (0x3, 0x1, 0x1, 0x2)) +MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9), + (0x9, 0xE, 0xB, 0xD), + (0xD, 0x9, 0xE, 0xB), + (0xB, 0xD, 0x9, 0xE)) +RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) +RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 
0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) + + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def iter_vector(iv): + while True: + yield iv + iv = inc(iv) + + +def sub_bytes(data): + return [SBOX[x] for x in data] + + +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + + +def rotate(data): + return data[1:] + [data[0]] + + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + + return data + + +def xor(data1, data2): + return [x ^ y for x, y in zip(data1, data2)] + + +def iter_mix_columns(data, matrix): + for i in (0, 4, 8, 12): + for row in matrix: + mixed = 0 + for j in range(4): + # xor is (+) and (-) + mixed ^= (0 if data[i:i + 4][j] == 0 or row[j] == 0 else + RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[data[i + j]] + RIJNDAEL_LOG_TABLE[row[j]]) % 0xFF]) + yield mixed + + +def shift_rows(data): + return [data[((column + row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] + + +def shift_rows_inv(data): + return [data[((column - row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] + + +def shift_block(data): + data_shifted = [] + + bit = 0 + for n in data: + if bit: + n |= 0x100 + bit = n & 1 + n >>= 1 + data_shifted.append(n) + + return data_shifted + + +def inc(data): + data = data[:] # copy + for i in range(len(data) - 1, -1, -1): + if data[i] == 255: + data[i] = 0 + else: + data[i] = data[i] + 1 + break + 
return data + + +def block_product(block_x, block_y): + # NIST SP 800-38D, Algorithm 1 + + if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES: + raise ValueError('Length of blocks needs to be %d bytes' % BLOCK_SIZE_BYTES) + + block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1) + block_v = block_y[:] + block_z = [0] * BLOCK_SIZE_BYTES + + for i in block_x: + for bit in range(7, -1, -1): + if i & (1 << bit): + block_z = xor(block_z, block_v) + + do_xor = block_v[-1] & 1 + block_v = shift_block(block_v) + if do_xor: + block_v = xor(block_v, block_r) + + return block_z + + +def ghash(subkey, data): + # NIST SP 800-38D, Algorithm 2 + + if len(data) % BLOCK_SIZE_BYTES: + raise ValueError('Length of data needs to be a multiple of %d bytes' % BLOCK_SIZE_BYTES) + + last_y = [0] * BLOCK_SIZE_BYTES + for i in range(0, len(data), BLOCK_SIZE_BYTES): + block = data[i: i + BLOCK_SIZE_BYTES] + last_y = block_product(xor(last_y, block), subkey) + + return last_y + + +__all__ = [ + 'aes_cbc_decrypt', + 'aes_cbc_decrypt_bytes', + 'aes_ctr_decrypt', + 'aes_decrypt_text', + 'aes_decrypt', + 'aes_ecb_decrypt', + 'aes_gcm_decrypt_and_verify', + 'aes_gcm_decrypt_and_verify_bytes', + + 'aes_cbc_encrypt', + 'aes_cbc_encrypt_bytes', + 'aes_ctr_encrypt', + 'aes_ecb_encrypt', + 'aes_encrypt', + + 'key_expansion', + 'pad_block', + 'pkcs7_padding', + 'unpad_pkcs7', +] diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py new file mode 100644 index 0000000..9dd4f2f --- /dev/null +++ b/yt_dlp/cache.py @@ -0,0 +1,91 @@ +import contextlib +import json +import os +import re +import shutil +import traceback +import urllib.parse + +from .utils import expand_path, traverse_obj, version_tuple, write_json_file +from .version import __version__ + + +class Cache: + def __init__(self, ydl): + self._ydl = ydl + + def _get_root_dir(self): + res = self._ydl.params.get('cachedir') + if res is None: + cache_root = os.getenv('XDG_CACHE_HOME', '~/.cache') + res = os.path.join(cache_root, 'yt-dlp') + return expand_path(res) + + def _get_cache_fn(self, section, key, dtype): + assert re.match(r'^[\w.-]+$', section), f'invalid section {section!r}' + key = urllib.parse.quote(key, safe='').replace('%', ',') # encode non-ascii characters + return os.path.join(self._get_root_dir(), section, f'{key}.{dtype}') + + @property + def enabled(self): + return self._ydl.params.get('cachedir') is not False + + def store(self, section, key, data, dtype='json'): + assert dtype in ('json',) + + if not self.enabled: + return + + fn = self._get_cache_fn(section, key, dtype) + try: + os.makedirs(os.path.dirname(fn), exist_ok=True) + self._ydl.write_debug(f'Saving {section}.{key} to cache') + write_json_file({'yt-dlp_version': __version__, 'data': data}, fn) + except Exception: + tb = traceback.format_exc() + self._ydl.report_warning(f'Writing cache to {fn!r} failed: {tb}') + + def _validate(self, data, min_ver): + version = traverse_obj(data, 'yt-dlp_version') + if not version: # Backward compatibility + data, version = {'data': data}, '2022.08.19' + if not min_ver or version_tuple(version) >= version_tuple(min_ver): + return data['data'] + self._ydl.write_debug(f'Discarding old cache from version {version} (needs {min_ver})') + + def load(self, section, key, dtype='json', default=None, *, min_ver=None): + assert dtype in ('json',) + + if not self.enabled: + return default + + cache_fn = self._get_cache_fn(section, key, dtype) + with contextlib.suppress(OSError): + try: + with open(cache_fn, encoding='utf-8') as cachef: + self._ydl.write_debug(f'Loading {section}.{key} from cache') + return self._validate(json.load(cachef), min_ver) + except (ValueError, KeyError): + try: + file_size = os.path.getsize(cache_fn) + except OSError as oe: + file_size = str(oe) + self._ydl.report_warning(f'Cache retrieval from {cache_fn} failed ({file_size})') + + return default + + def remove(self): + if not self.enabled: + self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') + return + + cachedir = self._get_root_dir() + if not any((term in cachedir) for term in ('cache', 'tmp')): + raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) + + self._ydl.to_screen( + 'Removing cache dir %s .' % cachedir, skip_eol=True) + if os.path.exists(cachedir): + self._ydl.to_screen('.', skip_eol=True) + shutil.rmtree(cachedir) + self._ydl.to_screen('.')
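A hedged usage sketch of the Cache API above (the section and key names are made-up examples; extractors normally reach this object as ydl.cache):

    from yt_dlp import YoutubeDL

    with YoutubeDL({'cachedir': '/tmp/yt-dlp-cache'}) as ydl:  # illustrative cache dir
        ydl.cache.store('example-section', 'example-key', {'token': 'abc'})
        assert ydl.cache.load('example-section', 'example-key') == {'token': 'abc'}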
diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py new file mode 100644 index 0000000..5ad5c70 --- /dev/null +++ b/yt_dlp/compat/__init__.py @@ -0,0 +1,79 @@ +import os +import sys +import xml.etree.ElementTree as etree + +from .compat_utils import passthrough_module + +passthrough_module(__name__, '._deprecated') +del passthrough_module + + +# HTMLParseError was deprecated in Python 3.3 and removed in Python 3.5. +# Introducing a dummy exception on Python 3.5+ allows compatible and +# uniform cross-version exception handling +class compat_HTMLParseError(ValueError): + pass + + +class _TreeBuilder(etree.TreeBuilder): + def doctype(self, name, pubid, system): + pass + + +def compat_etree_fromstring(text): + return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) + + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': + def compat_shlex_quote(s): + import re + return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""') +else: + from shlex import quote as compat_shlex_quote # noqa: F401 + + +def compat_ord(c): + return c if isinstance(c, int) else ord(c) + + +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return os.path.realpath(path) +else: + compat_realpath = os.path.realpath + + +# Python 3.8+ does not honor %HOME% on Windows, but this breaks compatibility with youtube-dl +# See https://github.com/yt-dlp/yt-dlp/issues/792 +# https://docs.python.org/3/library/os.path.html#os.path.expanduser +if compat_os_name in ('nt', 'ce'): + def compat_expanduser(path): + HOME = os.environ.get('HOME') + if not HOME: + return os.path.expanduser(path) + elif not path.startswith('~'): + return path + i = path.replace('\\', '/', 1).find('/') # ~user + if i < 0: + i = len(path) + userhome = os.path.join(os.path.dirname(HOME), path[1:i]) if i > 1 else HOME + return userhome + path[i:] +else: + compat_expanduser = os.path.expanduser + + +def urllib_req_to_req(urllib_request): + """Convert urllib Request to a networking Request""" + from ..networking import Request + from ..utils.networking import HTTPHeaderDict + return Request( + urllib_request.get_full_url(), data=urllib_request.data, method=urllib_request.get_method(), + headers=HTTPHeaderDict(urllib_request.headers, urllib_request.unredirected_hdrs), + extensions={'timeout': urllib_request.timeout} if hasattr(urllib_request, 'timeout') else None) diff --git a/yt_dlp/compat/_deprecated.py 
b/yt_dlp/compat/_deprecated.py new file mode 100644 index 0000000..607bae9 --- /dev/null +++ b/yt_dlp/compat/_deprecated.py @@ -0,0 +1,23 @@ +"""Deprecated - New code should avoid these""" +import warnings + +from .compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) +del passthrough_module + +import base64 +import urllib.error +import urllib.parse + +compat_str = str + +compat_b64decode = base64.b64decode + +compat_urlparse = urllib.parse +compat_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py new file mode 100644 index 0000000..7ea5d08 --- /dev/null +++ b/yt_dlp/compat/_legacy.py @@ -0,0 +1,108 @@ +""" Do not use! """ + +import base64 +import collections +import ctypes +import getpass +import html.entities +import html.parser +import http.client +import http.cookiejar +import http.cookies +import http.server +import itertools +import os +import shlex +import shutil +import socket +import struct +import subprocess +import tokenize +import urllib.error +import urllib.parse +import urllib.request +import xml.etree.ElementTree as etree + +# isort: split +import asyncio # noqa: F401 +import re # noqa: F401 +from asyncio import run as compat_asyncio_run # noqa: F401 +from re import Pattern as compat_Pattern # noqa: F401 +from re import match as compat_Match # noqa: F401 + +from . import compat_expanduser, compat_HTMLParseError, compat_realpath +from .compat_utils import passthrough_module +from ..dependencies import brotli as compat_brotli # noqa: F401 +from ..dependencies import websockets as compat_websockets # noqa: F401 +from ..dependencies.Cryptodome import AES as compat_pycrypto_AES # noqa: F401 +from ..networking.exceptions import HTTPError as compat_HTTPError # noqa: F401 + +passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) + + +# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE +# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines +def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + +def compat_setenv(key, value, env=os.environ): + env[key] = value + + +compat_base64_b64decode = base64.b64decode +compat_basestring = str +compat_casefold = str.casefold +compat_chr = chr +compat_collections_abc = collections.abc +compat_cookiejar = compat_http_cookiejar = http.cookiejar +compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = compat_http_cookies = http.cookies +compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element +compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace +compat_filter = filter +compat_get_terminal_size = shutil.get_terminal_size +compat_getenv = os.getenv +compat_getpass = compat_getpass_getpass = getpass.getpass +compat_html_entities = html.entities +compat_html_entities_html5 = html.entities.html5 +compat_html_parser_HTMLParseError = compat_HTMLParseError +compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser 
+compat_http_client = http.client +compat_http_server = http.server +compat_input = input +compat_integer_types = (int, ) +compat_itertools_count = itertools.count +compat_kwargs = lambda kwargs: kwargs +compat_map = map +compat_numeric_types = (int, float, complex) +compat_os_path_expanduser = compat_expanduser +compat_os_path_realpath = compat_realpath +compat_print = print +compat_shlex_split = shlex.split +compat_socket_create_connection = socket.create_connection +compat_Struct = struct.Struct +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack +compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL +compat_tokenize_tokenize = tokenize.tokenize +compat_urllib_error = urllib.error +compat_urllib_HTTPError = compat_HTTPError +compat_urllib_parse = urllib.parse +compat_urllib_parse_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_quote = urllib.parse.quote +compat_urllib_parse_quote_plus = urllib.parse.quote_plus +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus +compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes +compat_urllib_parse_urlunparse = urllib.parse.urlunparse +compat_urllib_request = urllib.request +compat_urllib_request_DataHandler = urllib.request.DataHandler +compat_urllib_response = urllib.response +compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError +compat_xpath = lambda xpath: xpath +compat_zip = zip +workaround_optparse_bug9161 = lambda: None + +legacy = [] diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py new file mode 100644 index 0000000..d62b7d0 --- /dev/null +++ b/yt_dlp/compat/compat_utils.py @@ -0,0 +1,83 @@ +import collections +import contextlib +import functools +import importlib +import sys +import types + +_NO_ATTRIBUTE = object() + +_Package = collections.namedtuple('Package', ('name', 'version')) + + +def get_package_info(module): + return _Package( + name=getattr(module, '_yt_dlp__identifier', module.__name__), + version=str(next(filter(None, ( + getattr(module, attr, None) + for attr in ('_yt_dlp__version', '__version__', 'version_string', 'version') + )), None))) + + +def _is_package(module): + return '__path__' in vars(module) + + +def _is_dunder(name): + return name.startswith('__') and name.endswith('__') + + +class EnhancedModule(types.ModuleType): + def __bool__(self): + return vars(self).get('__bool__', lambda: True)() + + def __getattribute__(self, attr): + try: + ret = super().__getattribute__(attr) + except AttributeError: + if _is_dunder(attr): + raise + getter = getattr(self, '__getattr__', None) + if not getter: + raise + ret = getter(attr) + return ret.fget() if isinstance(ret, property) else ret + + +def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=lambda _: None): + """Passthrough parent module into a child module, creating the parent if necessary""" + def __getattr__(attr): + if _is_package(parent): + with contextlib.suppress(ModuleNotFoundError): + return importlib.import_module(f'.{attr}', parent.__name__) + + ret = from_child(attr) + if ret is _NO_ATTRIBUTE: + raise AttributeError(f'module {parent.__name__} has no attribute {attr}') + callback(attr) + return ret + + @functools.lru_cache(maxsize=None) + def from_child(attr): + nonlocal child + if attr not in allowed_attributes: + if ... 
not in allowed_attributes or _is_dunder(attr): + return _NO_ATTRIBUTE + + if isinstance(child, str): + child = importlib.import_module(child, parent.__name__) + + if _is_package(child): + with contextlib.suppress(ImportError): + return passthrough_module(f'{parent.__name__}.{attr}', + importlib.import_module(f'.{attr}', child.__name__)) + + with contextlib.suppress(AttributeError): + return getattr(child, attr) + + return _NO_ATTRIBUTE + + parent = sys.modules.get(parent, types.ModuleType(parent)) + parent.__class__ = EnhancedModule + parent.__getattr__ = __getattr__ + return parent diff --git a/yt_dlp/compat/functools.py b/yt_dlp/compat/functools.py new file mode 100644 index 0000000..36c9836 --- /dev/null +++ b/yt_dlp/compat/functools.py @@ -0,0 +1,12 @@ +# flake8: noqa: F405 +from functools import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'functools') +del passthrough_module + +try: + cache # >= 3.9 +except NameError: + cache = lru_cache(maxsize=None) diff --git a/yt_dlp/compat/imghdr.py b/yt_dlp/compat/imghdr.py new file mode 100644 index 0000000..5d64ab0 --- /dev/null +++ b/yt_dlp/compat/imghdr.py @@ -0,0 +1,16 @@ +tests = { + 'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP', + 'png': lambda h: h[:8] == b'\211PNG\r\n\032\n', + 'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'), + 'gif': lambda h: h[:6] in (b'GIF87a', b'GIF89a'), +} + + +def what(file=None, h=None): + """Detect format of image (Currently supports jpeg, png, webp, gif only) + Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py + """ + if h is None: + with open(file, 'rb') as f: + h = f.read(12) + return next((type_ for type_, test in tests.items() if test(h)), None) diff --git a/yt_dlp/compat/shutil.py b/yt_dlp/compat/shutil.py new file mode 100644 index 0000000..23239d5 --- /dev/null +++ b/yt_dlp/compat/shutil.py @@ -0,0 +1,30 @@ +# flake8: noqa: F405 +from shutil import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'shutil') +del passthrough_module + + +import sys + +if sys.platform.startswith('freebsd'): + import errno + import os + import shutil + + # Workaround for PermissionError when using restricted ACL mode on FreeBSD + def copy2(src, dst, *args, **kwargs): + if os.path.isdir(dst): + dst = os.path.join(dst, os.path.basename(src)) + shutil.copyfile(src, dst, *args, **kwargs) + try: + shutil.copystat(src, dst, *args, **kwargs) + except PermissionError as e: + if e.errno != getattr(errno, 'EPERM', None): + raise + return dst + + def move(*args, copy_function=copy2, **kwargs): + return shutil.move(*args, copy_function=copy_function, **kwargs) diff --git a/yt_dlp/compat/types.py b/yt_dlp/compat/types.py new file mode 100644 index 0000000..4aa3b0e --- /dev/null +++ b/yt_dlp/compat/types.py @@ -0,0 +1,13 @@ +# flake8: noqa: F405 +from types import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'types') +del passthrough_module + +try: + # NB: pypy has builtin NoneType, so checking NameError won't work + from types import NoneType # >= 3.10 +except ImportError: + NoneType = type(None) diff --git a/yt_dlp/compat/urllib/__init__.py b/yt_dlp/compat/urllib/__init__.py new file mode 100644 index 0000000..9084b3c --- /dev/null +++ b/yt_dlp/compat/urllib/__init__.py @@ -0,0 +1,10 @@ +# flake8: noqa: F405 +from urllib import * # noqa: F403 + +del request # noqa: F821 +from . 
import request # noqa: F401 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib') +del passthrough_module diff --git a/yt_dlp/compat/urllib/request.py b/yt_dlp/compat/urllib/request.py new file mode 100644 index 0000000..ad9fa83 --- /dev/null +++ b/yt_dlp/compat/urllib/request.py @@ -0,0 +1,40 @@ +# flake8: noqa: F405 +from urllib.request import * # noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib.request') +del passthrough_module + + +from .. import compat_os_name + +if compat_os_name == 'nt': + # On older Python versions, proxies are extracted from Windows registry erroneously. [1] + # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] + # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade + # it to http on these older Python versions to avoid issues + # This also applies for ftp proxy type, as ftp:// proxy scheme is not supported. + # 1: https://github.com/python/cpython/issues/86793 + # 2: https://github.com/python/cpython/blob/51f1ae5ceb0673316c4e4b0175384e892e33cc6e/Lib/urllib/request.py#L2683-L2698 + import sys + from urllib.request import getproxies_environment, getproxies_registry + + def getproxies_registry_patched(): + proxies = getproxies_registry() + if ( + sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final + or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final + ): + return proxies + + for scheme in ('https', 'ftp'): + if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): + proxies[scheme] = 'http' + proxies[scheme][len(scheme):] + + return proxies + + def getproxies(): + return getproxies_environment() or getproxies_registry_patched() + +del compat_os_name diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py new file mode 100644 index 0000000..28d174a --- /dev/null +++ b/yt_dlp/cookies.py @@ -0,0 +1,1346 @@ +import base64 +import collections +import contextlib +import glob +import http.cookiejar +import http.cookies +import io +import json +import os +import re +import shutil +import struct +import subprocess +import sys +import tempfile +import time +import urllib.request +from datetime import datetime, timedelta, timezone +from enum import Enum, auto +from hashlib import pbkdf2_hmac + +from .aes import ( + aes_cbc_decrypt_bytes, + aes_gcm_decrypt_and_verify_bytes, + unpad_pkcs7, +) +from .compat import functools # isort: split +from .compat import compat_os_name +from .dependencies import ( + _SECRETSTORAGE_UNAVAILABLE_REASON, + secretstorage, + sqlite3, +) +from .minicurses import MultilinePrinter, QuietMultilinePrinter +from .utils import ( + DownloadError, + Popen, + error_to_str, + expand_path, + is_path_like, + sanitize_url, + str_or_none, + try_call, + write_string, +) +from .utils._utils import _YDLLogger +from .utils.networking import normalize_url + +CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} +SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} + + +class YDLLogger(_YDLLogger): + def warning(self, message, only_once=False): # compat + return super().warning(message, once=only_once) + + class ProgressBar(MultilinePrinter): + _DELAY, _timer = 0.1, 0 + + def print(self, message): + if time.time() - self._timer > self._DELAY: + self.print_at_line(f'[Cookies] {message}', 0) + self._timer 
= time.time() + + def progress_bar(self): + """Return a context manager with a print method. (Optional)""" + # Do not print to files/pipes, loggers, or when --no-progress is used + if not self._ydl or self._ydl.params.get('noprogress') or self._ydl.params.get('logger'): + return + file = self._ydl._out_files.error + try: + if not file.isatty(): + return + except BaseException: + return + return self.ProgressBar(file, preserve_output=False) + + +def _create_progress_bar(logger): + if hasattr(logger, 'progress_bar'): + printer = logger.progress_bar() + if printer: + return printer + printer = QuietMultilinePrinter() + printer.print = lambda _: None + return printer + + +def load_cookies(cookie_file, browser_specification, ydl): + cookie_jars = [] + if browser_specification is not None: + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) + + if cookie_file is not None: + is_filename = is_path_like(cookie_file) + if is_filename: + cookie_file = expand_path(cookie_file) + + jar = YoutubeDLCookieJar(cookie_file) + if not is_filename or os.access(cookie_file, os.R_OK): + jar.load() + cookie_jars.append(jar) + + return _merge_cookie_jars(cookie_jars) + + +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): + if browser_name == 'firefox': + return _extract_firefox_cookies(profile, container, logger) + elif browser_name == 'safari': + return _extract_safari_cookies(profile, logger) + elif browser_name in CHROMIUM_BASED_BROWSERS: + return _extract_chrome_cookies(browser_name, profile, keyring, logger) + else: + raise ValueError(f'unknown browser: {browser_name}') + + +def _extract_firefox_cookies(profile, container, logger): + logger.info('Extracting cookies from firefox') + if not sqlite3: + logger.warning('Cannot extract cookies from firefox without sqlite3 support. 
' + 'Please use a Python interpreter compiled with sqlite3 support') + return YoutubeDLCookieJar() + + if profile is None: + search_roots = list(_firefox_browser_dirs()) + elif _is_path(profile): + search_roots = [profile] + else: + search_roots = [os.path.join(path, profile) for path in _firefox_browser_dirs()] + search_root = ', '.join(map(repr, search_roots)) + + cookie_database_path = _newest(_firefox_cookie_dbs(search_roots)) + if cookie_database_path is None: + raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') + + container_id = None + if container not in (None, 'none'): + containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') + if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): + raise FileNotFoundError(f'could not read containers.json in {search_root}') + with open(containers_path, encoding='utf8') as containers: + identities = json.load(containers).get('identities', []) + container_id = next((context.get('userContextId') for context in identities if container in ( + context.get('name'), + try_call(lambda: re.fullmatch(r'userContext([^\.]+)\.label', context['l10nID']).group()) + )), None) + if not isinstance(container_id, int): + raise ValueError(f'could not find firefox container "{container}" in containers.json') + + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: + cursor = None + try: + cursor = _open_database_copy(cookie_database_path, tmpdir) + if isinstance(container_id, int): + logger.debug( + f'Only loading cookies from firefox container "{container}", ID {container_id}') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes LIKE ? 
OR originAttributes LIKE ?', + (f'%userContextId={container_id}', f'%userContextId={container_id}&%')) + elif container == 'none': + logger.debug('Only loading cookies not belonging to any container') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE NOT INSTR(originAttributes,"userContextId=")') + else: + cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') + jar = YoutubeDLCookieJar() + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, (host, name, value, path, expiry, is_secure) in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + cookie = http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + logger.info(f'Extracted {len(jar)} cookies from firefox') + return jar + finally: + if cursor is not None: + cursor.connection.close() + + +def _firefox_browser_dirs(): + if sys.platform in ('cygwin', 'win32'): + yield os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') + + elif sys.platform == 'darwin': + yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') + + else: + yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox')) + + +def _firefox_cookie_dbs(roots): + for root in map(os.path.abspath, roots): + for pattern in ('', '*/', 'Profiles/*/'): + yield from glob.iglob(os.path.join(root, pattern, 'cookies.sqlite')) + + +def _get_chromium_based_browser_settings(browser_name): + # https://chromium.googlesource.com/chromium/src/+/HEAD/docs/user_data_dir.md + if sys.platform in ('cygwin', 'win32'): + appdata_local = os.path.expandvars('%LOCALAPPDATA%') + appdata_roaming = os.path.expandvars('%APPDATA%') + browser_dir = { + 'brave': os.path.join(appdata_local, R'BraveSoftware\Brave-Browser\User Data'), + 'chrome': os.path.join(appdata_local, R'Google\Chrome\User Data'), + 'chromium': os.path.join(appdata_local, R'Chromium\User Data'), + 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), + 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), + 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), + }[browser_name] + + elif sys.platform == 'darwin': + appdata = os.path.expanduser('~/Library/Application Support') + browser_dir = { + 'brave': os.path.join(appdata, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(appdata, 'Google/Chrome'), + 'chromium': os.path.join(appdata, 'Chromium'), + 'edge': os.path.join(appdata, 'Microsoft Edge'), + 'opera': os.path.join(appdata, 'com.operasoftware.Opera'), + 'vivaldi': os.path.join(appdata, 'Vivaldi'), + }[browser_name] + + else: + config = _config_home() + browser_dir = { + 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(config, 'google-chrome'), + 'chromium': os.path.join(config, 'chromium'), + 'edge': os.path.join(config, 'microsoft-edge'), + 'opera': os.path.join(config, 'opera'), + 'vivaldi': os.path.join(config, 'vivaldi'), + }[browser_name] + + # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + keyring_name = { + 'brave': 
'Brave', + 'chrome': 'Chrome', + 'chromium': 'Chromium', + 'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium', + 'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium', + 'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome', + }[browser_name] + + browsers_without_profiles = {'opera'} + + return { + 'browser_dir': browser_dir, + 'keyring_name': keyring_name, + 'supports_profiles': browser_name not in browsers_without_profiles + } + + +def _extract_chrome_cookies(browser_name, profile, keyring, logger): + logger.info(f'Extracting cookies from {browser_name}') + + if not sqlite3: + logger.warning(f'Cannot extract cookies from {browser_name} without sqlite3 support. ' + 'Please use a Python interpreter compiled with sqlite3 support') + return YoutubeDLCookieJar() + + config = _get_chromium_based_browser_settings(browser_name) + + if profile is None: + search_root = config['browser_dir'] + elif _is_path(profile): + search_root = profile + config['browser_dir'] = os.path.dirname(profile) if config['supports_profiles'] else profile + else: + if config['supports_profiles']: + search_root = os.path.join(config['browser_dir'], profile) + else: + logger.error(f'{browser_name} does not support profiles') + search_root = config['browser_dir'] + + cookie_database_path = _newest(_find_files(search_root, 'Cookies', logger)) + if cookie_database_path is None: + raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') + + decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) + + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: + cursor = None + try: + cursor = _open_database_copy(cookie_database_path, tmpdir) + cursor.connection.text_factory = bytes + column_names = _get_column_names(cursor, 'cookies') + secure_column = 'is_secure' if 'is_secure' in column_names else 'secure' + cursor.execute(f'SELECT host_key, name, value, encrypted_value, path, expires_utc, {secure_column} FROM cookies') + jar = YoutubeDLCookieJar() + failed_cookies = 0 + unencrypted_cookies = 0 + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, line in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + is_encrypted, cookie = _process_chrome_cookie(decryptor, *line) + if not cookie: + failed_cookies += 1 + continue + elif not is_encrypted: + unencrypted_cookies += 1 + jar.set_cookie(cookie) + if failed_cookies > 0: + failed_message = f' ({failed_cookies} could not be decrypted)' + else: + failed_message = '' + logger.info(f'Extracted {len(jar)} cookies from {browser_name}{failed_message}') + counts = decryptor._cookie_counts.copy() + counts['unencrypted'] = unencrypted_cookies + logger.debug(f'cookie version breakdown: {counts}') + return jar + except PermissionError as error: + if compat_os_name == 'nt' and error.errno == 13: + message = 'Could not copy Chrome cookie database. 
See https://github.com/yt-dlp/yt-dlp/issues/7271 for more info' + logger.error(message) + raise DownloadError(message) # force exit + raise + finally: + if cursor is not None: + cursor.connection.close() + + +def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, path, expires_utc, is_secure): + host_key = host_key.decode() + name = name.decode() + value = value.decode() + path = path.decode() + is_encrypted = not value and encrypted_value + + if is_encrypted: + value = decryptor.decrypt(encrypted_value) + if value is None: + return is_encrypted, None + + return is_encrypted, http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, + comment=None, comment_url=None, rest={}) + + +class ChromeCookieDecryptor: + """ + Overview: + + Linux: + - cookies are either v10 or v11 + - v10: AES-CBC encrypted with a fixed key + - also attempts empty password if decryption fails + - v11: AES-CBC encrypted with an OS protected key (keyring) + - also attempts empty password if decryption fails + - v11 keys can be stored in various places depending on the active desktop environment [2] + + Mac: + - cookies are either v10 or not v10 + - v10: AES-CBC encrypted with an OS protected key (keyring) and more key derivation iterations than Linux + - not v10: 'old data' stored as plaintext + + Windows: + - cookies are either v10 or not v10 + - v10: AES-GCM encrypted with a key which is encrypted with DPAPI + - not v10: encrypted with DPAPI + + Sources: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/ + - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_linux.cc + - KeyStorageLinux::CreateService + """ + + _cookie_counts = {} + + def decrypt(self, encrypted_value): + raise NotImplementedError('Must be implemented by subclasses') + + +def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): + if sys.platform == 'darwin': + return MacChromeCookieDecryptor(browser_keyring_name, logger) + elif sys.platform in ('win32', 'cygwin'): + return WindowsChromeCookieDecryptor(browser_root, logger) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) + + +class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_keyring_name, logger, *, keyring=None): + self._logger = logger + self._v10_key = self.derive_key(b'peanuts') + self._empty_key = self.derive_key(b'') + self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} + self._browser_keyring_name = browser_keyring_name + self._keyring = keyring + + @functools.cached_property + def _v11_key(self): + password = _get_linux_keyring_password(self._browser_keyring_name, self._keyring, self._logger) + return None if password is None else self.derive_key(password) + + @staticmethod + def derive_key(password): + # values from + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_linux.cc + return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16)
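For reference, the fixed Linux v10 key derived above is plain PBKDF2-SHA1 and can be reproduced with the standard library alone (a hedged sketch mirroring derive_key(); pbkdf2_sha1 in this module is assumed to be a thin wrapper over hashlib.pbkdf2_hmac):

    from hashlib import pbkdf2_hmac

    v10_key = pbkdf2_hmac('sha1', b'peanuts', b'saltysalt', 1, 16)
    assert len(v10_key) == 16  # 128-bit AES-CBC key

+ + def decrypt(self, encrypted_value): + """ + + following the same approach as the fix in [1]: if cookies fail to decrypt then attempt to decrypt + with an empty password. 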
The failure detection is not the same as what chromium uses so the + results won't be perfect + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/ + - a bugfix to try an empty password as a fallback + """ + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b'v10': + self._cookie_counts['v10'] += 1 + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger) + + elif version == b'v11': + self._cookie_counts['v11'] += 1 + if self._v11_key is None: + self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) + return None + return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger) + + else: + self._logger.warning(f'unknown cookie version: "{version}"', only_once=True) + self._cookie_counts['other'] += 1 + return None + + +class MacChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_keyring_name, logger): + self._logger = logger + password = _get_mac_keyring_password(browser_keyring_name, logger) + self._v10_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'other': 0} + + @staticmethod + def derive_key(password): + # values from + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm + return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b'v10': + self._cookie_counts['v10'] += 1 + if self._v10_key is None: + self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) + return None + + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger) + + else: + self._cookie_counts['other'] += 1 + # other prefixes are considered 'old data' which were stored as plaintext + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm + return encrypted_value + + +class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): + def __init__(self, browser_root, logger): + self._logger = logger + self._v10_key = _get_windows_v10_key(browser_root, logger) + self._cookie_counts = {'v10': 0, 'other': 0} + + def decrypt(self, encrypted_value): + version = encrypted_value[:3] + ciphertext = encrypted_value[3:] + + if version == b'v10': + self._cookie_counts['v10'] += 1 + if self._v10_key is None: + self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) + return None + + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + # kNonceLength + nonce_length = 96 // 8 + # boringssl + # EVP_AEAD_AES_GCM_TAG_LEN + authentication_tag_length = 16 + + raw_ciphertext = ciphertext + nonce = raw_ciphertext[:nonce_length] + ciphertext = raw_ciphertext[nonce_length:-authentication_tag_length] + authentication_tag = raw_ciphertext[-authentication_tag_length:] + + return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) + + else: + self._cookie_counts['other'] += 1 + # any other prefix means the data is DPAPI encrypted + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() + + +def _extract_safari_cookies(profile, logger): + if sys.platform != 'darwin': + 
raise ValueError(f'unsupported platform: {sys.platform}')
+
+    if profile:
+        cookies_path = os.path.expanduser(profile)
+        if not os.path.isfile(cookies_path):
+            raise FileNotFoundError('custom safari cookies database not found')
+
+    else:
+        cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies')
+
+        if not os.path.isfile(cookies_path):
+            logger.debug('Trying secondary cookie location')
+            cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies')
+            if not os.path.isfile(cookies_path):
+                raise FileNotFoundError('could not find safari cookies database')
+
+    with open(cookies_path, 'rb') as f:
+        cookies_data = f.read()
+
+    jar = parse_safari_cookies(cookies_data, logger=logger)
+    logger.info(f'Extracted {len(jar)} cookies from safari')
+    return jar
+
+
+class ParserError(Exception):
+    pass
+
+
+class DataParser:
+    def __init__(self, data, logger):
+        self._data = data
+        self.cursor = 0
+        self._logger = logger
+
+    def read_bytes(self, num_bytes):
+        if num_bytes < 0:
+            raise ParserError(f'invalid read of {num_bytes} bytes')
+        end = self.cursor + num_bytes
+        if end > len(self._data):
+            raise ParserError('reached end of input')
+        data = self._data[self.cursor:end]
+        self.cursor = end
+        return data
+
+    def expect_bytes(self, expected_value, message):
+        value = self.read_bytes(len(expected_value))
+        if value != expected_value:
+            raise ParserError(f'unexpected value: {value} != {expected_value} ({message})')
+
+    def read_uint(self, big_endian=False):
+        data_format = '>I' if big_endian else '<I'
+        return struct.unpack(data_format, self.read_bytes(4))[0]
+
+    def read_double(self, big_endian=False):
+        data_format = '>d' if big_endian else '<d'
+        return struct.unpack(data_format, self.read_bytes(8))[0]
+
+    def read_cstring(self):
+        buffer = []
+        while True:
+            c = self.read_bytes(1)
+            if c == b'\x00':
+                return b''.join(buffer).decode()
+            else:
+                buffer.append(c)
+
+    def skip(self, num_bytes, description='unknown'):
+        if num_bytes > 0:
+            self._logger.debug(f'skipping {num_bytes} bytes ({description}): {self.read_bytes(num_bytes)!r}')
+        elif num_bytes < 0:
+            raise ParserError(f'invalid skip of {num_bytes} bytes')
+
+    def skip_to(self, offset, description='unknown'):
+        self.skip(offset - self.cursor, description)
+
+    def skip_to_end(self, description='unknown'):
+        self.skip_to(len(self._data), description)
+
+
+def _mac_absolute_time_to_posix(timestamp):
+    return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=timestamp)).timestamp())
+
+
+def _parse_safari_cookies_header(data, logger):
+    p = DataParser(data, logger)
+    p.expect_bytes(b'cook', 'database signature')
+    number_of_pages = p.read_uint(big_endian=True)
+    page_sizes = [p.read_uint(big_endian=True) for _ in range(number_of_pages)]
+    return page_sizes, p.cursor
+
+
+def _parse_safari_cookies_page(data, jar, logger):
+    p = DataParser(data, logger)
+    p.expect_bytes(b'\x00\x00\x01\x00', 'page signature')
+    number_of_cookies = p.read_uint()
+    record_offsets = [p.read_uint() for _ in range(number_of_cookies)]
+    if number_of_cookies == 0:
+        logger.debug(f'a cookies page of size {len(data)} has no cookies')
+        return
+
+    p.skip_to(record_offsets[0], 'unknown page header field')
+
+    with _create_progress_bar(logger) as progress_bar:
+        for i, record_offset in enumerate(record_offsets):
+            progress_bar.print(f'Loading cookie {i: 6d}/{number_of_cookies: 6d}')
+            p.skip_to(record_offset, 'space between records')
+            record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger)
+            p.read_bytes(record_length)
+    p.skip_to_end('space in between pages')
+
+
+def _parse_safari_cookies_record(data, jar, logger):
+    p = DataParser(data, logger)
+    record_size = p.read_uint()
+    p.skip(4, 'unknown record field 1')
+    flags = p.read_uint()
+    is_secure = bool(flags & 0x0001)
+    p.skip(4, 'unknown record field 2')
+    domain_offset = p.read_uint()
+    name_offset = p.read_uint()
+    path_offset =
p.read_uint() + value_offset = p.read_uint() + p.skip(8, 'unknown record field 3') + expiration_date = _mac_absolute_time_to_posix(p.read_double()) + _creation_date = _mac_absolute_time_to_posix(p.read_double()) # noqa: F841 + + try: + p.skip_to(domain_offset) + domain = p.read_cstring() + + p.skip_to(name_offset) + name = p.read_cstring() + + p.skip_to(path_offset) + path = p.read_cstring() + + p.skip_to(value_offset) + value = p.read_cstring() + except UnicodeDecodeError: + logger.warning('failed to parse Safari cookie because UTF-8 decoding failed', only_once=True) + return record_size + + p.skip_to(record_size, 'space at the end of the record') + + cookie = http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiration_date, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + return record_size + + +def parse_safari_cookies(data, jar=None, logger=YDLLogger()): + """ + References: + - https://github.com/libyal/dtformats/blob/main/documentation/Safari%20Cookies.asciidoc + - this data appears to be out of date but the important parts of the database structure is the same + - there are a few bytes here and there which are skipped during parsing + """ + if jar is None: + jar = YoutubeDLCookieJar() + page_sizes, body_start = _parse_safari_cookies_header(data, logger) + p = DataParser(data[body_start:], logger) + for page_size in page_sizes: + _parse_safari_cookies_page(p.read_bytes(page_size), jar, logger) + p.skip_to_end('footer') + return jar + + +class _LinuxDesktopEnvironment(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.h + DesktopEnvironment + """ + OTHER = auto() + CINNAMON = auto() + DEEPIN = auto() + GNOME = auto() + KDE3 = auto() + KDE4 = auto() + KDE5 = auto() + KDE6 = auto() + PANTHEON = auto() + UKUI = auto() + UNITY = auto() + XFCE = auto() + LXQT = auto() + + +class _LinuxKeyring(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h + SelectedLinuxBackend + """ + KWALLET = auto() # KDE4 + KWALLET5 = auto() + KWALLET6 = auto() + GNOMEKEYRING = auto() + BASICTEXT = auto() + + +SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() + + +def _get_linux_desktop_environment(env, logger): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc + GetDesktopEnvironment + """ + xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) + desktop_session = env.get('DESKTOP_SESSION', None) + if xdg_current_desktop is not None: + xdg_current_desktop = xdg_current_desktop.split(':')[0].strip() + + if xdg_current_desktop == 'Unity': + if desktop_session is not None and 'gnome-fallback' in desktop_session: + return _LinuxDesktopEnvironment.GNOME + else: + return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'Deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif xdg_current_desktop == 'GNOME': + return _LinuxDesktopEnvironment.GNOME + elif xdg_current_desktop == 'X-Cinnamon': + return _LinuxDesktopEnvironment.CINNAMON + elif xdg_current_desktop == 'KDE': + kde_version = env.get('KDE_SESSION_VERSION', None) + if kde_version == '5': + return _LinuxDesktopEnvironment.KDE5 + elif kde_version == '6': + return _LinuxDesktopEnvironment.KDE6 + elif kde_version == '4': + return 
_LinuxDesktopEnvironment.KDE4 + else: + logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4') + return _LinuxDesktopEnvironment.KDE4 + elif xdg_current_desktop == 'Pantheon': + return _LinuxDesktopEnvironment.PANTHEON + elif xdg_current_desktop == 'XFCE': + return _LinuxDesktopEnvironment.XFCE + elif xdg_current_desktop == 'UKUI': + return _LinuxDesktopEnvironment.UKUI + elif xdg_current_desktop == 'LXQt': + return _LinuxDesktopEnvironment.LXQT + else: + logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + + elif desktop_session is not None: + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI + else: + logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + + else: + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + return _LinuxDesktopEnvironment.OTHER + + +def _choose_linux_keyring(logger): + """ + SelectBackend in [1] + + There is currently support for forcing chromium to use BASIC_TEXT by creating a file called + `Disable Local Encryption` [1] in the user data dir. The function to write this file (`WriteBackendUse()` [1]) + does not appear to be called anywhere other than in tests, so the user would have to create this file manually + and so would be aware enough to tell yt-dlp to use the BASIC_TEXT keyring. + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.cc + """ + desktop_environment = _get_linux_desktop_environment(os.environ, logger) + logger.debug(f'detected desktop environment: {desktop_environment.name}') + if desktop_environment == _LinuxDesktopEnvironment.KDE4: + linux_keyring = _LinuxKeyring.KWALLET + elif desktop_environment == _LinuxDesktopEnvironment.KDE5: + linux_keyring = _LinuxKeyring.KWALLET5 + elif desktop_environment == _LinuxDesktopEnvironment.KDE6: + linux_keyring = _LinuxKeyring.KWALLET6 + elif desktop_environment in ( + _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER + ): + linux_keyring = _LinuxKeyring.BASICTEXT + else: + linux_keyring = _LinuxKeyring.GNOMEKEYRING + return linux_keyring + + +def _get_kwallet_network_wallet(keyring, logger): + """ The name of the wallet used to store network passwords. 
+
+    https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/kwallet_dbus.cc
+    KWalletDBus::NetworkWallet
+    which does a dbus call to the following function:
+    https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html
+    Wallet::NetworkWallet
+    """
+    default_wallet = 'kdewallet'
+    try:
+        if keyring == _LinuxKeyring.KWALLET:
+            service_name = 'org.kde.kwalletd'
+            wallet_path = '/modules/kwalletd'
+        elif keyring == _LinuxKeyring.KWALLET5:
+            service_name = 'org.kde.kwalletd5'
+            wallet_path = '/modules/kwalletd5'
+        elif keyring == _LinuxKeyring.KWALLET6:
+            service_name = 'org.kde.kwalletd6'
+            wallet_path = '/modules/kwalletd6'
+        else:
+            raise ValueError(keyring)
+
+        stdout, _, returncode = Popen.run([
+            'dbus-send', '--session', '--print-reply=literal',
+            f'--dest={service_name}',
+            wallet_path,
+            'org.kde.KWallet.networkWallet'
+        ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+        if returncode:
+            logger.warning('failed to read NetworkWallet')
+            return default_wallet
+        else:
+            logger.debug(f'NetworkWallet = "{stdout.strip()}"')
+            return stdout.strip()
+    except Exception as e:
+        logger.warning(f'exception while obtaining NetworkWallet: {e}')
+        return default_wallet
+
+
+def _get_kwallet_password(browser_keyring_name, keyring, logger):
+    logger.debug(f'using kwallet-query to obtain password from {keyring.name}')
+
+    if shutil.which('kwallet-query') is None:
+        logger.error('kwallet-query command not found. KWallet and kwallet-query '
+                     'must be installed to read from KWallet. kwallet-query should be '
+                     'included in the kwallet package for your distribution')
+        return b''
+
+    network_wallet = _get_kwallet_network_wallet(keyring, logger)
+
+    try:
+        stdout, _, returncode = Popen.run([
+            'kwallet-query',
+            '--read-password', f'{browser_keyring_name} Safe Storage',
+            '--folder', f'{browser_keyring_name} Keys',
+            network_wallet
+        ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+        if returncode:
+            logger.error(f'kwallet-query failed with return code {returncode}. '
+                         'Please consult the kwallet-query man page for details')
+            return b''
+        else:
+            if stdout.lower().startswith(b'failed to read'):
+                logger.debug('failed to read password from kwallet. Using empty string instead')
+                # this sometimes occurs in KDE because chrome does not check hasEntry and instead
+                # just tries to read the value (which kwallet returns "") whereas kwallet-query
+                # checks hasEntry. To verify this:
+                # dbus-monitor "interface='org.kde.KWallet'" "type=method_return"
+                # while starting chrome.
+                # this was identified as a bug later and fixed in
+                # https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/#F0
+                # https://chromium.googlesource.com/chromium/src/+/5463af3c39d7f5b6d11db7fbd51e38cc1974d764
+                return b''
+            else:
+                logger.debug('password found')
+                return stdout.rstrip(b'\n')
+    except Exception as e:
+        logger.warning(f'exception running kwallet-query: {error_to_str(e)}')
+        return b''
+
+
+def _get_gnome_keyring_password(browser_keyring_name, logger):
+    if not secretstorage:
+        logger.error(f'secretstorage not available {_SECRETSTORAGE_UNAVAILABLE_REASON}')
+        return b''
+    # the Gnome keyring does not seem to organise keys in the same way as KWallet,
+    # using `dbus-monitor` during startup, it can be observed that chromium lists all keys
+    # and presumably searches for its key in the list. It appears that we must do the same.
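+    # An illustrative aside (hypothetical variable name, not part of the upstream
+    # patch): whichever Linux backend supplies the password, the returned bytes
+    # are fed through the same derivation as LinuxChromeCookieDecryptor.derive_key,
+    # i.e. PBKDF2-SHA1 with Chromium's fixed Linux parameters:
+    #
+    #     import hashlib
+    #     v11_key = hashlib.pbkdf2_hmac('sha1', keyring_password, b'saltysalt', 1, 16)
+    #
+    # An empty or wrong password still yields *a* key, so a mismatch only surfaces
+    # later as a UTF-8 decode failure in _decrypt_aes_cbc_multi.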
+ # https://github.com/jaraco/keyring/issues/556 + with contextlib.closing(secretstorage.dbus_init()) as con: + col = secretstorage.get_default_collection(con) + for item in col.get_all_items(): + if item.get_label() == f'{browser_keyring_name} Safe Storage': + return item.get_secret() + else: + logger.error('failed to read from keyring') + return b'' + + +def _get_linux_keyring_password(browser_keyring_name, keyring, logger): + # note: chrome/chromium can be run with the following flags to determine which keyring backend + # it has chosen to use + # chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_ + # Chromium supports a flag: --password-store= so the automatic detection + # will not be sufficient in all cases. + + keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) + logger.debug(f'Chosen keyring: {keyring.name}') + + if keyring in (_LinuxKeyring.KWALLET, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): + return _get_kwallet_password(browser_keyring_name, keyring, logger) + elif keyring == _LinuxKeyring.GNOMEKEYRING: + return _get_gnome_keyring_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.BASICTEXT: + # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required) + return None + assert False, f'Unknown keyring {keyring}' + + +def _get_mac_keyring_password(browser_keyring_name, logger): + logger.debug('using find-generic-password to obtain password from OSX keychain') + try: + stdout, _, returncode = Popen.run( + ['security', 'find-generic-password', + '-w', # write password to stdout + '-a', browser_keyring_name, # match 'account' + '-s', f'{browser_keyring_name} Safe Storage'], # match 'service' + stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + if returncode: + logger.warning('find-generic-password failed') + return None + return stdout.rstrip(b'\n') + except Exception as e: + logger.warning(f'exception running find-generic-password: {error_to_str(e)}') + return None + + +def _get_windows_v10_key(browser_root, logger): + """ + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + """ + path = _newest(_find_files(browser_root, 'Local State', logger)) + if path is None: + logger.error('could not find local state file') + return None + logger.debug(f'Found local state file at "{path}"') + with open(path, encoding='utf8') as f: + data = json.load(f) + try: + # kOsCryptEncryptedKeyPrefName in [1] + base64_key = data['os_crypt']['encrypted_key'] + except KeyError: + logger.error('no encrypted key in Local State') + return None + encrypted_key = base64.b64decode(base64_key) + # kDPAPIKeyPrefix in [1] + prefix = b'DPAPI' + if not encrypted_key.startswith(prefix): + logger.error('invalid key') + return None + return _decrypt_windows_dpapi(encrypted_key[len(prefix):], logger) + + +def pbkdf2_sha1(password, salt, iterations, key_length): + return pbkdf2_hmac('sha1', password, salt, iterations, key_length) + + +def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): + for key in keys: + plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) + try: + return plaintext.decode() + except UnicodeDecodeError: + pass + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. 
Possibly the key is wrong?', only_once=True) + return None + + +def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): + try: + plaintext = aes_gcm_decrypt_and_verify_bytes(ciphertext, key, authentication_tag, nonce) + except ValueError: + logger.warning('failed to decrypt cookie (AES-GCM) because the MAC check failed. Possibly the key is wrong?', only_once=True) + return None + + try: + return plaintext.decode() + except UnicodeDecodeError: + logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) + return None + + +def _decrypt_windows_dpapi(ciphertext, logger): + """ + References: + - https://docs.microsoft.com/en-us/windows/win32/api/dpapi/nf-dpapi-cryptunprotectdata + """ + + import ctypes + import ctypes.wintypes + + class DATA_BLOB(ctypes.Structure): + _fields_ = [('cbData', ctypes.wintypes.DWORD), + ('pbData', ctypes.POINTER(ctypes.c_char))] + + buffer = ctypes.create_string_buffer(ciphertext) + blob_in = DATA_BLOB(ctypes.sizeof(buffer), buffer) + blob_out = DATA_BLOB() + ret = ctypes.windll.crypt32.CryptUnprotectData( + ctypes.byref(blob_in), # pDataIn + None, # ppszDataDescr: human readable description of pDataIn + None, # pOptionalEntropy: salt? + None, # pvReserved: must be NULL + None, # pPromptStruct: information about prompts to display + 0, # dwFlags + ctypes.byref(blob_out) # pDataOut + ) + if not ret: + logger.warning('failed to decrypt with DPAPI', only_once=True) + return None + + result = ctypes.string_at(blob_out.pbData, blob_out.cbData) + ctypes.windll.kernel32.LocalFree(blob_out.pbData) + return result + + +def _config_home(): + return os.environ.get('XDG_CONFIG_HOME', os.path.expanduser('~/.config')) + + +def _open_database_copy(database_path, tmpdir): + # cannot open sqlite databases if they are already in use (e.g. 
by the browser)
+    database_copy_path = os.path.join(tmpdir, 'temporary.sqlite')
+    shutil.copy(database_path, database_copy_path)
+    conn = sqlite3.connect(database_copy_path)
+    return conn.cursor()
+
+
+def _get_column_names(cursor, table_name):
+    table_info = cursor.execute(f'PRAGMA table_info({table_name})').fetchall()
+    return [row[1].decode() for row in table_info]
+
+
+def _newest(files):
+    return max(files, key=lambda path: os.lstat(path).st_mtime, default=None)
+
+
+def _find_files(root, filename, logger):
+    # if there are multiple browser profiles, take the most recently used one
+    i = 0
+    with _create_progress_bar(logger) as progress_bar:
+        for curr_root, _, files in os.walk(root):
+            for file in files:
+                i += 1
+                progress_bar.print(f'Searching for "{filename}": {i: 6d} files searched')
+                if file == filename:
+                    yield os.path.join(curr_root, file)
+
+
+def _merge_cookie_jars(jars):
+    output_jar = YoutubeDLCookieJar()
+    for jar in jars:
+        for cookie in jar:
+            output_jar.set_cookie(cookie)
+        if jar.filename is not None:
+            output_jar.filename = jar.filename
+    return output_jar
+
+
+def _is_path(value):
+    return any(sep in value for sep in (os.path.sep, os.path.altsep) if sep)
+
+
+def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None):
+    if browser_name not in SUPPORTED_BROWSERS:
+        raise ValueError(f'unsupported browser: "{browser_name}"')
+    if keyring not in (None, *SUPPORTED_KEYRINGS):
+        raise ValueError(f'unsupported keyring: "{keyring}"')
+    if profile is not None and _is_path(expand_path(profile)):
+        profile = expand_path(profile)
+    return browser_name, profile, keyring, container
+
+
+class LenientSimpleCookie(http.cookies.SimpleCookie):
+    """More lenient version of http.cookies.SimpleCookie"""
+    # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py
+    # We use Morsel's legal key chars to avoid errors on setting values
+    _LEGAL_KEY_CHARS = r'\w\d' + re.escape('!#$%&\'*+-.:^_`|~')
+    _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + re.escape('(),/<=>?@[]{}')
+
+    _RESERVED = {
+        "expires",
+        "path",
+        "comment",
+        "domain",
+        "max-age",
+        "secure",
+        "httponly",
+        "version",
+        "samesite",
+    }
+
+    _FLAGS = {"secure", "httponly"}
+
+    # Added 'bad' group to catch the remaining value
+    _COOKIE_PATTERN = re.compile(r"""
+        \s*                            # Optional whitespace at start of cookie
+        (?P<key>                       # Start of group 'key'
+        [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter
+        )                              # End of group 'key'
+        (                              # Optional group: there may not be a value.
+        \s*=\s*                        # Equal Sign
+        (                              # Start of potential value
+        (?P<val>                       # Start of group 'val'
+        "(?:[^\\"]|\\.)*"              # Any doublequoted string
+        |                              # or
+        \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT  # Special case for "expires" attr
+        |                              # or
+        [""" + _LEGAL_VALUE_CHARS + r"""]*     # Any word or empty string
+        )                              # End of group 'val'
+        |                              # or
+        (?P<bad>(?:\\;|[^;])*?)        # 'bad' group fallback for invalid values
+        )                              # End of potential value
+        )?                             # End of optional value group
+        \s*                            # Any number of spaces.
+        (\s+|;|$)                      # Ending either at space, semicolon, or EOS.
+ """, re.ASCII | re.VERBOSE) + + def load(self, data): + # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4776 + if not isinstance(data, str): + return super().load(data) + + morsel = None + for match in self._COOKIE_PATTERN.finditer(data): + if match.group('bad'): + morsel = None + continue + + key, value = match.group('key', 'val') + + is_attribute = False + if key.startswith('$'): + key = key[1:] + is_attribute = True + + lower_key = key.lower() + if lower_key in self._RESERVED: + if morsel is None: + continue + + if value is None: + if lower_key not in self._FLAGS: + morsel = None + continue + value = True + else: + value, _ = self.value_decode(value) + + morsel[key] = value + + elif is_attribute: + morsel = None + + elif value is not None: + morsel = self.get(key, http.cookies.Morsel()) + real_value, coded_value = self.value_decode(value) + morsel.set(key, real_value, coded_value) + self[key] = morsel + + else: + morsel = None + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. + +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard, ignore_expires): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, ignore_discard=True, ignore_expires=True): + """ + Save cookies to a file. 
+        Code is taken from CPython 3.6
+        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
+
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+        # Store session cookies with `expires` set to 0 instead of an empty string
+        for cookie in self:
+            if cookie.expires is None:
+                cookie.expires = 0
+
+        with self.open(filename, write=True) as f:
+            f.write(self._HEADER)
+            self._really_save(f, ignore_discard, ignore_expires)
+
+    def load(self, filename=None, ignore_discard=True, ignore_expires=True):
+        """Load cookies from a file."""
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+        def prepare_line(line):
+            if line.startswith(self._HTTPONLY_PREFIX):
+                line = line[len(self._HTTPONLY_PREFIX):]
+            # comments and empty lines are fine
+            if line.startswith('#') or not line.strip():
+                return line
+            cookie_list = line.split('\t')
+            if len(cookie_list) != self._ENTRY_LEN:
+                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
+            cookie = self._CookieFileEntry(*cookie_list)
+            if cookie.expires_at and not cookie.expires_at.isdigit():
+                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
+            return line
+
+        cf = io.StringIO()
+        with self.open(filename) as f:
+            for line in f:
+                try:
+                    cf.write(prepare_line(line))
+                except http.cookiejar.LoadError as e:
+                    if f'{line.strip()} '[0] in '[{"':
+                        raise http.cookiejar.LoadError(
+                            'Cookies file must be Netscape formatted, not JSON. See '
+                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
+                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
+                    continue
+        cf.seek(0)
+        self._really_load(cf, filename, ignore_discard, ignore_expires)
+        # Session cookies are denoted by either `expires` field set to
+        # an empty string or 0. MozillaCookieJar only recognizes the former
+        # (see [1]). So we need to force the latter to be recognized as session
+        # cookies on our own.
+        # Session cookies may be important for cookies-based authentication,
+        # e.g. usually, when user does not check 'Remember me' check box while
+        # logging in on a site, some important cookies are stored as session
+        # cookies so that not recognizing them will result in failed login.
+        # 1.
https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + def get_cookie_header(self, url): + """Generate a Cookie HTTP header for a given url""" + cookie_req = urllib.request.Request(normalize_url(sanitize_url(url))) + self.add_cookie_header(cookie_req) + return cookie_req.get_header('Cookie') + + def get_cookies_for_url(self, url): + """Generate a list of Cookie objects for a given url""" + # Policy `_now` attribute must be set before calling `_cookies_for_request` + # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 + self._policy._now = self._now = int(time.time()) + return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url)))) + + def clear(self, *args, **kwargs): + with contextlib.suppress(KeyError): + return super().clear(*args, **kwargs) diff --git a/yt_dlp/dependencies/Cryptodome.py b/yt_dlp/dependencies/Cryptodome.py new file mode 100644 index 0000000..2cfa4c9 --- /dev/null +++ b/yt_dlp/dependencies/Cryptodome.py @@ -0,0 +1,38 @@ +from ..compat.compat_utils import passthrough_module + +try: + import Cryptodome as _parent +except ImportError: + try: + import Crypto as _parent + except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python + _parent = passthrough_module(__name__, 'no_Cryptodome') + __bool__ = lambda: False + +del passthrough_module + +__version__ = '' +AES = PKCS1_v1_5 = Blowfish = PKCS1_OAEP = SHA1 = CMAC = RSA = None +try: + if _parent.__name__ == 'Cryptodome': + from Cryptodome import __version__ + from Cryptodome.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 + from Cryptodome.Hash import CMAC, SHA1 + from Cryptodome.PublicKey import RSA + elif _parent.__name__ == 'Crypto': + from Crypto import __version__ + from Crypto.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 # noqa: F401 + from Crypto.Hash import CMAC, SHA1 # noqa: F401 + from Crypto.PublicKey import RSA # noqa: F401 +except ImportError: + __version__ = f'broken {__version__}'.strip() + + +_yt_dlp__identifier = _parent.__name__ +if AES and _yt_dlp__identifier == 'Crypto': + try: + # In pycrypto, mode defaults to ECB. See: + # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode + AES.new(b'abcdefghijklmnop') + except TypeError: + _yt_dlp__identifier = 'pycrypto' diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py new file mode 100644 index 0000000..9e3f907 --- /dev/null +++ b/yt_dlp/dependencies/__init__.py @@ -0,0 +1,92 @@ +# flake8: noqa: F401 +"""Imports all optional dependencies for the project. +An attribute "_yt_dlp__identifier" may be inserted into the module if it uses an ambiguous namespace""" + +try: + import brotlicffi as brotli +except ImportError: + try: + import brotli + except ImportError: + brotli = None + + +try: + import certifi +except ImportError: + certifi = None +else: + from os.path import exists as _path_exists + + # The certificate may not be bundled in executable + if not _path_exists(certifi.where()): + certifi = None + + +try: + import mutagen +except ImportError: + mutagen = None + + +secretstorage = None +try: + import secretstorage + _SECRETSTORAGE_UNAVAILABLE_REASON = None +except ImportError: + _SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. 
' + 'Please install by running `python3 -m pip install secretstorage`') +except Exception as _err: + _SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}' + + +try: + import sqlite3 + # We need to get the underlying `sqlite` version, see https://github.com/yt-dlp/yt-dlp/issues/8152 + sqlite3._yt_dlp__version = sqlite3.sqlite_version +except ImportError: + # although sqlite3 is part of the standard library, it is possible to compile Python without + # sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544 + sqlite3 = None + + +try: + import websockets +except ImportError: + websockets = None + +try: + import urllib3 +except ImportError: + urllib3 = None + +try: + import requests +except ImportError: + requests = None + +try: + import xattr # xattr or pyxattr +except ImportError: + xattr = None +else: + if hasattr(xattr, 'set'): # pyxattr + xattr._yt_dlp__identifier = 'pyxattr' + + +from . import Cryptodome + +all_dependencies = {k: v for k, v in globals().items() if not k.startswith('_')} +available_dependencies = {k: v for k, v in all_dependencies.items() if v} + + +# Deprecated +Cryptodome_AES = Cryptodome.AES + + +__all__ = [ + 'all_dependencies', + 'available_dependencies', + *all_dependencies.keys(), +] diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py new file mode 100644 index 0000000..51a9f28 --- /dev/null +++ b/yt_dlp/downloader/__init__.py @@ -0,0 +1,131 @@ +from ..utils import NO_DEFAULT, determine_protocol + + +def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None, to_stdout=False): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + info_copy['to_stdout'] = to_stdout + + protocols = (protocol or info_copy['protocol']).split('+') + downloaders = [_get_suitable_downloader(info_copy, proto, params, default) for proto in protocols] + + if set(downloaders) == {FFmpegFD} and FFmpegFD.can_merge_formats(info_copy, params): + return FFmpegFD + elif (set(downloaders) == {DashSegmentsFD} + and not (to_stdout and len(protocols) > 1) + and set(protocols) == {'http_dash_segments_generator'}): + return DashSegmentsFD + elif len(downloaders) == 1: + return downloaders[0] + return None + + +# Some of these require get_suitable_downloader +from .common import FileDownloader +from .dash import DashSegmentsFD +from .external import FFmpegFD, get_external_downloader +from .f4m import F4mFD +from .fc2 import FC2LiveFD +from .hls import HlsFD +from .http import HttpFD +from .ism import IsmFD +from .mhtml import MhtmlFD +from .niconico import NiconicoDmcFD, NiconicoLiveFD +from .rtmp import RtmpFD +from .rtsp import RtspFD +from .websocket import WebSocketFragmentFD +from .youtube_live_chat import YoutubeLiveChatFD + +PROTOCOL_MAP = { + 'rtmp': RtmpFD, + 'rtmpe': RtmpFD, + 'rtmp_ffmpeg': FFmpegFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, + 'mms': RtspFD, + 'rtsp': RtspFD, + 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, + 'http_dash_segments_generator': DashSegmentsFD, + 'ism': IsmFD, + 'mhtml': MhtmlFD, + 'niconico_dmc': NiconicoDmcFD, + 'niconico_live': NiconicoLiveFD, + 'fc2_live': FC2LiveFD, + 'websocket_frag': WebSocketFragmentFD, + 'youtube_live_chat': YoutubeLiveChatFD, + 'youtube_live_chat_replay': YoutubeLiveChatFD, +} + + +def shorten_protocol_name(proto, simplify=False): + short_protocol_names = { + 'm3u8_native': 'm3u8', + 'm3u8': 'm3u8F', + 'rtmp_ffmpeg': 'rtmpF', + 'http_dash_segments': 'dash', + 'http_dash_segments_generator': 
'dashG',
+        'niconico_dmc': 'dmc',
+        'websocket_frag': 'WSfrag',
+    }
+    if simplify:
+        short_protocol_names.update({
+            'https': 'http',
+            'ftps': 'ftp',
+            'm3u8': 'm3u8',  # Reverse above m3u8 mapping
+            'm3u8_native': 'm3u8',
+            'http_dash_segments_generator': 'dash',
+            'rtmp_ffmpeg': 'rtmp',
+            'm3u8_frag_urls': 'm3u8',
+            'dash_frag_urls': 'dash',
+        })
+    return short_protocol_names.get(proto, proto)
+
+
+def _get_suitable_downloader(info_dict, protocol, params, default):
+    """Get the downloader class that can handle the info dict."""
+    if default is NO_DEFAULT:
+        default = HttpFD
+
+    if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict):
+        return FFmpegFD
+
+    info_dict['protocol'] = protocol
+    downloaders = params.get('external_downloader')
+    external_downloader = (
+        downloaders if isinstance(downloaders, str) or downloaders is None
+        else downloaders.get(shorten_protocol_name(protocol, True), downloaders.get('default')))
+
+    if external_downloader is None:
+        if info_dict['to_stdout'] and FFmpegFD.can_merge_formats(info_dict, params):
+            return FFmpegFD
+    elif external_downloader.lower() != 'native':
+        ed = get_external_downloader(external_downloader)
+        if ed.can_download(info_dict, external_downloader):
+            return ed
+
+    if protocol == 'http_dash_segments':
+        if info_dict.get('is_live') and (external_downloader or '').lower() != 'native':
+            return FFmpegFD
+
+    if protocol in ('m3u8', 'm3u8_native'):
+        if info_dict.get('is_live'):
+            return FFmpegFD
+        elif (external_downloader or '').lower() == 'native':
+            return HlsFD
+        elif protocol == 'm3u8_native' and get_suitable_downloader(
+                info_dict, params, None, protocol='m3u8_frag_urls', to_stdout=info_dict['to_stdout']):
+            return HlsFD
+        elif params.get('hls_prefer_native') is True:
+            return HlsFD
+        elif params.get('hls_prefer_native') is False:
+            return FFmpegFD
+
+    return PROTOCOL_MAP.get(protocol, default)
+
+
+__all__ = [
+    'FileDownloader',
+    'get_suitable_downloader',
+    'shorten_protocol_name',
+]
diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py
new file mode 100644
index 0000000..b71d7ee
--- /dev/null
+++ b/yt_dlp/downloader/common.py
@@ -0,0 +1,486 @@
+import contextlib
+import errno
+import functools
+import os
+import random
+import re
+import time
+
+from ..minicurses import (
+    BreaklineStatusPrinter,
+    MultilineLogger,
+    MultilinePrinter,
+    QuietMultilinePrinter,
+)
+from ..utils import (
+    IDENTITY,
+    NO_DEFAULT,
+    LockingUnsupportedError,
+    Namespace,
+    RetryManager,
+    classproperty,
+    decodeArgument,
+    deprecation_warning,
+    encodeFilename,
+    format_bytes,
+    join_nonempty,
+    parse_bytes,
+    remove_start,
+    sanitize_open,
+    shell_quote,
+    timeconvert,
+    timetuple_from_msec,
+    try_call,
+)
+
+
+class FileDownloader:
+    """File Downloader class.
+
+    File downloader objects are the ones responsible for downloading the
+    actual video file and writing it to disk.
+
+    File downloaders accept a lot of parameters. In order not to saturate
+    the object constructor with arguments, it receives a dictionary of
+    options instead.
+
+    Available options:
+
+    verbose:            Print additional info to stdout.
+    quiet:              Do not print messages to stdout.
+    ratelimit:          Download speed limit, in bytes/sec.
+    throttledratelimit: Assume the download is being throttled below this speed (bytes/sec)
+    retries:            Number of times to retry for expected network errors.
+ Default is 0 for API, but 10 for CLI + file_access_retries: Number of times to retry on file access error (default: 3) + buffersize: Size of download buffer in bytes. + noresizebuffer: Do not automatically resize the download buffer. + continuedl: Try to continue downloads if possible. + noprogress: Do not print the progress bar. + nopart: Do not use temporary .part files. + updatetime: Use the Last-modified header to set output file timestamps. + test: Download only first bytes to test the downloader. + min_filesize: Skip files smaller than this size + max_filesize: Skip files larger than this size + xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. + external_downloader_args: A dictionary of downloader keys (in lower case) + and a list of additional command-line arguments for the + executable. Use 'default' as the name for arguments to be + passed to all downloaders. For compatibility with youtube-dl, + a single list of args can also be used + hls_use_mpegts: Use the mpegts container for HLS videos. + http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be + useful for bypassing bandwidth throttling imposed by + a webserver (experimental) + progress_template: See YoutubeDL.py + retry_sleep_functions: See YoutubeDL.py + + Subclasses of this one must re-define the real_download method. + """ + + _TEST_FILE_SIZE = 10241 + params = None + + def __init__(self, ydl, params): + """Create a FileDownloader object with the given options.""" + self._set_ydl(ydl) + self._progress_hooks = [] + self.params = params + self._prepare_multiline_status() + self.add_progress_hook(self.report_progress) + + def _set_ydl(self, ydl): + self.ydl = ydl + + for func in ( + 'deprecation_warning', + 'deprecated_feature', + 'report_error', + 'report_file_already_downloaded', + 'report_warning', + 'to_console_title', + 'to_stderr', + 'trouble', + 'write_debug', + ): + if not hasattr(self, func): + setattr(self, func, getattr(ydl, func)) + + def to_screen(self, *args, **kargs): + self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) + + __to_screen = to_screen + + @classproperty + def FD_NAME(cls): + return re.sub(r'(?<=[a-z])(?=[A-Z])', '_', cls.__name__[:-2]).lower() + + @staticmethod + def format_seconds(seconds): + if seconds is None: + return ' Unknown' + time = timetuple_from_msec(seconds * 1000) + if time.hours > 99: + return '--:--:--' + return '%02d:%02d:%02d' % time[:-1] + + @classmethod + def format_eta(cls, seconds): + return f'{remove_start(cls.format_seconds(seconds), "00:"):>8s}' + + @staticmethod + def calc_percent(byte_counter, data_len): + if data_len is None: + return None + return float(byte_counter) / float(data_len) * 100.0 + + @staticmethod + def format_percent(percent): + return ' N/A%' if percent is None else f'{percent:>5.1f}%' + + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT): + if total is NO_DEFAULT: + rate, remaining = start_or_rate, now_or_remaining + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + + start, now = start_or_rate, now_or_remaining + if total is None: + return None + if now is None: + now = time.time() + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) + + @staticmethod + def calc_speed(start, now, bytes): + dif = now - start + if bytes == 0 or dif < 0.001: # One millisecond + return None + return float(bytes) / dif + + @staticmethod + def format_speed(speed): + return ' 
Unknown B/s' if speed is None else f'{format_bytes(speed):>10s}/s' + + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else int(retries) + + @staticmethod + def filesize_or_none(unencoded_filename): + if os.path.isfile(unencoded_filename): + return os.path.getsize(unencoded_filename) + return 0 + + @staticmethod + def best_block_size(elapsed_time, bytes): + new_min = max(bytes / 2.0, 1.0) + new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB + if elapsed_time < 0.001: + return int(new_max) + rate = bytes / elapsed_time + if rate > new_max: + return int(new_max) + if rate < new_min: + return int(new_min) + return int(rate) + + @staticmethod + def parse_bytes(bytestr): + """Parse a string indicating a byte quantity into an integer.""" + deprecation_warning('yt_dlp.FileDownloader.parse_bytes is deprecated and ' + 'may be removed in the future. Use yt_dlp.utils.parse_bytes instead') + return parse_bytes(bytestr) + + def slow_down(self, start_time, now, byte_counter): + """Sleep if the download speed is over the rate limit.""" + rate_limit = self.params.get('ratelimit') + if rate_limit is None or byte_counter == 0: + return + if now is None: + now = time.time() + elapsed = now - start_time + if elapsed <= 0.0: + return + speed = float(byte_counter) / elapsed + if speed > rate_limit: + sleep_time = float(byte_counter) / rate_limit - elapsed + if sleep_time > 0: + time.sleep(sleep_time) + + def temp_name(self, filename): + """Returns a temporary filename for the given filename.""" + if self.params.get('nopart', False) or filename == '-' or \ + (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): + return filename + return filename + '.part' + + def undo_temp_name(self, filename): + if filename.endswith('.part'): + return filename[:-len('.part')] + return filename + + def ytdl_filename(self, filename): + return filename + '.ytdl' + + def wrap_file_access(action, *, fatal=False): + def error_callback(err, count, retries, *, fd): + return RetryManager.report_retry( + err, count, retries, info=fd.__to_screen, + warn=lambda e: (time.sleep(0.01), fd.to_screen(f'[download] Unable to {action} file: {e}')), + error=None if fatal else lambda e: fd.report_error(f'Unable to {action} file: {e}'), + sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) + + def wrapper(self, func, *args, **kwargs): + for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self): + try: + return func(self, *args, **kwargs) + except OSError as err: + if err.errno in (errno.EACCES, errno.EINVAL): + retry.error = err + continue + retry.error_callback(err, 1, 0) + + return functools.partial(functools.partialmethod, wrapper) + + @wrap_file_access('open', fatal=True) + def sanitize_open(self, filename, open_mode): + f, filename = sanitize_open(filename, open_mode) + if not getattr(f, 'locked', None): + self.write_debug(f'{LockingUnsupportedError.msg}. 
Proceeding without locking', only_once=True) + return f, filename + + @wrap_file_access('remove') + def try_remove(self, filename): + if os.path.isfile(filename): + os.remove(filename) + + @wrap_file_access('rename') + def try_rename(self, old_filename, new_filename): + if old_filename == new_filename: + return + os.replace(old_filename, new_filename) + + def try_utime(self, filename, last_modified_hdr): + """Try to set the last-modified time of the given file.""" + if last_modified_hdr is None: + return + if not os.path.isfile(encodeFilename(filename)): + return + timestr = last_modified_hdr + if timestr is None: + return + filetime = timeconvert(timestr) + if filetime is None: + return filetime + # Ignore obviously invalid dates + if filetime == 0: + return + with contextlib.suppress(Exception): + os.utime(filename, (time.time(), filetime)) + return filetime + + def report_destination(self, filename): + """Report destination filename.""" + self.to_screen('[download] Destination: ' + filename) + + def _prepare_multiline_status(self, lines=1): + if self.params.get('noprogress'): + self._multiline = QuietMultilinePrinter() + elif self.ydl.params.get('logger'): + self._multiline = MultilineLogger(self.ydl.params['logger'], lines) + elif self.params.get('progress_with_newline'): + self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) + else: + self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) + self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color' + self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out + + def _finish_multiline_status(self): + self._multiline.end() + + ProgressStyles = Namespace( + downloaded_bytes='light blue', + percent='light blue', + eta='yellow', + speed='green', + elapsed='bold white', + total_bytes='', + total_bytes_estimate='', + ) + + def _report_progress_status(self, s, default_template): + for name, style in self.ProgressStyles.items_: + name = f'_{name}_str' + if name not in s: + continue + s[name] = self._format_progress(s[name], style) + s['_default_template'] = default_template % s + + progress_dict = s.copy() + progress_dict.pop('info_dict') + progress_dict = {'info': s['info_dict'], 'progress': progress_dict} + + progress_template = self.params.get('progress_template', {}) + self._multiline.print_at_line(self.ydl.evaluate_outtmpl( + progress_template.get('download') or '[download] %(progress._default_template)s', + progress_dict), s.get('progress_idx') or 0) + self.to_console_title(self.ydl.evaluate_outtmpl( + progress_template.get('download-title') or 'yt-dlp %(progress._default_template)s', + progress_dict)) + + def _format_progress(self, *args, **kwargs): + return self.ydl._format_text( + self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) + + def report_progress(self, s): + def with_fields(*tups, default=''): + for *fields, tmpl in tups: + if all(s.get(f) is not None for f in fields): + return tmpl + return default + + _format_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' + + if s['status'] == 'finished': + if self.params.get('noprogress'): + self.to_screen('[download] Download completed') + speed = try_call(lambda: s['total_bytes'] / s['elapsed']) + s.update({ + 'speed': speed, + '_speed_str': self.format_speed(speed).strip(), + '_total_bytes_str': _format_bytes('total_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent_str': self.format_percent(100), + }) + 
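+            # Worked example (assumed numbers, not part of the upstream patch):
+            # with total_bytes=10 * 1024**2 and elapsed=4.0, speed computes to
+            # 2.5 MiB/s and the default template assembled just below renders
+            # roughly as
+            #
+            #     [download] 100% of 10.00MiB in 00:00:04 at 2.50MiB/s
+            #
+            # (exact padding comes from format_bytes/format_seconds/format_speed)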
self._report_progress_status(s, join_nonempty( + '100%%', + with_fields(('total_bytes', 'of %(_total_bytes_str)s')), + with_fields(('elapsed', 'in %(_elapsed_str)s')), + with_fields(('speed', 'at %(_speed_str)s')), + delim=' ')) + + if s['status'] != 'downloading': + return + + s.update({ + '_eta_str': self.format_eta(s.get('eta')).strip(), + '_speed_str': self.format_speed(s.get('speed')), + '_percent_str': self.format_percent(try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0)), + '_total_bytes_str': _format_bytes('total_bytes'), + '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), + '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + }) + + msg_template = with_fields( + ('total_bytes', '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('total_bytes_estimate', '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('downloaded_bytes', 'elapsed', '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'), + ('downloaded_bytes', '%(_downloaded_bytes_str)s at %(_speed_str)s'), + default='%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s') + + msg_template += with_fields( + ('fragment_index', 'fragment_count', ' (frag %(fragment_index)s/%(fragment_count)s)'), + ('fragment_index', ' (frag %(fragment_index)s)')) + self._report_progress_status(s, msg_template) + + def report_resuming_byte(self, resume_len): + """Report attempt to resume at given byte.""" + self.to_screen('[download] Resuming download at byte %s' % resume_len) + + def report_retry(self, err, count, retries, frag_index=NO_DEFAULT, fatal=True): + """Report retry""" + is_frag = False if frag_index is NO_DEFAULT else 'fragment' + RetryManager.report_retry( + err, count, retries, info=self.__to_screen, + warn=lambda msg: self.__to_screen(f'[download] Got error: {msg}'), + error=IDENTITY if not fatal else lambda e: self.report_error(f'\r[download] Got error: {e}'), + sleep_func=self.params.get('retry_sleep_functions', {}).get(is_frag or 'http'), + suffix=f'fragment{"s" if frag_index is None else f" {frag_index}"}' if is_frag else None) + + def report_unable_to_resume(self): + """Report it was impossible to resume download.""" + self.to_screen('[download] Unable to resume') + + @staticmethod + def supports_manifest(manifest): + """ Whether the downloader can download the fragments from the manifest. + Redefine in subclasses if needed. 
""" + pass + + def download(self, filename, info_dict, subtitle=False): + """Download to a filename using the info from info_dict + Return True on success and False otherwise + """ + nooverwrites_and_exists = ( + not self.params.get('overwrites', True) + and os.path.exists(encodeFilename(filename)) + ) + + if not hasattr(filename, 'write'): + continuedl_and_exists = ( + self.params.get('continuedl', True) + and os.path.isfile(encodeFilename(filename)) + and not self.params.get('nopart', False) + ) + + # Check file already present + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): + self.report_file_already_downloaded(filename) + self._hook_progress({ + 'filename': filename, + 'status': 'finished', + 'total_bytes': os.path.getsize(encodeFilename(filename)), + }, info_dict) + self._finish_multiline_status() + return True, False + + if subtitle: + sleep_interval = self.params.get('sleep_interval_subtitles') or 0 + else: + min_sleep_interval = self.params.get('sleep_interval') or 0 + sleep_interval = random.uniform( + min_sleep_interval, self.params.get('max_sleep_interval') or min_sleep_interval) + if sleep_interval > 0: + self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...') + time.sleep(sleep_interval) + + ret = self.real_download(filename, info_dict) + self._finish_multiline_status() + return ret, True + + def real_download(self, filename, info_dict): + """Real download process. Redefine in subclasses.""" + raise NotImplementedError('This method must be implemented by subclasses') + + def _hook_progress(self, status, info_dict): + # Ideally we want to make a copy of the dict, but that is too slow + status['info_dict'] = info_dict + # youtube-dl passes the same status object to all the hooks. + # Some third party scripts seems to be relying on this. + # So keep this behavior if possible + for ph in self._progress_hooks: + ph(status) + + def add_progress_hook(self, ph): + # See YoutubeDl.py (search for progress_hooks) for a description of + # this interface + self._progress_hooks.append(ph) + + def _debug_cmd(self, args, exe=None): + if not self.params.get('verbose', False): + return + + str_args = [decodeArgument(a) for a in args] + + if exe is None: + exe = os.path.basename(str_args[0]) + + self.write_debug(f'{exe} command line: {shell_quote(str_args)}') diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py new file mode 100644 index 0000000..afc79b6 --- /dev/null +++ b/yt_dlp/downloader/dash.py @@ -0,0 +1,90 @@ +import time +import urllib.parse + +from . import get_suitable_downloader +from .fragment import FragmentFD +from ..utils import update_url_query, urljoin + + +class DashSegmentsFD(FragmentFD): + """ + Download segments in a DASH manifest. 
External downloaders can take over + the fragment downloads by supporting the 'dash_frag_urls' protocol + """ + + FD_NAME = 'dashsegments' + + def real_download(self, filename, info_dict): + if 'http_dash_segments_generator' in info_dict['protocol'].split('+'): + real_downloader = None # No external FD can support --live-from-start + else: + if info_dict.get('is_live'): + self.report_error('Live DASH videos are not supported') + real_downloader = get_suitable_downloader( + info_dict, self.params, None, protocol='dash_frag_urls', to_stdout=(filename == '-')) + + real_start = time.time() + + requested_formats = [{**info_dict, **fmt} for fmt in info_dict.get('requested_formats', [])] + args = [] + for fmt in requested_formats or [info_dict]: + try: + fragment_count = 1 if self.params.get('test') else len(fmt['fragments']) + except TypeError: + fragment_count = None + ctx = { + 'filename': fmt.get('filepath') or filename, + 'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'), + 'total_frags': fragment_count, + } + + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, fmt) + ctx['start'] = real_start + + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + + fragments_to_download = self._get_fragments(fmt, ctx, extra_query) + + if real_downloader: + self.to_screen( + f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') + info_dict['fragments'] = list(fragments_to_download) + fd = real_downloader(self.ydl, self.params) + return fd.real_download(filename, info_dict) + + args.append([ctx, fragments_to_download, fmt]) + + return self.download_and_append_fragments_multiple(*args, is_fatal=lambda idx: idx == 0) + + def _resolve_fragments(self, fragments, ctx): + fragments = fragments(ctx) if callable(fragments) else fragments + return [next(iter(fragments))] if self.params.get('test') else fragments + + def _get_fragments(self, fmt, ctx, extra_query): + fragment_base_url = fmt.get('fragment_base_url') + fragments = self._resolve_fragments(fmt['fragments'], ctx) + + frag_index = 0 + for i, fragment in enumerate(fragments): + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + if extra_query: + fragment_url = update_url_query(fragment_url, extra_query) + + yield { + 'frag_index': frag_index, + 'fragment_count': fragment.get('fragment_count'), + 'index': i, + 'url': fragment_url, + } diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py new file mode 100644 index 0000000..ce5eeb0 --- /dev/null +++ b/yt_dlp/downloader/external.py @@ -0,0 +1,664 @@ +import enum +import json +import os +import re +import subprocess +import sys +import tempfile +import time +import uuid + +from .fragment import FragmentFD +from ..compat import functools +from ..networking import Request +from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor +from ..utils import ( + Popen, + RetryManager, + _configuration_args, + check_executable, + classproperty, + cli_bool_option, + cli_option, + cli_valueless_option, + determine_ext, + encodeArgument, + encodeFilename, + find_available_port, + remove_end, + traverse_obj, +) + + +class Features(enum.Enum): + 
TO_STDOUT = enum.auto() + MULTIPLE_FORMATS = enum.auto() + + +class ExternalFD(FragmentFD): + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') + SUPPORTED_FEATURES = () + _CAPTURE_STDERR = True + + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + self._cookies_tempfile = None + + try: + started = time.time() + retval = self._call_downloader(tmpfilename, info_dict) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + # Live stream downloading cancellation should be considered as + # correct and expected termination thus all postprocessing + # should take place + retval = 0 + self.to_screen('[%s] Interrupted by user' % self.get_basename()) + finally: + if self._cookies_tempfile: + self.try_remove(self._cookies_tempfile) + + if retval == 0: + status = { + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + } + if filename != '-': + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.try_rename(tmpfilename, filename) + status.update({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + }) + self._hook_progress(status, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % ( + self.get_basename(), retval)) + return False + + @classmethod + def get_basename(cls): + return cls.__name__[:-2].lower() + + @classproperty + def EXE_NAME(cls): + return cls.get_basename() + + @functools.cached_property + def exe(self): + return self.EXE_NAME + + @classmethod + def available(cls, path=None): + path = check_executable( + cls.EXE_NAME if path in (None, cls.get_basename()) else path, + [cls.AVAILABLE_OPT]) + if not path: + return False + cls.exe = path + return path + + @classmethod + def supports(cls, info_dict): + return all(( + not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, + '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, + not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'), + all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), + )) + + @classmethod + def can_download(cls, info_dict, path=None): + return cls.available(path) and cls.supports(info_dict) + + def _option(self, command_option, param): + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) + + def _configuration_args(self, keys=None, *args, **kwargs): + return _configuration_args( + self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME, + keys, *args, **kwargs) + + def _write_cookies(self): + if not self.ydl.cookiejar.filename: + tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False) + tmp_cookies.close() + self._cookies_tempfile = tmp_cookies.name + self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"') + # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename + self.ydl.cookiejar.save(self._cookies_tempfile) + return self.ydl.cookiejar.filename or self._cookies_tempfile + + def _call_downloader(self, tmpfilename, info_dict): + """ Either 
overwrite this or implement _make_cmd """ + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + if 'fragments' not in info_dict: + _, stderr, returncode = self._call_process(cmd, info_dict) + if returncode and stderr: + self.to_stderr(stderr) + return returncode + + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, + frag_index=None, fatal=not skip_unavailable_fragments) + for retry in retry_manager: + _, stderr, returncode = self._call_process(cmd, info_dict) + if not returncode: + break + # TODO: Decide whether to retry based on error code + # https://aria2.github.io/manual/en/html/aria2c.html#exit-status + if stderr: + self.to_stderr(stderr) + retry.error = Exception() + continue + if not skip_unavailable_fragments and retry_manager.error: + return -1 + + decrypt_fragment = self.decrypter(info_dict) + dest, _ = self.sanitize_open(tmpfilename, 'wb') + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) + try: + src, _ = self.sanitize_open(fragment_filename, 'rb') + except OSError as err: + if skip_unavailable_fragments and frag_index > 1: + self.report_skip_fragment(frag_index, err) + continue + self.report_error(f'Unable to open fragment {frag_index}; {err}') + return -1 + dest.write(decrypt_fragment(fragment, src.read())) + src.close() + if not self.params.get('keep_fragments', False): + self.try_remove(encodeFilename(fragment_filename)) + dest.close() + self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename)) + return 0 + + def _call_process(self, cmd, info_dict): + return Popen.run(cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + + +class CurlFD(ExternalFD): + AVAILABLE_OPT = '-V' + _CAPTURE_STDERR = False # curl writes the progress to stderr + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['--cookie', cookie_header] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', f'{key}: {val}'] + + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry + cmd += self._option('--max-filesize', 'max_filesize') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class AxelFD(ExternalFD): + AVAILABLE_OPT = '-V' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', f'{key}: {val}'] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['-H', f'Cookie: {cookie_header}', '--max-redirect=0'] + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class 
WgetFD(ExternalFD): + AVAILABLE_OPT = '--version' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--load-cookies', self._write_cookies()] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', f'{key}: {val}'] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry + cmd += self._option('--bind-address', 'source_address') + proxy = self.params.get('proxy') + if proxy: + for var in ('http_proxy', 'https_proxy'): + cmd += ['--execute', f'{var}={proxy}'] + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class Aria2cFD(ExternalFD): + AVAILABLE_OPT = '-v' + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'dash_frag_urls', 'm3u8_frag_urls') + + @staticmethod + def supports_manifest(manifest): + UNSUPPORTED_FEATURES = [ + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [1] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + ] + check_results = (not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + return all(check_results) + + @staticmethod + def _aria2c_filename(fn): + return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}' + + def _call_downloader(self, tmpfilename, info_dict): + # FIXME: Disabled due to https://github.com/yt-dlp/yt-dlp/issues/5931 + if False and 'no-external-downloader-progress' not in self.params.get('compat_opts', []): + info_dict['__rpc'] = { + 'port': find_available_port() or 19190, + 'secret': str(uuid.uuid4()), + } + return super()._call_downloader(tmpfilename, info_dict) + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-c', '--no-conf', + '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', + '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] + if 'fragments' in info_dict: + cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true'] + else: + cmd += ['--min-split-size', '1M'] + + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += [f'--load-cookies={self._write_cookies()}'] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', f'{key}: {val}'] + cmd += self._option('--max-overall-download-limit', 'ratelimit') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') + cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') + cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') + cmd += self._configuration_args() + + if '__rpc' in info_dict: + cmd += [ + '--enable-rpc', + f'--rpc-listen-port={info_dict["__rpc"]["port"]}', + f'--rpc-secret={info_dict["__rpc"]["secret"]}'] + + # aria2c strips out spaces from the beginning/end of filenames and paths. + # We work around this issue by adding a "./" to the beginning of the + # filename and relative path, and adding a "/" at the end of the path. 
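A quick illustration of the space-stripping workaround described above (mirroring `Aria2cFD._aria2c_filename`); the file names are hypothetical:

import os

def aria2c_filename(fn):
    # Anchor relative paths with './' so aria2c cannot strip edge spaces
    return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}'

aria2c_filename(' padded name.mp4 ')   # -> './ padded name.mp4 ' (on POSIX)
aria2c_filename('/abs/video.mp4')      # -> '/abs/video.mp4' (unchanged)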
+ # See: https://github.com/yt-dlp/yt-dlp/issues/276 + # https://github.com/ytdl-org/youtube-dl/issues/20312 + # https://github.com/aria2/aria2/issues/1373 + dn = os.path.dirname(tmpfilename) + if dn: + cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep] + if 'fragments' not in info_dict: + cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))] + cmd += ['--auto-file-renaming=false'] + + if 'fragments' in info_dict: + cmd += ['--uri-selector=inorder'] + url_list_file = '%s.frag.urls' % tmpfilename + url_list = [] + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) + url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename))) + stream, _ = self.sanitize_open(url_list_file, 'wb') + stream.write('\n'.join(url_list).encode()) + stream.close() + cmd += ['-i', self._aria2c_filename(url_list_file)] + else: + cmd += ['--', info_dict['url']] + return cmd + + def aria2c_rpc(self, rpc_port, rpc_secret, method, params=()): + # Does not actually need to be UUID, just unique + sanitycheck = str(uuid.uuid4()) + d = json.dumps({ + 'jsonrpc': '2.0', + 'id': sanitycheck, + 'method': method, + 'params': [f'token:{rpc_secret}', *params], + }).encode('utf-8') + request = Request( + f'http://localhost:{rpc_port}/jsonrpc', + data=d, headers={ + 'Content-Type': 'application/json', + 'Content-Length': f'{len(d)}', + }, proxies={'all': None}) + with self.ydl.urlopen(request) as r: + resp = json.load(r) + assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server' + return resp['result'] + + def _call_process(self, cmd, info_dict): + if '__rpc' not in info_dict: + return super()._call_process(cmd, info_dict) + + send_rpc = functools.partial(self.aria2c_rpc, info_dict['__rpc']['port'], info_dict['__rpc']['secret']) + started = time.time() + + fragmented = 'fragments' in info_dict + frag_count = len(info_dict['fragments']) if fragmented else 1 + status = { + 'filename': info_dict.get('_filename'), + 'status': 'downloading', + 'elapsed': 0, + 'downloaded_bytes': 0, + 'fragment_count': frag_count if fragmented else None, + 'fragment_index': 0 if fragmented else None, + } + self._hook_progress(status, info_dict) + + def get_stat(key, *obj, average=False): + val = tuple(filter(None, map(float, traverse_obj(obj, (..., ..., key))))) or [0] + return sum(val) / (len(val) if average else 1) + + with Popen(cmd, text=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) as p: + # Add a small sleep so that RPC client can receive response, + # or the connection stalls infinitely + time.sleep(0.2) + retval = p.poll() + while retval is None: + # We don't use tellStatus as we won't know the GID without reading stdout + # Ref: https://aria2.github.io/manual/en/html/aria2c.html#aria2.tellActive + active = send_rpc('aria2.tellActive') + completed = send_rpc('aria2.tellStopped', [0, frag_count]) + + downloaded = get_stat('totalLength', completed) + get_stat('completedLength', active) + speed = get_stat('downloadSpeed', active) + total = frag_count * get_stat('totalLength', active, completed, average=True) + if total < downloaded: + total = None + + status.update({ + 'downloaded_bytes': int(downloaded), + 'speed': speed, + 'total_bytes': None if fragmented else total, + 'total_bytes_estimate': total, + 'eta': (total - downloaded) / (speed or 1), + 'fragment_index': min(frag_count, len(completed) + 1) if fragmented else None, + 'elapsed': time.time() - started + }) + 
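For fragmented downloads, `Aria2cFD._make_cmd` above hands aria2c an input file in which each entry is a URI followed by an indented `out=` option line (see the input-file section of the aria2c manual). A sketch of what that `.frag.urls` file contains, with made-up URLs:

fragments = [{'url': f'https://cdn.example.com/frag{i}.ts'} for i in range(2)]
url_list = ['%s\n\tout=%s' % (frag['url'], 'video.part-Frag%d' % i)
            for i, frag in enumerate(fragments)]
print('\n'.join(url_list))
# https://cdn.example.com/frag0.ts
# 	out=video.part-Frag0
# https://cdn.example.com/frag1.ts
# 	out=video.part-Frag1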
self._hook_progress(status, info_dict) + + if not active and len(completed) >= frag_count: + send_rpc('aria2.shutdown') + retval = p.wait() + break + + time.sleep(0.1) + retval = p.poll() + + return '', p.stderr.read(), retval + + +class HttpieFD(ExternalFD): + AVAILABLE_OPT = '--version' + EXE_NAME = 'http' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += [f'{key}:{val}'] + + # httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1] + # If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2] + # 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq + # 2: https://httpie.io/docs/cli/sessions + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += [f'Cookie:{cookie_header}'] + return cmd + + +class FFmpegFD(ExternalFD): + SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'm3u8', 'm3u8_native', 'rtsp', 'rtmp', 'rtmp_ffmpeg', 'mms', 'http_dash_segments') + SUPPORTED_FEATURES = (Features.TO_STDOUT, Features.MULTIPLE_FORMATS) + + @classmethod + def available(cls, path=None): + # TODO: Fix path for ffmpeg + # Fixme: This may be wrong when --ffmpeg-location is used + return FFmpegPostProcessor().available + + def on_process_started(self, proc, stdin): + """ Override this in subclasses """ + pass + + @classmethod + def can_merge_formats(cls, info_dict, params): + return ( + info_dict.get('requested_formats') + and info_dict.get('protocol') + and not params.get('allow_unplayable_formats') + and 'no-direct-merge' not in params.get('compat_opts', []) + and cls.can_download(info_dict)) + + def _call_downloader(self, tmpfilename, info_dict): + ffpp = FFmpegPostProcessor(downloader=self) + if not ffpp.available: + self.report_error('m3u8 download detected but ffmpeg could not be found. Please install') + return False + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + if not self.params.get('verbose'): + args += ['-hide_banner'] + + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[]) + + # These exists only for compatibility. Extractors should use + # info_dict['downloader_options']['ffmpeg_args'] instead + args += info_dict.get('_ffmpeg_args') or [] + seekable = info_dict.get('_seekable') + if seekable is not None: + # setting -seekable prevents ffmpeg from guessing if the server + # supports seeking(by adding the header `Range: bytes=0-`), which + # can cause problems in some cases + # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127 + # http://trac.ffmpeg.org/ticket/6125#comment:10 + args += ['-seekable', '1' if seekable else '0'] + + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' 
% self.get_basename()) + + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + env['HTTP_PROXY'] = proxy + env['http_proxy'] = proxy + + protocol = info_dict.get('protocol') + + if protocol == 'rtmp': + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + conn = info_dict.get('rtmp_conn') + if player_url is not None: + args += ['-rtmp_swfverify', player_url] + if page_url is not None: + args += ['-rtmp_pageurl', page_url] + if app is not None: + args += ['-rtmp_app', app] + if play_path is not None: + args += ['-rtmp_playpath', play_path] + if tc_url is not None: + args += ['-rtmp_tcurl', tc_url] + if flash_version is not None: + args += ['-rtmp_flashver', flash_version] + if live: + args += ['-rtmp_live', 'live'] + if isinstance(conn, list): + for entry in conn: + args += ['-rtmp_conn', entry] + elif isinstance(conn, str): + args += ['-rtmp_conn', conn] + + start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end') + + selected_formats = info_dict.get('requested_formats') or [info_dict] + for i, fmt in enumerate(selected_formats): + is_http = re.match(r'^https?://', fmt['url']) + cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] + if cookies: + args.extend(['-cookies', ''.join( + f'{cookie.name}={cookie.value}; path={cookie.path}; domain={cookie.domain};\r\n' + for cookie in cookies)]) + if fmt.get('http_headers') and is_http: + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
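Both the `-cookies` blob above and the `-headers` blob built just below are single strings of raw header/cookie lines, each terminated by CRLF. A tiny illustration with made-up header values:

http_headers = {'User-Agent': 'yt-dlp', 'Referer': 'https://example.com/'}
blob = ''.join(f'{key}: {val}\r\n' for key, val in http_headers.items())
# 'User-Agent: yt-dlp\r\nReferer: https://example.com/\r\n'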
+ args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) + + if start_time: + args += ['-ss', str(start_time)] + if end_time: + args += ['-t', str(end_time - start_time)] + + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', fmt['url']] + + if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'): + args += ['-c', 'copy'] + + if info_dict.get('requested_formats') or protocol == 'http_dash_segments': + for i, fmt in enumerate(selected_formats): + stream_number = fmt.get('manifest_stream_number', 0) + args.extend(['-map', f'{i}:{stream_number}']) + + if self.params.get('test', False): + args += ['-fs', str(self._TEST_FILE_SIZE)] + + ext = info_dict['ext'] + if protocol in ('m3u8', 'm3u8_native'): + use_mpegts = (tmpfilename == '-') or self.params.get('hls_use_mpegts') + if use_mpegts is None: + use_mpegts = info_dict.get('is_live') + if use_mpegts: + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and ffpp._features.get('needs_adtstoasc')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] + elif protocol == 'rtmp': + args += ['-f', 'flv'] + elif ext == 'mp4' and tmpfilename == '-': + args += ['-f', 'mpegts'] + elif ext == 'unknown_video': + ext = determine_ext(remove_end(tmpfilename, '.part')) + if ext == 'unknown_video': + self.report_warning( + 'The video format is unknown and cannot be downloaded by ffmpeg. ' + 'Explicitly set the extension in the filename to attempt download in that format') + else: + self.report_warning(f'The video format is unknown. Trying to download as {ext} according to the filename') + args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] + else: + args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] + + args += self._configuration_args(('_o1', '_o', '')) + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + self._debug_cmd(args) + + piped = any(fmt['url'] in ('-', 'pipe:') for fmt in selected_formats) + with Popen(args, stdin=subprocess.PIPE, env=env) as proc: + if piped: + self.on_process_started(proc, proc.stdin) + try: + retval = proc.wait() + except BaseException as e: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). 
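A stand-alone sketch of the graceful shutdown the comment above describes, using plain subprocess (the real code below uses yt-dlp's Popen wrapper and `communicate_or_kill`; the ffmpeg arguments here are placeholders):

import subprocess
import sys

proc = subprocess.Popen(['ffmpeg', '-i', 'input-url', 'out.mp4'], stdin=subprocess.PIPE)
try:
    retval = proc.wait()
except KeyboardInterrupt:
    if sys.platform != 'win32':
        proc.communicate(b'q')  # ask ffmpeg to quit so the output stays playable
    else:
        proc.kill()
    raise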
+ if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and not piped: + proc.communicate_or_kill(b'q') + else: + proc.kill(timeout=None) + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + + +_BY_NAME = { + klass.get_basename(): klass + for name, klass in globals().items() + if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') +} + + +def list_external_downloaders(): + return sorted(_BY_NAME.keys()) + + +def get_external_downloader(external_downloader): + """ Given the name of the executable, see whether we support the given downloader """ + bn = os.path.splitext(os.path.basename(external_downloader))[0] + return _BY_NAME.get(bn) or next(( + klass for klass in _BY_NAME.values() if klass.EXE_NAME in bn + ), None) diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py new file mode 100644 index 0000000..28cbba0 --- /dev/null +++ b/yt_dlp/downloader/f4m.py @@ -0,0 +1,427 @@ +import base64 +import io +import itertools +import struct +import time +import urllib.parse + +from .fragment import FragmentFD +from ..compat import compat_etree_fromstring +from ..networking.exceptions import HTTPError +from ..utils import fix_xml_ampersands, xpath_text + + +class DataTruncatedError(Exception): + pass + + +class FlvReader(io.BytesIO): + """ + Reader for Flv files + The file format is documented in https://www.adobe.com/devnet/f4v.html + """ + + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + + # Utility functions for reading numbers and strings + def read_unsigned_long_long(self): + return struct.unpack('!Q', self.read_bytes(8))[0] + + def read_unsigned_int(self): + return struct.unpack('!I', self.read_bytes(4))[0] + + def read_unsigned_char(self): + return struct.unpack('!B', self.read_bytes(1))[0] + + def read_string(self): + res = b'' + while True: + char = self.read_bytes(1) + if char == b'\x00': + break + res += char + return res + + def read_box_info(self): + """ + Read a box and return the info as a tuple: (box_size, box_type, box_data) + """ + real_size = size = self.read_unsigned_int() + box_type = self.read_bytes(4) + header_end = 8 + if size == 1: + real_size = self.read_unsigned_long_long() + header_end = 16 + return real_size, box_type, self.read_bytes(real_size - header_end) + + def read_asrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + quality_entry_count = self.read_unsigned_char() + # QualityEntryCount + for i in range(quality_entry_count): + self.read_string() + + segment_run_count = self.read_unsigned_int() + segments = [] + for i in range(segment_run_count): + first_segment = self.read_unsigned_int() + fragments_per_segment = self.read_unsigned_int() + segments.append((first_segment, fragments_per_segment)) + + return { + 'segment_run': segments, + } + + def read_afrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + # time scale + self.read_unsigned_int() + + quality_entry_count = self.read_unsigned_char() + # QualitySegmentUrlModifiers + for i in range(quality_entry_count): + self.read_string() + + fragments_count = self.read_unsigned_int() + fragments = [] + for i in range(fragments_count): + first = self.read_unsigned_int() + first_ts = self.read_unsigned_long_long() + duration = self.read_unsigned_int() + if duration == 0: + discontinuity_indicator = self.read_unsigned_char() + else: + discontinuity_indicator = None + 
fragments.append({ + 'first': first, + 'ts': first_ts, + 'duration': duration, + 'discontinuity_indicator': discontinuity_indicator, + }) + + return { + 'fragments': fragments, + } + + def read_abst(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + + self.read_unsigned_int() # BootstrapinfoVersion + # Profile,Live,Update,Reserved + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 + # time scale + self.read_unsigned_int() + # CurrentMediaTime + self.read_unsigned_long_long() + # SmpteTimeCodeOffset + self.read_unsigned_long_long() + + self.read_string() # MovieIdentifier + server_count = self.read_unsigned_char() + # ServerEntryTable + for i in range(server_count): + self.read_string() + quality_count = self.read_unsigned_char() + # QualityEntryTable + for i in range(quality_count): + self.read_string() + # DrmData + self.read_string() + # MetaData + self.read_string() + + segments_count = self.read_unsigned_char() + segments = [] + for i in range(segments_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'asrt' + segment = FlvReader(box_data).read_asrt() + segments.append(segment) + fragments_run_count = self.read_unsigned_char() + fragments = [] + for i in range(fragments_run_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'afrt' + fragments.append(FlvReader(box_data).read_afrt()) + + return { + 'segments': segments, + 'fragments': fragments, + 'live': live, + } + + def read_bootstrap_info(self): + total_size, box_type, box_data = self.read_box_info() + assert box_type == b'abst' + return FlvReader(box_data).read_abst() + + +def read_bootstrap_info(bootstrap_bytes): + return FlvReader(bootstrap_bytes).read_bootstrap_info() + + +def build_fragments_list(boot_info): + """ Return a list of (segment, fragment) for each fragment in the video """ + res = [] + segment_run_table = boot_info['segments'][0] + fragment_run_entry_table = boot_info['fragments'][0]['fragments'] + first_frag_number = fragment_run_entry_table[0]['first'] + fragments_counter = itertools.count(first_frag_number) + for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (e.g. Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. 
It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 + for _ in range(fragments_count): + res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + + return res + + +def write_unsigned_int(stream, val): + stream.write(struct.pack('!I', val)) + + +def write_unsigned_int_24(stream, val): + stream.write(struct.pack('!I', val)[1:]) + + +def write_flv_header(stream): + """Writes the FLV header to stream""" + # FLV header + stream.write(b'FLV\x01') + stream.write(b'\x05') + stream.write(b'\x00\x00\x00\x09') + stream.write(b'\x00\x00\x00\x00') + + +def write_metadata_tag(stream, metadata): + """Writes optional metadata tag to stream""" + SCRIPT_TAG = b'\x12' + FLV_TAG_HEADER_LEN = 11 + + if metadata: + stream.write(SCRIPT_TAG) + write_unsigned_int_24(stream, len(metadata)) + stream.write(b'\x00\x00\x00\x00\x00\x00\x00') + stream.write(metadata) + write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) + + +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib + and 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url + + +class F4mFD(FragmentFD): + """ + A downloader for f4m manifests or AdobeHDS. + """ + + def _get_unencrypted_media(self, doc): + media = doc.findall(_add_ns('media')) + if not media: + self.report_error('No media found') + if not self.params.get('allow_unplayable_formats'): + for e in (doc.findall(_add_ns('drmAdditionalHeader')) + + doc.findall(_add_ns('drmAdditionalHeaderSet'))): + # If id attribute is missing it's valid for all media nodes + # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute + if 'id' not in e.attrib: + self.report_error('Missing ID in f4m DRM') + media = remove_encrypted_media(media) + if not media: + self.report_error('Unsupported DRM') + return media + + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + # Sometimes non empty inline bootstrap info can be specified along + # with bootstrap url attribute (e.g. dummy inline bootstrap info + # contains whitespace characters in [1]). We will prefer bootstrap + # url over inline bootstrap info when present. + # 1. 
http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m + bootstrap_url = node.get('url') + if bootstrap_url: + bootstrap_url = urllib.parse.urljoin( + base_url, bootstrap_url) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = base64.b64decode(node.text) + boot_info = read_bootstrap_info(bootstrap) + return boot_info, bootstrap_url + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + requested_bitrate = info_dict.get('tbr') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.url + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/ytdl-org/youtube-dl/issues/7823) + manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() + + doc = compat_etree_fromstring(manifest) + formats = [(int(f.attrib.get('bitrate', -1)), f) + for f in self._get_unencrypted_media(doc)] + if requested_bitrate is None or len(formats) == 1: + # get the best format + formats = sorted(formats, key=lambda f: f[0]) + rate, media = formats[-1] + else: + rate, media = list(filter( + lambda f: int(f[0]) == requested_bitrate, formats))[0] + + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = urllib.parse.urljoin(man_base_url, media.attrib['url']) + bootstrap_node = doc.find(_add_ns('bootstrapInfo')) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) + live = boot_info['live'] + metadata_node = media.find(_add_ns('metadata')) + if metadata_node is not None: + metadata = base64.b64decode(metadata_node.text) + else: + metadata = None + + fragments_list = build_fragments_list(boot_info) + test = self.params.get('test', False) + if test: + # We only download the first fragment + fragments_list = fragments_list[:1] + total_frags = len(fragments_list) + # For some akamai manifests we'll need to add a query to the fragment url + akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) + + ctx = { + 'filename': filename, + 'total_frags': total_frags, + 'live': bool(live), + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] + + if ctx['complete_frags_downloaded_bytes'] == 0: + write_flv_header(dest_stream) + if not live: + write_metadata_tag(dest_stream, metadata) + + base_url_parsed = urllib.parse.urlparse(base_url) + + self._start_frag_download(ctx, info_dict) + + frag_index = 0 + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + name = 'Seg%d-Frag%d' % (seg_i, frag_i) + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) + if akamai_pv: + query.append(akamai_pv.strip(';')) + if info_dict.get('extra_param_to_segment_url'): + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + try: + success = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + if not success: + return False + down_data = self._read_fragment(ctx) + reader = FlvReader(down_data) + while True: + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and 
thus + # FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/ytdl-org/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise + if box_type == b'mdat': + self._append_fragment(ctx, box_data) + break + except HTTPError as err: + if live and (err.status == 404 or err.status == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. + msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise + + if not fragments_list and not test and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) + + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/fc2.py b/yt_dlp/downloader/fc2.py new file mode 100644 index 0000000..f9763de --- /dev/null +++ b/yt_dlp/downloader/fc2.py @@ -0,0 +1,46 @@ +import threading + +from .common import FileDownloader +from .external import FFmpegFD + + +class FC2LiveFD(FileDownloader): + """ + Downloads FC2 live without being stopped.
+ Note, this is not a part of public API, and will be removed without notice. + DO NOT USE + """ + + def real_download(self, filename, info_dict): + ws = info_dict['ws'] + + heartbeat_lock = threading.Lock() + heartbeat_state = [None, 1] + + def heartbeat(): + if heartbeat_state[1] < 0: + return + + try: + heartbeat_state[1] += 1 + ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1]) + except Exception: + self.to_screen('[fc2:live] Heartbeat failed') + + with heartbeat_lock: + heartbeat_state[0] = threading.Timer(30, heartbeat) + heartbeat_state[0]._daemonic = True + heartbeat_state[0].start() + + heartbeat() + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'ws': None, + 'protocol': 'live_ffmpeg', + }) + try: + return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) + finally: + # stop heartbeating + heartbeat_state[1] = -1 diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py new file mode 100644 index 0000000..b4f003d --- /dev/null +++ b/yt_dlp/downloader/fragment.py @@ -0,0 +1,527 @@ +import concurrent.futures +import contextlib +import json +import math +import os +import struct +import time + +from .common import FileDownloader +from .http import HttpFD +from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 +from ..compat import compat_os_name +from ..networking import Request +from ..networking.exceptions import HTTPError, IncompleteRead +from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj +from ..utils.networking import HTTPHeaderDict +from ..utils.progress import ProgressCalculator + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + to_console_title = to_screen + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error + (DASH and hlsnative only). Default is 0 for API, but 10 for CLI + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) + keep_fragments: Keep downloaded fragments on disk after downloading is + finished + concurrent_fragment_downloads: The number of threads to use for native hls and dash downloads + _no_ytdl_file: Don't use .ytdl file + + For each incomplete fragment download yt-dlp keeps on disk a special + bookkeeping file with download state and metadata (in future such files will + be used for any incomplete download handled by yt-dlp). This file is + used to properly handle resuming, check download file consistency and detect + potential errors. The file has a .ytdl extension and represents a standard + JSON file of the following format: + + extractor: + Dictionary of extractor related data. TBD. + + downloader: + Dictionary of downloader related data. May contain following data: + current_fragment: + Dictionary with current (being downloaded) fragment data: + index: 0-based index of current fragment among all fragments + fragment_count: + Total count of fragments + + This feature is experimental and file format may change in future. + """ + + def report_retry_fragment(self, err, frag_index, count, retries): + self.deprecation_warning('yt_dlp.downloader.FragmentFD.report_retry_fragment is deprecated. 
' + 'Use yt_dlp.downloader.FileDownloader.report_retry instead') + return self.report_retry(err, count, retries, frag_index) + + def report_skip_fragment(self, frag_index, err=None): + err = f' {err};' if err else '' + self.to_screen(f'[download]{err} Skipping fragment {frag_index:d} ...') + + def _prepare_url(self, info_dict, url): + headers = info_dict.get('http_headers') + return Request(url, None, headers) if headers else url + + def _prepare_and_start_frag_download(self, ctx, info_dict): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx, info_dict) + + def __do_ytdl_file(self, ctx): + return ctx['live'] is not True and ctx['tmpfilename'] != '-' and not self.params.get('_no_ytdl_file') + + def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx + stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'r') + try: + ytdl_data = json.loads(stream.read()) + ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index'] + if 'extra_state' in ytdl_data['downloader']: + ctx['extra_state'] = ytdl_data['downloader']['extra_state'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() + + def _write_ytdl_file(self, ctx): + frag_index_stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'w') + try: + downloader = { + 'current_fragment': { + 'index': ctx['fragment_index'], + }, + } + if 'extra_state' in ctx: + downloader['extra_state'] = ctx['extra_state'] + if ctx.get('fragment_count') is not None: + downloader['fragment_count'] = ctx['fragment_count'] + frag_index_stream.write(json.dumps({'downloader': downloader})) + finally: + frag_index_stream.close() + + def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_data=None): + fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) + fragment_info_dict = { + 'url': frag_url, + 'http_headers': headers or info_dict.get('http_headers'), + 'request_data': request_data, + 'ctx_id': ctx.get('ctx_id'), + } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len + + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) + if not success: + return False + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') + ctx['fragment_filename_sanitized'] = fragment_filename + return True + + def _read_fragment(self, ctx): + if not ctx.get('fragment_filename_sanitized'): + return None + try: + down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + except FileNotFoundError: + if ctx.get('live'): + return None + raise + ctx['fragment_filename_sanitized'] = frag_sanitized + frag_content = down.read() + down.close() + return frag_content + + def _append_fragment(self, ctx, frag_content): + try: + ctx['dest_stream'].write(frag_content) + ctx['dest_stream'].flush() + finally: + if self.__do_ytdl_file(ctx): + self._write_ytdl_file(ctx) + if not self.params.get('keep_fragments', False): + self.try_remove(encodeFilename(ctx['fragment_filename_sanitized'])) + del ctx['fragment_filename_sanitized'] + + def _prepare_frag_download(self, ctx): + if not ctx.setdefault('live', False): + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' + 
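Concretely, the .ytdl bookkeeping file written by `_write_ytdl_file` above is a small JSON document of roughly this shape (values are illustrative; `extra_state` and `fragment_count` are emitted only when set):

import json

ytdl_data = {
    'downloader': {
        'current_fragment': {'index': 42},  # 0-based resume point
        'extra_state': {},                  # e.g. webvtt dedup/offset state
        'fragment_count': 120,
    },
}
json.dumps(ytdl_data)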
self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader(self.ydl, { + **self.params, + 'noprogress': True, + 'test': False, + 'sleep_interval': 0, + 'max_sleep_interval': 0, + 'sleep_interval_subtitles': 0, + }) + tmpfilename = self.temp_name(ctx['filename']) + open_mode = 'wb' + + # Establish possible resume length + resume_len = self.filesize_or_none(tmpfilename) + if resume_len > 0: + open_mode = 'ab' + + # Should be initialized before ytdl file check + ctx.update({ + 'tmpfilename': tmpfilename, + 'fragment_index': 0, + }) + + if self.__do_ytdl_file(ctx): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + continuedl = self.params.get('continuedl', True) + if continuedl and ytdl_file_exists: + self._read_ytdl_file(ctx) + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') + self.report_warning( + '%s. Restarting from the beginning ...' % message) + ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] + self._write_ytdl_file(ctx) + + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) + assert ctx['fragment_index'] == 0 + + dest_stream, tmpfilename = self.sanitize_open(tmpfilename, open_mode) + + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + # Total complete fragments downloaded so far in bytes + 'complete_frags_downloaded_bytes': resume_len, + }) + + def _start_frag_download(self, ctx, info_dict): + resume_len = ctx['complete_frags_downloaded_bytes'] + total_frags = ctx['total_frags'] + ctx_id = ctx.get('ctx_id') + # Stores the download progress, updated by the progress hook + state = { + 'status': 'downloading', + 'downloaded_bytes': resume_len, + 'fragment_index': ctx['fragment_index'], + 'fragment_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + + ctx['started'] = time.time() + progress = ProgressCalculator(resume_len) + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + + if ctx_id is not None and s.get('ctx_id') != ctx_id: + return + + state['max_progress'] = ctx.get('max_progress') + state['progress_idx'] = ctx.get('progress_idx') + + state['elapsed'] = progress.elapsed + frag_total_bytes = s.get('total_bytes') or 0 + s['fragment_info_dict'] = s.pop('info_dict', {}) + + # XXX: Fragment resume is not accounted for here + if not ctx['live']: + estimated_size = ( + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) + / (state['fragment_index'] + 1) * total_frags) + progress.total = estimated_size + progress.update(s.get('downloaded_bytes')) + state['total_bytes_estimate'] = progress.total + else: + progress.update(s.get('downloaded_bytes')) + + if s['status'] == 'finished': + state['fragment_index'] += 1 + ctx['fragment_index'] = state['fragment_index'] + progress.thread_reset() + + state['downloaded_bytes'] = ctx['complete_frags_downloaded_bytes'] = progress.downloaded + state['speed'] = ctx['speed'] = progress.speed.smooth + state['eta'] = progress.eta.smooth + + 
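The `total_bytes_estimate` computed above is a linear extrapolation: take the average size of the fragments seen so far (including the one in flight) and scale by the total fragment count. With illustrative numbers:

complete_frags_downloaded_bytes = 9_000_000  # three finished fragments
frag_total_bytes = 3_000_000                 # size reported for the current one
fragment_index = 3                           # 0-based, i.e. the 4th fragment
total_frags = 100
estimated_size = ((complete_frags_downloaded_bytes + frag_total_bytes)
                  / (fragment_index + 1) * total_frags)
# -> 300_000_000 bytes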
self._hook_progress(state, info_dict) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return ctx['started'] + + def _finish_frag_download(self, ctx, info_dict): + ctx['dest_stream'].close() + if self.__do_ytdl_file(ctx): + self.try_remove(self.ytdl_filename(ctx['filename'])) + elapsed = time.time() - ctx['started'] + + to_file = ctx['tmpfilename'] != '-' + if to_file: + downloaded_bytes = self.filesize_or_none(ctx['tmpfilename']) + else: + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + + if not downloaded_bytes: + if to_file: + self.try_remove(ctx['tmpfilename']) + self.report_error('The downloaded file is empty') + return False + elif to_file: + self.try_rename(ctx['tmpfilename'], ctx['filename']) + filetime = ctx.get('fragment_filetime') + if self.params.get('updatetime', True) and filetime: + with contextlib.suppress(Exception): + os.utime(ctx['filename'], (time.time(), filetime)) + + self._hook_progress({ + 'downloaded_bytes': downloaded_bytes, + 'total_bytes': downloaded_bytes, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + 'ctx_id': ctx.get('ctx_id'), + 'max_progress': ctx.get('max_progress'), + 'progress_idx': ctx.get('progress_idx'), + }, info_dict) + return True + + def _prepare_external_frag_download(self, ctx): + if 'live' not in ctx: + ctx['live'] = False + if not ctx['live']: + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' + self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') + + tmpfilename = self.temp_name(ctx['filename']) + + # Should be initialized before ytdl file check + ctx.update({ + 'tmpfilename': tmpfilename, + 'fragment_index': 0, + }) + + def decrypter(self, info_dict): + _key_cache = {} + + def _get_key(url): + if url not in _key_cache: + _key_cache[url] = self.ydl.urlopen(self._prepare_url(info_dict, url)).read() + return _key_cache[url] + + def decrypt_fragment(fragment, frag_content): + if frag_content is None: + return + decrypt_info = fragment.get('decrypt_info') + if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': + return frag_content + iv = decrypt_info.get('IV') or struct.pack('>8xq', fragment['media_sequence']) + decrypt_info['KEY'] = (decrypt_info.get('KEY') + or _get_key(traverse_obj(info_dict, ('hls_aes', 'uri')) or decrypt_info['URI'])) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if self.params.get('test', False): + return frag_content + return unpad_pkcs7(aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv)) + + return decrypt_fragment + + def download_and_append_fragments_multiple(self, *args, **kwargs): + ''' + @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... 
+ all args must be either tuple or list + ''' + interrupt_trigger = [True] + max_progress = len(args) + if max_progress == 1: + return self.download_and_append_fragments(*args[0], **kwargs) + max_workers = self.params.get('concurrent_fragment_downloads', 1) + if max_progress > 1: + self._prepare_multiline_status(max_progress) + is_live = any(traverse_obj(args, (..., 2, 'is_live'))) + + def thread_func(idx, ctx, fragments, info_dict, tpe): + ctx['max_progress'] = max_progress + ctx['progress_idx'] = idx + return self.download_and_append_fragments( + ctx, fragments, info_dict, **kwargs, tpe=tpe, interrupt_trigger=interrupt_trigger) + + class FTPE(concurrent.futures.ThreadPoolExecutor): + # has to stop this or it's going to wait on the worker thread itself + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + if compat_os_name == 'nt': + def future_result(future): + while True: + try: + return future.result(0.1) + except KeyboardInterrupt: + raise + except concurrent.futures.TimeoutError: + continue + else: + def future_result(future): + return future.result() + + def interrupt_trigger_iter(fg): + for f in fg: + if not interrupt_trigger[0]: + break + yield f + + spins = [] + for idx, (ctx, fragments, info_dict) in enumerate(args): + tpe = FTPE(math.ceil(max_workers / max_progress)) + job = tpe.submit(thread_func, idx, ctx, interrupt_trigger_iter(fragments), info_dict, tpe) + spins.append((tpe, job)) + + result = True + for tpe, job in spins: + try: + result = result and future_result(job) + except KeyboardInterrupt: + interrupt_trigger[0] = False + finally: + tpe.shutdown(wait=True) + if not interrupt_trigger[0] and not is_live: + raise KeyboardInterrupt() + # we expect the user wants to stop and DO WANT the preceding postprocessors to run; + # so returning a intermediate result here instead of KeyboardInterrupt on live + return result + + def download_and_append_fragments( + self, ctx, fragments, info_dict, *, is_fatal=(lambda idx: False), + pack_func=(lambda content, idx: content), finish_func=None, + tpe=None, interrupt_trigger=(True, )): + + if not self.params.get('skip_unavailable_fragments', True): + is_fatal = lambda _: True + + def download_fragment(fragment, ctx): + if not interrupt_trigger[0]: + return + + frag_index = ctx['fragment_index'] = fragment['frag_index'] + ctx['last_error'] = None + headers = HTTPHeaderDict(info_dict.get('http_headers')) + byte_range = fragment.get('byte_range') + if byte_range: + headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) + + # Never skip the first fragment + fatal = is_fatal(fragment.get('index') or (frag_index - 1)) + + def error_callback(err, count, retries): + if fatal and count > retries: + ctx['dest_stream'].close() + self.report_retry(err, count, retries, frag_index, fatal) + ctx['last_error'] = err + + for retry in RetryManager(self.params.get('fragment_retries'), error_callback): + try: + ctx['fragment_count'] = fragment.get('fragment_count') + if not self._download_fragment( + ctx, fragment['url'], info_dict, headers, info_dict.get('request_data')): + return + except (HTTPError, IncompleteRead) as err: + retry.error = err + continue + except DownloadError: # has own retry settings + if fatal: + raise + + def append_fragment(frag_content, frag_index, ctx): + if frag_content: + self._append_fragment(ctx, pack_func(frag_content, frag_index)) + elif not is_fatal(frag_index - 1): + self.report_skip_fragment(frag_index, 'fragment not found') + else: + ctx['dest_stream'].close() + self.report_error(f'fragment 
{frag_index} not found, unable to continue') + return False + return True + + decrypt_fragment = self.decrypter(info_dict) + + max_workers = math.ceil( + self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1)) + if max_workers > 1: + def _download_fragment(fragment): + ctx_copy = ctx.copy() + download_fragment(fragment, ctx_copy) + return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') + + with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: + try: + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): + ctx.update({ + 'fragment_filename_sanitized': frag_filename, + 'fragment_index': frag_index, + }) + if not append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx): + return False + except KeyboardInterrupt: + self._finish_multiline_status() + self.report_error( + 'Interrupted by user. Waiting for all threads to shutdown...', is_error=False, tb=False) + pool.shutdown(wait=False) + raise + else: + for fragment in fragments: + if not interrupt_trigger[0]: + break + try: + download_fragment(fragment, ctx) + result = append_fragment( + decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) + except KeyboardInterrupt: + if info_dict.get('is_live'): + break + raise + if not result: + return False + + if finish_func is not None: + ctx['dest_stream'].write(finish_func()) + ctx['dest_stream'].flush() + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py new file mode 100644 index 0000000..4ac5d99 --- /dev/null +++ b/yt_dlp/downloader/hls.py @@ -0,0 +1,378 @@ +import binascii +import io +import re +import urllib.parse + +from . import get_suitable_downloader +from .external import FFmpegFD +from .fragment import FragmentFD +from .. import webvtt +from ..dependencies import Cryptodome +from ..utils import ( + bug_reports_message, + parse_m3u8_attributes, + remove_start, + traverse_obj, + update_url_query, + urljoin, +) + + +class HlsFD(FragmentFD): + """ + Download segments in a m3u8 manifest. External downloaders can take over + the fragment downloads by supporting the 'm3u8_frag_urls' protocol and + re-defining 'supports_manifest' function + """ + + FD_NAME = 'hlsnative' + + @staticmethod + def _has_drm(manifest): # TODO: https://github.com/yt-dlp/yt-dlp/pull/5039 + return bool(re.search('|'.join(( + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + )), manifest)) + + @classmethod + def can_download(cls, manifest, info_dict, allow_unplayable_formats=False): + UNSUPPORTED_FEATURES = [ + # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + + # This heuristic also is not correct since segments may not be appended as well. + # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite + # no segments will definitely be appended to the end of the playlist. 
+ # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # # event media playlists [4] + # r'#EXT-X-MAP:', # media initialization [5] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 + ] + if not allow_unplayable_formats: + UNSUPPORTED_FEATURES += [ + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM + ] + + def check_results(): + yield not info_dict.get('is_live') + for feature in UNSUPPORTED_FEATURES: + yield not re.search(feature, manifest) + if not allow_unplayable_formats: + yield not cls._has_drm(manifest) + return all(check_results()) + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.url + s = urlh.read().decode('utf-8', 'ignore') + + can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None + if can_download: + has_ffmpeg = FFmpegFD.available() + no_crypto = not Cryptodome.AES and '#EXT-X-KEY:METHOD=AES-128' in s + if no_crypto and has_ffmpeg: + can_download, message = False, 'The stream has AES-128 encryption and pycryptodomex is not available' + elif no_crypto: + message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodomex are available; ' + 'Decryption will be performed natively, but will be extremely slow') + elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): + install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and ' + message = ('Live HLS streams are not supported by the native downloader. 
If this is a livestream, ' + f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') + if not can_download: + if self._has_drm(s) and not self.params.get('allow_unplayable_formats'): + if info_dict.get('has_drm') and self.params.get('test'): + self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True) + else: + self.report_error( + 'This format is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format', tb=False) + return False + message = message or 'Unsupported features have been detected' + fd = FFmpegFD(self.ydl, self.params) + self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}') + return fd.real_download(filename, info_dict) + elif message: + self.report_warning(message) + + is_webvtt = info_dict['ext'] == 'vtt' + if is_webvtt: + real_downloader = None # Packing the fragments is not currently supported for external downloader + else: + real_downloader = get_suitable_downloader( + info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename == '-')) + if real_downloader and not real_downloader.supports_manifest(s): + real_downloader = None + if real_downloader: + self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') + + def is_ad_fragment_start(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + + fragments = [] + + media_frags = 0 + ad_frags = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if not line: + continue + if line.startswith('#'): + if is_ad_fragment_start(line): + ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False + continue + if ad_frag_next: + ad_frags += 1 + continue + media_frags += 1 + + ctx = { + 'filename': filename, + 'total_frags': media_frags, + 'ad_frags': ad_frags, + } + + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, info_dict) + + extra_state = ctx.setdefault('extra_state', {}) + + format_index = info_dict.get('format_index') + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} + external_aes_key = traverse_obj(info_dict, ('hls_aes', 'key')) + if external_aes_key: + external_aes_key = binascii.unhexlify(remove_start(external_aes_key, '0x')) + assert len(external_aes_key) in (16, 24, 32), 'Invalid length for HLS AES-128 key' + external_aes_iv = traverse_obj(info_dict, ('hls_aes', 'iv')) + if external_aes_iv: + external_aes_iv = binascii.unhexlify(remove_start(external_aes_iv, '0x').zfill(32)) + byte_range = {} + discontinuity_count = 0 + frag_index = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + if format_index and discontinuity_count != format_index: + continue + if ad_frag_next: + continue + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + frag_url = urljoin(man_url, line) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + + 
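The `hls_aes` key and IV strings handled above are hex, optionally '0x'-prefixed, and the IV is zero-padded to 16 bytes before `binascii.unhexlify`. A sketch with made-up values:

import binascii

key = binascii.unhexlify('31323334353637383930313233343536')  # b'1234567890123456'
assert len(key) in (16, 24, 32)
iv = binascii.unhexlify('1f2e3d4c'.zfill(32))  # left-padded to 16 bytes
len(iv)  # 16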
fragments.append({ + 'frag_index': frag_index, + 'url': frag_url, + 'decrypt_info': decrypt_info, + 'byte_range': byte_range, + 'media_sequence': media_sequence, + }) + media_sequence += 1 + + elif line.startswith('#EXT-X-MAP'): + if format_index and discontinuity_count != format_index: + continue + if frag_index > 0: + self.report_error( + 'Initialization fragment found after media fragments, unable to download') + return False + frag_index += 1 + map_info = parse_m3u8_attributes(line[11:]) + frag_url = urljoin(man_url, map_info.get('URI')) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + + if map_info.get('BYTERANGE'): + splitted_byte_range = map_info.get('BYTERANGE').split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + + fragments.append({ + 'frag_index': frag_index, + 'url': frag_url, + 'decrypt_info': decrypt_info, + 'byte_range': byte_range, + 'media_sequence': media_sequence + }) + media_sequence += 1 + + elif line.startswith('#EXT-X-KEY'): + decrypt_url = decrypt_info.get('URI') + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if external_aes_iv: + decrypt_info['IV'] = external_aes_iv + elif 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) + if external_aes_key: + decrypt_info['KEY'] = external_aes_key + else: + decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI']) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) + if decrypt_url != decrypt_info['URI']: + decrypt_info['KEY'] = None + + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) + elif line.startswith('#EXT-X-BYTERANGE'): + splitted_byte_range = line[17:].split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + elif is_ad_fragment_start(line): + ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False + elif line.startswith('#EXT-X-DISCONTINUITY'): + discontinuity_count += 1 + i += 1 + + # We only download the first fragment during the test + if self.params.get('test', False): + fragments = [fragments[0] if fragments else None] + + if real_downloader: + info_dict['fragments'] = fragments + fd = real_downloader(self.ydl, self.params) + # TODO: Make progress updates work without hooking twice + # for ph in self._progress_hooks: + # fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + + if is_webvtt: + def pack_fragment(frag_content, frag_index): + output = io.StringIO() + adjust = 0 + overflow = False + mpegts_last = None + for block in webvtt.parse_fragment(frag_content): + if isinstance(block, webvtt.CueBlock): + extra_state['webvtt_mpegts_last'] = mpegts_last + if overflow: + extra_state['webvtt_mpegts_adjust'] += 1 + overflow = False + block.start += adjust + block.end += adjust + + dedup_window = extra_state.setdefault('webvtt_dedup_window', []) + + ready = [] + + i = 0 + is_new = True + while i < len(dedup_window): + wcue = dedup_window[i] + wblock = webvtt.CueBlock.from_json(wcue) + i += 1 + if wblock.hinges(block): + wcue['end'] = block.end + is_new = False + continue + if wblock == block: + is_new = False + continue + if wblock.end > block.start: + continue 
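+                            # the window cue ends at or before the current cue's start, so it
+                            # can no longer be merged; move it out of the window for emission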
+ ready.append(wblock) + i -= 1 + del dedup_window[i] + + if is_new: + dedup_window.append(block.as_json) + for block in ready: + block.write_into(output) + + # we only emit cues once they fall out of the duplicate window + continue + elif isinstance(block, webvtt.Magic): + # take care of MPEG PES timestamp overflow + if block.mpegts is None: + block.mpegts = 0 + extra_state.setdefault('webvtt_mpegts_adjust', 0) + block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33 + if block.mpegts < extra_state.get('webvtt_mpegts_last', 0): + overflow = True + block.mpegts += 1 << 33 + mpegts_last = block.mpegts + + if frag_index == 1: + extra_state['webvtt_mpegts'] = block.mpegts or 0 + extra_state['webvtt_local'] = block.local or 0 + # XXX: block.local = block.mpegts = None ? + else: + if block.mpegts is not None and block.local is not None: + adjust = ( + (block.mpegts - extra_state.get('webvtt_mpegts', 0)) + - (block.local - extra_state.get('webvtt_local', 0)) + ) + continue + elif isinstance(block, webvtt.HeaderBlock): + if frag_index != 1: + # XXX: this should probably be silent as well + # or verify that all segments contain the same data + self.report_warning(bug_reports_message( + 'Discarding a %s block found in the middle of the stream; ' + 'if the subtitles display incorrectly,' + % (type(block).__name__))) + continue + block.write_into(output) + + return output.getvalue().encode() + + def fin_fragments(): + dedup_window = extra_state.get('webvtt_dedup_window') + if not dedup_window: + return b'' + + output = io.StringIO() + for cue in dedup_window: + webvtt.CueBlock.from_json(cue).write_into(output) + + return output.getvalue().encode() + + if len(fragments) == 1: + self.download_and_append_fragments(ctx, fragments, info_dict) + else: + self.download_and_append_fragments( + ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) + else: + return self.download_and_append_fragments(ctx, fragments, info_dict) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py new file mode 100644 index 0000000..693828b --- /dev/null +++ b/yt_dlp/downloader/http.py @@ -0,0 +1,383 @@ +import os +import random +import time + +from .common import FileDownloader +from ..networking import Request +from ..networking.exceptions import ( + CertificateVerifyError, + HTTPError, + TransportError, +) +from ..utils import ( + ContentTooShortError, + RetryManager, + ThrottledDownload, + XAttrMetadataError, + XAttrUnavailableError, + encodeFilename, + int_or_none, + parse_http_range, + try_call, + write_xattr, +) +from ..utils.networking import HTTPHeaderDict + + +class HttpFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + request_data = info_dict.get('request_data', None) + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None + + # Disable compression + headers = HTTPHeaderDict({'Accept-Encoding': 'identity'}, info_dict.get('http_headers')) + + is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + self.params.get('http_chunk_size') + or info_dict.get('downloader_options', {}).get('http_chunk_size') + or 0) + + ctx.open_mode = 'wb' + ctx.resume_len = 0 + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + + # parse given Range + req_start, req_end, _ = 
parse_http_range(headers.get('Range'))
+
+        if self.params.get('continuedl', True):
+            # Establish possible resume length
+            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
+                ctx.resume_len = os.path.getsize(
+                    encodeFilename(ctx.tmpfilename))
+
+        ctx.is_resume = ctx.resume_len > 0
+
+        class SucceedDownload(Exception):
+            pass
+
+        class RetryDownload(Exception):
+            def __init__(self, source_error):
+                self.source_error = source_error
+
+        class NextFragment(Exception):
+            pass
+
+        def establish_connection():
+            ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
+                              if not is_test and chunk_size else chunk_size)
+            if ctx.resume_len > 0:
+                range_start = ctx.resume_len
+                if req_start is not None:
+                    # offset the beginning of Range to be within request
+                    range_start += req_start
+                if ctx.is_resume:
+                    self.report_resuming_byte(ctx.resume_len)
+                ctx.open_mode = 'ab'
+            elif req_start is not None:
+                range_start = req_start
+            elif ctx.chunk_size > 0:
+                range_start = 0
+            else:
+                range_start = None
+            ctx.is_resume = False
+
+            if ctx.chunk_size:
+                chunk_aware_end = range_start + ctx.chunk_size - 1
+                # we're not allowed to download outside Range
+                range_end = chunk_aware_end if req_end is None else min(chunk_aware_end, req_end)
+            elif req_end is not None:
+                # there's no need for chunked downloads, so download until the end of Range
+                range_end = req_end
+            else:
+                range_end = None
+
+            if try_call(lambda: range_start > range_end):
+                ctx.resume_len = 0
+                ctx.open_mode = 'wb'
+                raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})'))
+
+            if try_call(lambda: range_end >= ctx.content_len):
+                range_end = ctx.content_len - 1
+
+            request = Request(url, request_data, headers)
+            has_range = range_start is not None
+            if has_range:
+                request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}'
+            # Establish connection
+            try:
+                ctx.data = self.ydl.urlopen(request)
+                # When trying to resume, the Content-Range HTTP header of the response has to be
+                # checked to match the value of the requested Range HTTP header. This is due to
+                # webservers that don't support resuming and serve a whole file with no
+                # Content-Range set in the response despite the requested Range (see
+                # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
+                if has_range:
+                    content_range = ctx.data.headers.get('Content-Range')
+                    content_range_start, content_range_end, content_len = parse_http_range(content_range)
+                    # Content-Range is present and matches requested Range, resume is possible
+                    if range_start == content_range_start and (
+                            # Non-chunked download
+                            not ctx.chunk_size
+                            # Chunked download and requested piece or
+                            # its part is promised to be served
+                            or content_range_end == range_end
+                            or content_len < range_end):
+                        ctx.content_len = content_len
+                        if content_len or req_end:
+                            ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0)
+                        return
+                    # Content-Range is either not present or invalid. 
Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + elif range_start > 0: + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + ctx.data_len = ctx.content_len = int_or_none(ctx.data.headers.get('Content-length', None)) + except HTTPError as err: + if err.status == 416: + # Unable to resume (requested range not satisfiable) + try: + # Open the connection again without the range header + ctx.data = self.ydl.urlopen( + Request(url, request_data, headers)) + content_length = ctx.data.headers['Content-Length'] + except HTTPError as err: + if err.status < 500 or err.status >= 600: + raise + else: + # Examine the reported length + if (content_length is not None + and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): + # The file had already been fully downloaded. + # Explanation to the above condition: in issue #175 it was revealed that + # YouTube sometimes adds or removes a few bytes from the end of the file, + # changing the file size slightly and causing problems for some users. So + # I decided to implement a suggested change and consider the file + # completely downloaded if the file size differs less than 100 bytes from + # the one in the hard drive. + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) + self._hook_progress({ + 'filename': ctx.filename, + 'status': 'finished', + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, + }, info_dict) + raise SucceedDownload() + else: + # The length does not match, we start the download over + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + elif err.status < 500 or err.status >= 600: + # Unexpected HTTP error + raise + raise RetryDownload(err) + except CertificateVerifyError: + raise + except TransportError as err: + raise RetryDownload(err) + + def close_stream(): + if ctx.stream is not None: + if not ctx.tmpfilename == '-': + ctx.stream.close() + ctx.stream = None + + def download(): + data_len = ctx.data.headers.get('Content-length') + + if ctx.data.headers.get('Content-encoding'): + # Content-encoding is present, Content-length is not reliable anymore as we are + # doing auto decompression. (See: https://github.com/yt-dlp/yt-dlp/pull/6176) + data_len = None + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen( + f'\r[download] File is smaller than min-filesize ({data_len} bytes < {min_data_len} bytes). Aborting.') + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen( + f'\r[download] File is larger than max-filesize ({data_len} bytes > {max_data_len} bytes). 
Aborting.') + return False + + byte_counter = 0 + ctx.resume_len + block_size = ctx.block_size + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + close_stream() + if ctx.tmpfilename == '-': + ctx.resume_len = byte_counter + else: + try: + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + except FileNotFoundError: + ctx.resume_len = 0 + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + except TransportError as err: + retry(err) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = self.sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except OSError as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode()) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except OSError as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if ctx.data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': ctx.data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), + }, info_dict) + + if data_len is not None and byte_counter == data_len: + break + + if speed and speed < (self.params.get('throttledratelimit') or 0): + # The speed must stay below the limit for 3 seconds + # This prevents raising error when the speed temporarily goes down + if ctx.throttle_start is None: + ctx.throttle_start = now + elif now - ctx.throttle_start > 3: + if ctx.stream is not None and ctx.tmpfilename != '-': + ctx.stream.close() + raise ThrottledDownload() + elif speed: + ctx.throttle_start = None + + if ctx.stream is None: + self.to_stderr('\n') + self.report_error('Did not get any data blocks') + return False + + if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: + ctx.resume_len = byte_counter + raise NextFragment() + + if ctx.tmpfilename != '-': + ctx.stream.close() + + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + retry(err) + + 
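+            # all expected bytes were received; move the temporary file to its final name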
self.try_rename(ctx.tmpfilename, ctx.filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.headers.get('last-modified', None)) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), + }, info_dict) + + return True + + for retry in RetryManager(self.params.get('retries'), self.report_retry): + try: + establish_connection() + return download() + except RetryDownload as err: + retry.error = err.source_error + continue + except NextFragment: + retry.error = None + retry.attempt -= 1 + continue + except SucceedDownload: + return True + except: # noqa: E722 + close_stream() + raise + return False diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py new file mode 100644 index 0000000..dd688f5 --- /dev/null +++ b/yt_dlp/downloader/ism.py @@ -0,0 +1,283 @@ +import binascii +import io +import struct +import time + +from .fragment import FragmentFD +from ..networking.exceptions import HTTPError +from ..utils import RetryManager + +u8 = struct.Struct('>B') +u88 = struct.Struct('>Bx') +u16 = struct.Struct('>H') +u1616 = struct.Struct('>Hxx') +u32 = struct.Struct('>I') +u64 = struct.Struct('>Q') + +s88 = struct.Struct('>bx') +s16 = struct.Struct('>h') +s1616 = struct.Struct('>hxx') +s32 = struct.Struct('>i') + +unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) + +TRACK_ENABLED = 0x1 +TRACK_IN_MOVIE = 0x2 +TRACK_IN_PREVIEW = 0x4 + +SELF_CONTAINED = 0x1 + + +def box(box_type, payload): + return u32.pack(8 + len(payload)) + box_type + payload + + +def full_box(box_type, version, flags, payload): + return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) + + +def write_piff_header(stream, params): + track_id = params['track_id'] + fourcc = params['fourcc'] + duration = params['duration'] + timescale = params.get('timescale', 10000000) + language = params.get('language', 'und') + height = params.get('height', 0) + width = params.get('width', 0) + stream_type = params['stream_type'] + creation_time = modification_time = int(time.time()) + + ftyp_payload = b'isml' # major brand + ftyp_payload += u32.pack(1) # minor version + ftyp_payload += b'piff' + b'iso2' # compatible brands + stream.write(box(b'ftyp', ftyp_payload)) # File Type Box + + mvhd_payload = u64.pack(creation_time) + mvhd_payload += u64.pack(modification_time) + mvhd_payload += u32.pack(timescale) + mvhd_payload += u64.pack(duration) + mvhd_payload += s1616.pack(1) # rate + mvhd_payload += s88.pack(1) # volume + mvhd_payload += u16.pack(0) # reserved + mvhd_payload += u32.pack(0) * 2 # reserved + mvhd_payload += unity_matrix + mvhd_payload += u32.pack(0) * 6 # pre defined + mvhd_payload += u32.pack(0xffffffff) # next track id + moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box + + tkhd_payload = u64.pack(creation_time) + tkhd_payload += u64.pack(modification_time) + tkhd_payload += u32.pack(track_id) # track id + tkhd_payload += u32.pack(0) # reserved + tkhd_payload += u64.pack(duration) + tkhd_payload += u32.pack(0) * 2 # reserved + tkhd_payload += s16.pack(0) # layer + tkhd_payload += s16.pack(0) # alternate group + tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume + tkhd_payload += u16.pack(0) # reserved + tkhd_payload += unity_matrix + tkhd_payload += u1616.pack(width) + tkhd_payload 
+= u1616.pack(height) + trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box + + mdhd_payload = u64.pack(creation_time) + mdhd_payload += u64.pack(modification_time) + mdhd_payload += u32.pack(timescale) + mdhd_payload += u64.pack(duration) + mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) + mdhd_payload += u16.pack(0) # pre defined + mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box + + hdlr_payload = u32.pack(0) # pre defined + if stream_type == 'audio': # handler type + hdlr_payload += b'soun' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SoundHandler\0' # name + elif stream_type == 'video': + hdlr_payload += b'vide' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'VideoHandler\0' # name + elif stream_type == 'text': + hdlr_payload += b'subt' + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += b'SubtitleHandler\0' # name + else: + assert False + mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box + + if stream_type == 'audio': + smhd_payload = s88.pack(0) # balance + smhd_payload += u16.pack(0) # reserved + media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header + elif stream_type == 'video': + vmhd_payload = u16.pack(0) # graphics mode + vmhd_payload += u16.pack(0) * 3 # opcolor + media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header + elif stream_type == 'text': + media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header + else: + assert False + minf_payload = media_header_box + + dref_payload = u32.pack(1) # entry count + dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box + dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box + minf_payload += box(b'dinf', dinf_payload) # Data Information Box + + stsd_payload = u32.pack(1) # entry count + + sample_entry_payload = u8.pack(0) * 6 # reserved + sample_entry_payload += u16.pack(1) # data reference index + if stream_type == 'audio': + sample_entry_payload += u32.pack(0) * 2 # reserved + sample_entry_payload += u16.pack(params.get('channels', 2)) + sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u1616.pack(params['sampling_rate']) + + if fourcc == 'AACL': + sample_entry_box = box(b'mp4a', sample_entry_payload) + if fourcc == 'EC-3': + sample_entry_box = box(b'ec-3', sample_entry_payload) + elif stream_type == 'video': + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u32.pack(0) * 3 # pre defined + sample_entry_payload += u16.pack(width) + sample_entry_payload += u16.pack(height) + sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi + sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi + sample_entry_payload += u32.pack(0) # reserved + sample_entry_payload += u16.pack(1) # frame count + sample_entry_payload += u8.pack(0) * 32 # compressor name + sample_entry_payload += u16.pack(0x18) # depth + sample_entry_payload += s16.pack(-1) # pre defined + + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode()) + if fourcc in ('H264', 'AVC1'): + sps, pps = codec_private_data.split(u32.pack(1))[1:] + avcc_payload = u8.pack(1) # configuration 
version
+            avcc_payload += sps[1:4]  # avc profile indication + profile compatibility + avc level indication
+            avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1))  # complete representation (1) + reserved (11111) + length size minus one
+            avcc_payload += u8.pack(1)  # reserved (0) + number of sps (0000001)
+            avcc_payload += u16.pack(len(sps))
+            avcc_payload += sps
+            avcc_payload += u8.pack(1)  # number of pps
+            avcc_payload += u16.pack(len(pps))
+            avcc_payload += pps
+            sample_entry_payload += box(b'avcC', avcc_payload)  # AVC Decoder Configuration Record
+            sample_entry_box = box(b'avc1', sample_entry_payload)  # AVC Simple Entry
+        else:
+            assert False
+    elif stream_type == 'text':
+        if fourcc == 'TTML':
+            sample_entry_payload += b'http://www.w3.org/ns/ttml\0'  # namespace
+            sample_entry_payload += b'\0'  # schema location
+            sample_entry_payload += b'\0'  # auxiliary mime types(??)
+            sample_entry_box = box(b'stpp', sample_entry_payload)
+        else:
+            assert False
+    else:
+        assert False
+    stsd_payload += sample_entry_box
+
+    stbl_payload = full_box(b'stsd', 0, 0, stsd_payload)  # Sample Description Box
+
+    stts_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stts', 0, 0, stts_payload)  # Decoding Time to Sample Box
+
+    stsc_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stsc', 0, 0, stsc_payload)  # Sample To Chunk Box
+
+    stco_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stco', 0, 0, stco_payload)  # Chunk Offset Box
+
+    minf_payload += box(b'stbl', stbl_payload)  # Sample Table Box
+
+    mdia_payload += box(b'minf', minf_payload)  # Media Information Box
+
+    trak_payload += box(b'mdia', mdia_payload)  # Media Box
+
+    moov_payload += box(b'trak', trak_payload)  # Track Box
+
+    mehd_payload = u64.pack(duration)
+    mvex_payload = full_box(b'mehd', 1, 0, mehd_payload)  # Movie Extends Header Box
+
+    trex_payload = u32.pack(track_id)  # track id
+    trex_payload += u32.pack(1)  # default sample description index
+    trex_payload += u32.pack(0)  # default sample duration
+    trex_payload += u32.pack(0)  # default sample size
+    trex_payload += u32.pack(0)  # default sample flags
+    mvex_payload += full_box(b'trex', 0, 0, trex_payload)  # Track Extends Box
+
+    moov_payload += box(b'mvex', mvex_payload)  # Movie Extends Box
+    stream.write(box(b'moov', moov_payload))  # Movie Box
+
+
+def extract_box_data(data, box_sequence):
+    data_reader = io.BytesIO(data)
+    while True:
+        box_size = u32.unpack(data_reader.read(4))[0]
+        box_type = data_reader.read(4)
+        if box_type == box_sequence[0]:
+            box_data = data_reader.read(box_size - 8)
+            if len(box_sequence) == 1:
+                return box_data
+            return extract_box_data(box_data, box_sequence[1:])
+        data_reader.seek(box_size - 8, 1)
+
+
+class IsmFD(FragmentFD):
+    """
+    Download segments in an ISM manifest
+    """
+
+    def real_download(self, filename, info_dict):
+        segments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(segments),
+        }
+
+        self._prepare_and_start_frag_download(ctx, info_dict)
+
+        extra_state = ctx.setdefault('extra_state', {
+            'ism_track_written': False,
+        })
+
+        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
+
+        frag_index = 0
+        for i, segment in enumerate(segments):
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
+
+            retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry,
+                                         frag_index=frag_index, fatal=not skip_unavailable_fragments)
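+            # each fragment gets its own retry cycle; a fragment that still fails after
+            # all retries is either fatal or skipped, depending on skip_unavailable_fragments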
+            for retry in retry_manager:
+                try:
+                    success = self._download_fragment(ctx, segment['url'], info_dict)
+                    if not success:
+                        return False
+                    frag_content = self._read_fragment(ctx)
+
+                    if not extra_state['ism_track_written']:
+                        tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
+                        info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
+                        write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
+                        extra_state['ism_track_written'] = True
+                    self._append_fragment(ctx, frag_content)
+                except HTTPError as err:
+                    retry.error = err
+                    continue
+
+            if retry_manager.error:
+                if not skip_unavailable_fragments:
+                    return False
+                self.report_skip_fragment(frag_index)
+
+        return self._finish_frag_download(ctx, info_dict)
diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py
new file mode 100644
index 0000000..d977dce
--- /dev/null
+++ b/yt_dlp/downloader/mhtml.py
@@ -0,0 +1,189 @@
+import io
+import quopri
+import re
+import uuid
+
+from .fragment import FragmentFD
+from ..compat import imghdr
+from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
+from ..version import __version__ as YT_DLP_VERSION
+
+
+class MhtmlFD(FragmentFD):
+    _STYLESHEET = """\
+html, body {
+    margin: 0;
+    padding: 0;
+    height: 100vh;
+}
+
+html {
+    overflow-y: scroll;
+    scroll-snap-type: y mandatory;
+}
+
+body {
+    scroll-snap-type: y mandatory;
+    display: flex;
+    flex-flow: column;
+}
+
+body > figure {
+    max-width: 100vw;
+    max-height: 100vh;
+    scroll-snap-align: center;
+}
+
+body > figure > figcaption {
+    text-align: center;
+    height: 2.5em;
+}
+
+body > figure > img {
+    display: block;
+    margin: auto;
+    max-width: 100%;
+    max-height: calc(100vh - 5em);
+}
+"""
+    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
+    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
+
+    @staticmethod
+    def _escape_mime(s):
+        return '=?utf-8?Q?' + (b''.join(
+            bytes((b,)) if b >= 0x20 else b'=%02X' % b
+            for b in quopri.encodestring(s.encode(), header=True)
+        )).decode('us-ascii') + '?='
+
+    def _gen_cid(self, i, fragment, frag_boundary):
+        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
+
+    def _gen_stub(self, *, fragments, frag_boundary, title):
+        output = io.StringIO()
+
+        output.write((
+            '<!DOCTYPE html>'
+            '<html>'
+            '<head>'
+            ''  '<meta name="generator" content="yt-dlp {version}">'
+            ''  '<title>{title}</title>'
+            ''  '<style>{styles}</style>'
+            '<body>'
+        ).format(
+            version=escapeHTML(YT_DLP_VERSION),
+            styles=self._STYLESHEET,
+            title=escapeHTML(title)
+        ))
+
+        t0 = 0
+        for i, frag in enumerate(fragments):
+            output.write('<figure>')
+            try:
+                t1 = t0 + frag['duration']
+                output.write((
+                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
+                ).format(
+                    num=i + 1,
+                    t0=srt_subtitles_timecode(t0),
+                    t1=srt_subtitles_timecode(t1),
+                    duration=formatSeconds(frag['duration'], msec=True)
+                ))
+            except (KeyError, ValueError, TypeError):
+                t1 = None
+                output.write((
+                    '<figcaption>Slide #{num}</figcaption>'
+                ).format(num=i + 1))
+            output.write('<img src="cid:{cid}">'.format(
+                cid=self._gen_cid(i, frag, frag_boundary)))
+            output.write('</figure>')
+            t0 = t1
+
+        return output.getvalue()
+
+    def real_download(self, filename, info_dict):
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
+        title = info_dict.get('title', info_dict['format_id'])
+        origin = info_dict.get('webpage_url', info_dict['url'])
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(fragments),
+        }
+
+        self._prepare_and_start_frag_download(ctx, info_dict)
+
+        extra_state = ctx.setdefault('extra_state', {
+            'header_written': False,
+            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
+        })
+
+        frag_boundary = extra_state['mime_boundary']
+
+        if not extra_state['header_written']:
+            stub = self._gen_stub(
+                fragments=fragments,
+                frag_boundary=frag_boundary,
+                title=title
+            )
+
+            ctx['dest_stream'].write((
+                'MIME-Version: 1.0\r\n'
+                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
+                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
+                'Subject: {title}\r\n'
+                'Content-type: multipart/related; '
+                ''  'boundary="{boundary}"; '
+                ''  'type="text/html"\r\n'
+                'X.yt-dlp.Origin: {origin}\r\n'
+                '\r\n'
+                '--{boundary}\r\n'
+                'Content-Type: text/html; charset=utf-8\r\n'
+                'Content-Length: {length}\r\n'
+                '\r\n'
+                '{stub}\r\n'
+            ).format(
+                origin=origin,
+                boundary=frag_boundary,
+                length=len(stub),
+                title=self._escape_mime(title),
+                stub=stub
+            ).encode())
+            extra_state['header_written'] = True
+
+        for i, fragment in enumerate(fragments):
+            if (i + 1) <= ctx['fragment_index']:
+                continue
+
+            fragment_url = fragment.get('url')
+            if not fragment_url:
+                assert fragment_base_url
+                fragment_url = urljoin(fragment_base_url, fragment['path'])
+
+            success = self._download_fragment(ctx, fragment_url, info_dict)
+            if not success:
+                continue
+            frag_content = self._read_fragment(ctx)
+
+            frag_header = io.BytesIO()
+            frag_header.write(
+                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
+            frag_header.write(
+                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
+            frag_header.write(
+                b'Content-type: %b\r\n' % f'image/{imghdr.what(h=frag_content) or "jpeg"}'.encode())
+            frag_header.write(
+                b'Content-length: %u\r\n' % len(frag_content))
+            frag_header.write(
+                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
+            frag_header.write(
+                b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
+            frag_header.write(b'\r\n')
+            self._append_fragment(
+                ctx, frag_header.getvalue() + frag_content + b'\r\n')
+
+        ctx['dest_stream'].write(
+            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
+        return self._finish_frag_download(ctx, info_dict)
diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py
new file mode 100644
index 0000000..fef8bff
--- /dev/null
+++ b/yt_dlp/downloader/niconico.py
@@ -0,0 +1,140 @@
+import json
+import threading
+import time
+
+from . 
import get_suitable_downloader +from .common import FileDownloader +from .external import FFmpegFD +from ..networking import Request +from ..utils import DownloadError, str_or_none, try_get + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + def real_download(self, filename, info_dict): + from ..extractor.niconico import NiconicoIE + + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success + + +class NiconicoLiveFD(FileDownloader): + """ Downloads niconico live without being stopped """ + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + ws_url = info_dict['url'] + ws_extractor = info_dict['ws'] + ws_origin_host = info_dict['origin'] + live_quality = info_dict.get('live_quality', 'high') + live_latency = info_dict.get('live_latency', 'high') + dl = FFmpegFD(self.ydl, self.params or {}) + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'protocol': 'm3u8', + }) + + def communicate_ws(reconnect): + if reconnect: + ws = self.ydl.urlopen(Request(ws_url, headers={'Origin': f'https://{ws_origin_host}'})) + if self.ydl.params.get('verbose', False): + self.to_screen('[debug] Sending startWatching request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': live_quality, + 'protocol': 'hls+fmp4', + 'latency': live_latency, + 'chasePlay': False + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True + }, + 'reconnect': True, + } + })) + else: + ws = ws_extractor + with ws: + while True: + recv = ws.recv() + if not recv: + continue + data = json.loads(recv) + if not data or not isinstance(data, dict): + continue + if data.get('type') == 'ping': + # pong back + ws.send(r'{"type":"pong"}') + ws.send(r'{"type":"keepSeat"}') + elif data.get('type') == 'disconnect': + self.write_debug(data) + return True + elif data.get('type') == 'error': + self.write_debug(data) + message = try_get(data, lambda x: x['body']['code'], str) or recv + return DownloadError(message) + elif self.ydl.params.get('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...' 
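+                        # any other message type: log a (truncated) copy when --verbose is given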
+                        self.to_screen('[debug] Server said: %s' % recv)
+
+        def ws_main():
+            reconnect = False
+            while True:
+                try:
+                    ret = communicate_ws(reconnect)
+                    if ret is True:
+                        return
+                except BaseException as e:
+                    self.to_screen('[%s] %s: Connection error occurred, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e)))
+                    time.sleep(10)
+                    continue
+                finally:
+                    reconnect = True
+
+        thread = threading.Thread(target=ws_main, daemon=True)
+        thread.start()
+
+        return dl.download(filename, new_info_dict)
diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py
new file mode 100644
index 0000000..0e09525
--- /dev/null
+++ b/yt_dlp/downloader/rtmp.py
@@ -0,0 +1,213 @@
+import os
+import re
+import subprocess
+import time
+
+from .common import FileDownloader
+from ..utils import (
+    Popen,
+    check_executable,
+    encodeArgument,
+    encodeFilename,
+    get_exe_version,
+)
+
+
+def rtmpdump_version():
+    return get_exe_version(
+        'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)')
+
+
+class RtmpFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        def run_rtmpdump(args):
+            start = time.time()
+            resume_percent = None
+            resume_downloaded_data_len = None
+            proc = Popen(args, stderr=subprocess.PIPE)
+            cursor_in_new_line = True
+            proc_stderr_closed = False
+            try:
+                while not proc_stderr_closed:
+                    # read line from stderr
+                    line = ''
+                    while True:
+                        char = proc.stderr.read(1)
+                        if not char:
+                            proc_stderr_closed = True
+                            break
+                        if char in [b'\r', b'\n']:
+                            break
+                        line += char.decode('ascii', 'replace')
+                    if not line:
+                        # proc_stderr_closed is True
+                        continue
+                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
+                    if mobj:
+                        downloaded_data_len = int(float(mobj.group(1)) * 1024)
+                        percent = float(mobj.group(2))
+                        if not resume_percent:
+                            resume_percent = percent
+                            resume_downloaded_data_len = downloaded_data_len
+                        time_now = time.time()
+                        eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent)
+                        speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len)
+                        data_len = None
+                        if percent > 0:
+                            data_len = int(downloaded_data_len * 100 / percent)
+                        self._hook_progress({
+                            'status': 'downloading',
+                            'downloaded_bytes': downloaded_data_len,
+                            'total_bytes_estimate': data_len,
+                            'tmpfilename': tmpfilename,
+                            'filename': filename,
+                            'eta': eta,
+                            'elapsed': time_now - start,
+                            'speed': speed,
+                        }, info_dict)
+                        cursor_in_new_line = False
+                    else:
+                        # no percent for live streams
+                        mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
+                        if mobj:
+                            downloaded_data_len = int(float(mobj.group(1)) * 1024)
+                            time_now = time.time()
+                            speed = self.calc_speed(start, time_now, downloaded_data_len)
+                            self._hook_progress({
+                                'downloaded_bytes': downloaded_data_len,
+                                'tmpfilename': tmpfilename,
+                                'filename': filename,
+                                'status': 'downloading',
+                                'elapsed': time_now - start,
+                                'speed': speed,
+                            }, info_dict)
+                            cursor_in_new_line = False
+                        elif self.params.get('verbose', False):
+                            if not cursor_in_new_line:
+                                self.to_screen('')
+                            cursor_in_new_line = True
+                            self.to_screen('[rtmpdump] ' + line)
+                if not cursor_in_new_line:
+                    self.to_screen('')
+                return proc.wait()
+            except BaseException:  # Including KeyboardInterrupt
+                proc.kill(timeout=None)
+                raise
+
+        url = info_dict['url']
+        player_url = info_dict.get('player_url')
+        page_url = info_dict.get('page_url')
+        app = info_dict.get('app')
+        play_path = info_dict.get('play_path')
+        tc_url = info_dict.get('tc_url')
+        flash_version = info_dict.get('flash_version')
+        live = info_dict.get('rtmp_live', False)
+        conn = info_dict.get('rtmp_conn')
+        protocol = info_dict.get('rtmp_protocol')
+        real_time = info_dict.get('rtmp_real_time', False)
+        no_resume = info_dict.get('no_resume', False)
+        continue_dl = self.params.get('continuedl', True)
+
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+        test = self.params.get('test', False)
+
+        # Check for rtmpdump first
+        if not check_executable('rtmpdump', ['-h']):
+            self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it')
+            return False
+
+        # Download using rtmpdump. rtmpdump returns exit code 2 when
+        # the connection was interrupted and resuming appears to be
+        # possible. This is part of rtmpdump's normal usage, AFAIK.
+        basic_args = [
+            'rtmpdump', '--verbose', '-r', url,
+            '-o', tmpfilename]
+        if player_url is not None:
+            basic_args += ['--swfVfy', player_url]
+        if page_url is not None:
+            basic_args += ['--pageUrl', page_url]
+        if app is not None:
+            basic_args += ['--app', app]
+        if play_path is not None:
+            basic_args += ['--playpath', play_path]
+        if tc_url is not None:
+            basic_args += ['--tcUrl', tc_url]
+        if test:
+            basic_args += ['--stop', '1']
+        if flash_version is not None:
+            basic_args += ['--flashVer', flash_version]
+        if live:
+            basic_args += ['--live']
+        if isinstance(conn, list):
+            for entry in conn:
+                basic_args += ['--conn', entry]
+        elif isinstance(conn, str):
+            basic_args += ['--conn', conn]
+        if protocol is not None:
+            basic_args += ['--protocol', protocol]
+        if real_time:
+            basic_args += ['--realtime']
+
+        args = basic_args
+        if not no_resume and continue_dl and not live:
+            args += ['--resume']
+        if not live and continue_dl:
+            args += ['--skip', '1']
+
+        args = [encodeArgument(a) for a in args]
+
+        self._debug_cmd(args, exe='rtmpdump')
+
+        RD_SUCCESS = 0
+        RD_FAILED = 1
+        RD_INCOMPLETE = 2
+        RD_NO_CONNECT = 3
+
+        started = time.time()
+
+        try:
+            retval = run_rtmpdump(args)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            retval = RD_SUCCESS
+            self.to_screen('\n[rtmpdump] Interrupted by user')
+
+        if retval == RD_NO_CONNECT:
+            self.report_error('[rtmpdump] Could not connect to RTMP server.')
+            return False
+
+        while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live:
+            prevsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize)
+            time.sleep(5.0)  # This seems to be needed
+            args = basic_args + ['--resume']
+            if retval == RD_FAILED:
+                args += ['--skip', '1']
+            args = [encodeArgument(a) for a in args]
+            retval = run_rtmpdump(args)
+            cursize = os.path.getsize(encodeFilename(tmpfilename))
+            if prevsize == cursize and retval == RD_FAILED:
+                break
+            # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
+            if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
+                self.to_screen('[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') + retval = RD_SUCCESS + break + if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + }, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('rtmpdump exited with code %d' % retval) + return False diff --git a/yt_dlp/downloader/rtsp.py b/yt_dlp/downloader/rtsp.py new file mode 100644 index 0000000..e89269f --- /dev/null +++ b/yt_dlp/downloader/rtsp.py @@ -0,0 +1,42 @@ +import os +import subprocess + +from .common import FileDownloader +from ..utils import check_executable, encodeFilename + + +class RtspFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + if check_executable('mplayer', ['-h']): + args = [ + 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', + '-dumpstream', '-dumpfile', tmpfilename, url] + elif check_executable('mpv', ['-h']): + args = [ + 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] + else: + self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install one') + return False + + self._debug_cmd(args) + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen(f'\r[{args[0]}] {fsize} bytes') + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }, info_dict) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % (args[0], retval)) + return False diff --git a/yt_dlp/downloader/websocket.py b/yt_dlp/downloader/websocket.py new file mode 100644 index 0000000..6837ff1 --- /dev/null +++ b/yt_dlp/downloader/websocket.py @@ -0,0 +1,53 @@ +import asyncio +import contextlib +import os +import signal +import threading + +from .common import FileDownloader +from .external import FFmpegFD +from ..dependencies import websockets + + +class FFmpegSinkFD(FileDownloader): + """ A sink to ffmpeg for downloading fragments in any form """ + + def real_download(self, filename, info_dict): + info_copy = info_dict.copy() + info_copy['url'] = '-' + + async def call_conn(proc, stdin): + try: + await self.real_connection(stdin, info_dict) + except OSError: + pass + finally: + with contextlib.suppress(OSError): + stdin.flush() + stdin.close() + os.kill(os.getpid(), signal.SIGINT) + + class FFmpegStdinFD(FFmpegFD): + @classmethod + def get_basename(cls): + return FFmpegFD.get_basename() + + def on_process_started(self, proc, stdin): + thread = threading.Thread(target=asyncio.run, daemon=True, args=(call_conn(proc, stdin), )) + thread.start() + + return FFmpegStdinFD(self.ydl, self.params or {}).download(filename, info_copy) + + async def real_connection(self, sink, info_dict): + """ Override this in subclasses """ + raise NotImplementedError('This method must be implemented by subclasses') + + +class WebSocketFragmentFD(FFmpegSinkFD): + async def real_connection(self, sink, info_dict): + async with websockets.connect(info_dict['url'], extra_headers=info_dict.get('http_headers', {})) as ws: + while 
True: + recv = await ws.recv() + if isinstance(recv, str): + recv = recv.encode('utf8') + sink.write(recv) diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py new file mode 100644 index 0000000..c7a8637 --- /dev/null +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -0,0 +1,228 @@ +import json +import time + +from .fragment import FragmentFD +from ..networking.exceptions import HTTPError +from ..utils import ( + RegexNotFoundError, + RetryManager, + dict_get, + int_or_none, + try_get, +) +from ..utils.networking import HTTPHeaderDict + + +class YoutubeLiveChatFD(FragmentFD): + """ Downloads YouTube live chats fragment by fragment """ + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + if not self.params.get('skip_download') and info_dict['protocol'] == 'youtube_live_chat': + self.report_warning('Live chat download runs until the livestream ends. ' + 'If you wish to download the video simultaneously, run a separate yt-dlp instance') + + test = self.params.get('test', False) + + ctx = { + 'filename': filename, + 'live': True, + 'total_frags': None, + } + + from ..extractor.youtube import YoutubeBaseInfoExtractor + + ie = YoutubeBaseInfoExtractor(self.ydl) + + start_time = int(time.time() * 1000) + + def dl_fragment(url, data=None, headers=None): + http_headers = HTTPHeaderDict(info_dict.get('http_headers'), headers) + return self._download_fragment(ctx, url, info_dict, http_headers, data) + + def parse_actions_replay(live_chat_continuation): + offset = continuation_id = click_tracking_params = None + processed_fragment = bytearray() + for action in live_chat_continuation.get('actions', []): + if 'replayChatItemAction' in action: + replay_chat_item_action = action['replayChatItemAction'] + offset = int(replay_chat_item_action['videoOffsetTimeMsec']) + processed_fragment.extend( + json.dumps(action, ensure_ascii=False).encode() + b'\n') + if offset is not None: + continuation = try_get( + live_chat_continuation, + lambda x: x['continuations'][0]['liveChatReplayContinuationData'], dict) + if continuation: + continuation_id = continuation.get('continuation') + click_tracking_params = continuation.get('clickTrackingParams') + self._append_fragment(ctx, processed_fragment) + return continuation_id, offset, click_tracking_params + + def try_refresh_replay_beginning(live_chat_continuation): + # choose the second option that contains the unfiltered live chat replay + refresh_continuation = try_get( + live_chat_continuation, + lambda x: x['header']['liveChatHeaderRenderer']['viewSelector']['sortFilterSubMenuRenderer']['subMenuItems'][1]['continuation']['reloadContinuationData'], dict) + if refresh_continuation: + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + refresh_continuation_id = refresh_continuation.get('continuation') + offset = 0 + click_tracking_params = refresh_continuation.get('trackingParams') + return refresh_continuation_id, offset, click_tracking_params + return parse_actions_replay(live_chat_continuation) + + live_offset = 0 + + def parse_actions_live(live_chat_continuation): + nonlocal live_offset + continuation_id = click_tracking_params = None + processed_fragment = bytearray() + for action in live_chat_continuation.get('actions', []): + timestamp = self.parse_live_timestamp(action) + if timestamp is not None: + live_offset = timestamp - start_time + # compatibility with replay format + pseudo_action = { + 
'replayChatItemAction': {'actions': [action]}, + 'videoOffsetTimeMsec': str(live_offset), + 'isLive': True, + } + processed_fragment.extend( + json.dumps(pseudo_action, ensure_ascii=False).encode() + b'\n') + continuation_data_getters = [ + lambda x: x['continuations'][0]['invalidationContinuationData'], + lambda x: x['continuations'][0]['timedContinuationData'], + ] + continuation_data = try_get(live_chat_continuation, continuation_data_getters, dict) + if continuation_data: + continuation_id = continuation_data.get('continuation') + click_tracking_params = continuation_data.get('clickTrackingParams') + timeout_ms = int_or_none(continuation_data.get('timeoutMs')) + if timeout_ms is not None: + time.sleep(timeout_ms / 1000) + self._append_fragment(ctx, processed_fragment) + return continuation_id, live_offset, click_tracking_params + + def download_and_parse_fragment(url, frag_index, request_data=None, headers=None): + for retry in RetryManager(self.params.get('fragment_retries'), self.report_retry, frag_index=frag_index): + try: + success = dl_fragment(url, request_data, headers) + if not success: + return False, None, None, None + raw_fragment = self._read_fragment(ctx) + try: + data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) + except RegexNotFoundError: + data = None + if not data: + data = json.loads(raw_fragment) + live_chat_continuation = try_get( + data, + lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} + + func = (info_dict['protocol'] == 'youtube_live_chat' and parse_actions_live + or frag_index == 1 and try_refresh_replay_beginning + or parse_actions_replay) + return (True, *func(live_chat_continuation)) + except HTTPError as err: + retry.error = err + continue + return False, None, None, None + + self._prepare_and_start_frag_download(ctx, info_dict) + + success = dl_fragment(info_dict['url']) + if not success: + return False + raw_fragment = self._read_fragment(ctx) + try: + data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) + except RegexNotFoundError: + return False + continuation_id = try_get( + data, + lambda x: x['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']) + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + + ytcfg = ie.extract_ytcfg(video_id, raw_fragment.decode('utf-8', 'replace')) + + if not ytcfg: + return False + api_key = try_get(ytcfg, lambda x: x['INNERTUBE_API_KEY']) + innertube_context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT']) + if not api_key or not innertube_context: + return False + visitor_data = try_get(innertube_context, lambda x: x['client']['visitorData'], str) + if info_dict['protocol'] == 'youtube_live_chat_replay': + url = 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat_replay?key=' + api_key + chat_page_url = 'https://www.youtube.com/live_chat_replay?continuation=' + continuation_id + elif info_dict['protocol'] == 'youtube_live_chat': + url = 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat?key=' + api_key + chat_page_url = 'https://www.youtube.com/live_chat?continuation=' + continuation_id + + frag_index = offset = 0 + click_tracking_params = None + while continuation_id is not None: + frag_index += 1 + request_data = { + 'context': innertube_context, + 'continuation': continuation_id, + } + if frag_index > 1: + request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 
0))} + if click_tracking_params: + request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} + headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) + headers.update({'content-type': 'application/json'}) + fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode() + b'\n' + success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( + url, frag_index, fragment_request_data, headers) + else: + success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( + chat_page_url, frag_index) + if not success: + return False + if test: + break + + return self._finish_frag_download(ctx, info_dict) + + @staticmethod + def parse_live_timestamp(action): + action_content = dict_get( + action, + ['addChatItemAction', 'addLiveChatTickerItemAction', 'addBannerToLiveChatCommand']) + if not isinstance(action_content, dict): + return None + item = dict_get(action_content, ['item', 'bannerRenderer']) + if not isinstance(item, dict): + return None + renderer = dict_get(item, [ + # text + 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer', + 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer', + # ticker + 'liveChatTickerPaidMessageItemRenderer', + 'liveChatTickerSponsorItemRenderer', + # banner + 'liveChatBannerRenderer', + ]) + if not isinstance(renderer, dict): + return None + parent_item_getters = [ + lambda x: x['showItemEndpoint']['showLiveChatItemEndpoint']['renderer'], + lambda x: x['contents'], + ] + parent_item = try_get(renderer, parent_item_getters, dict) + if parent_item: + renderer = dict_get(parent_item, [ + 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer', + 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer', + ]) + if not isinstance(renderer, dict): + return None + return int_or_none(renderer.get('timestampUsec'), 1000) diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py new file mode 100644 index 0000000..6bfa4bd --- /dev/null +++ b/yt_dlp/extractor/__init__.py @@ -0,0 +1,42 @@ +from ..compat.compat_utils import passthrough_module + +passthrough_module(__name__, '.extractors') +del passthrough_module + + +def gen_extractor_classes(): + """ Return a list of supported extractors. + The order does matter; the first extractor matched is the one handling the URL. + """ + from .extractors import _ALL_CLASSES + + return _ALL_CLASSES + + +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. + """ + return [klass() for klass in gen_extractor_classes()] + + +def list_extractor_classes(age_limit=None): + """Return a list of extractors that are suitable for the given age, sorted by extractor name""" + from .generic import GenericIE + + yield from sorted(filter( + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, + gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) + yield GenericIE + + +def list_extractors(age_limit=None): + """Return a list of extractor instances that are suitable for the given age, sorted by extractor name""" + return [ie() for ie in list_extractor_classes(age_limit)] + + +def get_info_extractor(ie_name): + """Returns the info extractor class with the given ie_name""" + from . 
import extractors + + return getattr(extractors, f'{ie_name}IE') diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py new file mode 100644 index 0000000..c753655 --- /dev/null +++ b/yt_dlp/extractor/_extractors.py @@ -0,0 +1,2493 @@ +# flake8: noqa: F401 + +from .youtube import ( # Youtube is moved to the top to improve performance + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, + YoutubeShortsAudioPivotIE, + YoutubeConsentRedirectIE, +) + +from .abc import ( + ABCIE, + ABCIViewIE, + ABCIViewShowSeriesIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .acfun import AcFunVideoIE, AcFunBangumiIE +from .adn import ADNIE, ADNSeasonIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .aeonco import AeonCoIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVLiveIE, + AfreecaTVUserIE, +) +from .agora import ( + TokFMAuditionIE, + TokFMPodcastIE, + WyborczaPodcastIE, + WyborczaVideoIE, +) +from .airtv import AirTVIE +from .aitube import AitubeKZVideoIE +from .aljazeera import AlJazeeraIE +from .allstar import ( + AllstarIE, + AllstarProfileIE, +) +from .alphaporno import AlphaPornoIE +from .altcensored import ( + AltCensoredIE, + AltCensoredChannelIE, +) +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amadeustv import AmadeusTVIE +from .amara import AmaraIE +from .amcnetworks import AMCNetworksIE +from .amazon import ( + AmazonStoreIE, + AmazonReviewsIE, +) +from .amazonminitv import ( + AmazonMiniTVIE, + AmazonMiniTVSeasonIE, + AmazonMiniTVSeriesIE, +) +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .anchorfm import AnchorFMEpisodeIE +from .angel import AngelIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from .archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDMediathekCollectionIE, + ARDIE, +) +from .art19 import ( + Art19IE, + Art19ShowIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, + ArteTVCategoryIE, +) +from .arnes import ArnesIE +from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE +from .atresplayer import AtresPlayerIE +from .atscaleconf import AtScaleConfEventIE +from .atvat import 
ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiodraft import ( + AudiodraftCustomIE, + AudiodraftGenericIE, +) +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .axs import AxsIE +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampUserIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatbump import ( + BeatBumpVideoIE, + BeatBumpPlaylistIE, +) +from .beatport import BeatportIE +from .berufetv import BerufeTVIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import ( + BibelTVLiveIE, + BibelTVSeriesIE, + BibelTVVideoIE, +) +from .bigflix import BigflixIE +from .bigo import BigoIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, + BiliBiliBangumiSeasonIE, + BiliBiliBangumiMediaIE, + BilibiliCheeseIE, + BilibiliCheeseSeasonIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliSpaceVideoIE, + BilibiliSpaceAudioIE, + BilibiliCollectionListIE, + BilibiliSeriesListIE, + BilibiliFavoritesListIE, + BilibiliWatchlaterIE, + BilibiliPlaylistIE, + BiliIntlIE, + BiliIntlSeriesIE, + BiliLiveIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blerp import BlerpIE +from .blogger import BloggerIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .boosty import BoostyIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .boxcast import BoxCastVideoIE +from .bpb import BpbIE +from .br import BRIE +from .bravotv import BravoTVIE +from .brainpop import ( + BrainPOPIE, + BrainPOPJrIE, + BrainPOPELLIE, + BrainPOPEspIE, + BrainPOPFrIE, + BrainPOPIlIE, +) +from .breitbart import BreitBartIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .brilliantpala import ( + BrilliantpalaElearnIE, + BrilliantpalaClassesIE, +) +from .businessinsider import BusinessInsiderIE +from .bundesliga import BundesligaIE +from .bundestag import BundestagIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .camfm import ( + CamFMEpisodeIE, + CamFMShowIE +) +from .cammodels import CamModelsIE +from .camsoda import CamsodaIE +from .camtasia import CamtasiaEmbedIE +from .canal1 import Canal1IE +from .canalalpha import CanalAlphaIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .caracoltv import CaracolTvPlayIE +from .cartoonnetwork 
import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCPlayerPlaylistIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import ( + CBSIE, + ParamountPressExpressIE, +) +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSLocalIE, + CBSLocalArticleIE, + CBSLocalLiveIE, + CBSNewsLiveIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .cellebrite import CellebriteIE +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE +from .charlierose import CharlieRoseIE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chzzk import ( + CHZZKLiveIE, + CHZZKVideoIE, +) +from .cinemax import CinemaxIE +from .cinetecamilano import CinetecaMilanoIE +from .cineverse import ( + CineverseIE, + CineverseDetailsIE, +) +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .clipchamp import ClipchampIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudycdn import CloudyCDNIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, + CNNIndonesiaIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .craftsy import CraftsyIE +from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) +from .crtvg import CrtvgIE +from .crunchyroll import ( + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, + CrunchyrollMusicIE, + CrunchyrollArtistIE, +) +from .cspan import CSpanIE, CSpanCongressIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, +) +from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .dacast import ( + DacastVODIE, + DacastPlaylistIE, +) +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionSearchIE, + DailymotionUserIE, +) +from .dailywire import ( + DailyWireIE, + DailyWirePodcastIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .daystar import DaystarClipIE +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .detik import DetikEmbedIE +from .dlf import ( + DLFIE, + DLFCorpusIE, +) +from .dfb import DFBIE +from .dhm import DHMIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + 
DiscoveryPlusIE, + HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, + ScienceChannelIE, + DIYNetworkIE, + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + MotorTrendIE, + MotorTrendOnDemandIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, + GlobalCyclingNetworkPlusIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, + DRTVSeasonIE, + DRTVSeriesIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .deuxm import ( + DeuxMIE, + DeuxMNewsIE +) +from .digitalconcerthall import DigitalConcertHallIE +from .discogs import DiscogsReleasePlaylistIE +from .discovery import DiscoveryIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) +from .duoplay import DuoplayIE +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE +from .ebaumsworld import EbaumsWorldIE +from .ebay import EbayIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .elementorembed import ElementorEmbedIE +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .eltrecetv import ElTreceTVIE +from .embedly import EmbedlyIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .epidemicsound import EpidemicSoundIE +from .eplus import EplusIbIE +from .epoch import EpochIE +from .eporner import EpornerIE +from .erocast import ErocastIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .err import ERRJupiterIE +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) +from .espn import ( + ESPNIE, + WatchESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, + ESPNCricInfoIE, +) +from .ettutv import EttuTvIE +from .europa import EuropaIE, EuroParlWebstreamIE +from .europeantour import EuropeanTourIE +from .eurosport import EurosportIE +from .euscreen import EUScreenIE +from .expressen import ExpressenIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, + FacebookRedirectURLIE, + FacebookReelIE, + FacebookAdsIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, + FC2LiveIE, +) +from .fczenit import FczenitIE +from .fifa import FifaIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivetv import FiveTVIE +from .flextv import FlexTVIE +from .flickr import FlickrIE +from .floatplane import ( + FloatplaneIE, + FloatplaneChannelIE, +) +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fox import FOXIE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsVideoIE, +) +from .foxsports import FoxSportsIE +from .fptplay import FptplayIE +from .franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + 
FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .freetv import ( + FreeTvIE, + FreeTvMoviesIE, +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .funker530 import Funker530IE +from .fuyintv import FuyinTVIE +from .gab import ( + GabTVIE, + GabIE, +) +from .gaia import GaiaIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .genius import ( + GeniusIE, + GeniusLyricsIE, +) +from .getcourseru import ( + GetCourseRuPlayerIE, + GetCourseRuIE +) +from .gettr import ( + GettrIE, + GettrStreamingIE, +) +from .giantbomb import GiantBombIE +from .glide import GlideIE +from .globalplayer import ( + GlobalPlayerLiveIE, + GlobalPlayerLivePlaylistIE, + GlobalPlayerAudioIE, + GlobalPlayerAudioEpisodeIE, + GlobalPlayerVideoIE +) +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .gmanetwork import GMANetworkVideoIE +from .go import GoIE +from .godtube import GodTubeIE +from .gofile import GofileIE +from .golem import GolemIE +from .goodgame import GoodGameIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goplay import GoPlayIE +from .goshgay import GoshgayIE +from .gotostage import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import ( + GronkhIE, + GronkhFeedIE, + GronkhVodsIE +) +from .groupon import GrouponIE +from .harpodeon import HarpodeonIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitrecord import HitRecordIE +from .hollywoodreporter import ( + HollywoodReporterIE, + HollywoodReporterPlaylistIE, +) +from .holodex import HolodexIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPrefixIE, + HotStarPlaylistIE, + HotStarSeasonIE, + HotStarSeriesIE, +) +from .hrefli import HrefLiRedirectIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .hse import ( + HSEShowIE, + HSEProductIE, +) +from .genericembeds import ( + HTML5MediaEmbedIE, + QuotedHTMLIE, +) +from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .hypergryph import MonsterSirenHypergryphMusicIE +from .hytale import HytaleIE +from .icareus import IcareusIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .idolplus import IdolPlusIE +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .ilpost import IlPostIE +from .iltalehti import IltalehtiIE +from 
.imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramIOSIE, + InstagramUserIE, + InstagramTagIE, + InstagramStoryIE, +) +from .internazionale import InternazionaleIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) +from .islamchannel import ( + IslamChannelIE, + IslamChannelSeriesIE, +) +from .israelnationalnews import IsraelNationalNewsIE +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import ( + IwaraIE, + IwaraPlaylistIE, + IwaraUserIE, +) +from .ixigua import IxiguaIE +from .izlesene import IzleseneIE +from .jable import ( + JableIE, + JablePlaylistIE, +) +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .japandiet import ( + ShugiinItvLiveIE, + ShugiinItvLiveRoomIE, + ShugiinItvVodIE, + SangiinInstructionIE, + SangiinIE, +) +from .jeuxvideo import JeuxVideoIE +from .jiosaavn import ( + JioSaavnSongIE, + JioSaavnAlbumIE, +) +from .jove import JoveIE +from .joj import JojIE +from .joqrag import JoqrAgIE +from .jstream import JStreamIE +from .jtbc import ( + JTBCIE, + JTBCProgramIE, +) +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .kankanews import KankaNewsIE +from .karaoketv import KaraoketvIE +from .kelbyone import KelbyOneIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kick import ( + KickIE, + KickVODIE, +) +from .kicker import KickerIE +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE +from .kompas import KompasVideoIE +from .koo import KooIE +from .kth import KTHIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kukululive import KukuluLiveIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + LastFMUserIE, +) +from .laxarxames import LaXarxaMesIE +from .lbry import ( + LBRYIE, + LBRYChannelIE, + LBRYPlaylistIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lefigaro import ( + LeFigaroVideoEmbedIE, + LeFigaroVideoSectionIE, +) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .likee import ( + LikeeIE, + LikeeUserIE +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .linkedin import ( + LinkedInIE, + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .liputan6 import Liputan6IE +from .listennotes import ListenNotesIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + 
LivestreamShortenerIE, +) +from .livestreamfails import LivestreamfailsIE +from .lnkgo import ( + LnkGoIE, + LnkIE, +) +from .lovehomeporn import LoveHomePornIE +from .lrt import ( + LRTVODIE, + LRTStreamIE +) +from .lsm import ( + LSMLREmbedIE, + LSMLTVEmbedIE, + LSMReplayIE +) +from .lumni import ( + LumniIE +) +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .maariv import MaarivIE +from .magellantv import MagellanTVIE +from .magentamusik import MagentaMusikIE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .mainstreaming import MainStreamingIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .masters import MastersIE +from .matchtv import MatchTVIE +from .mbn import MBNIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .mediastream import ( + MediaStreamIE, + WinSportsVideoIE, +) +from .mediaworksnz import MediaWorksNZVODIE +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .metacritic import MetacriticIE +from .mgtv import MGTVIE +from .microsoftstream import MicrosoftStreamIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .microsoftembed import MicrosoftEmbedIE +from .mildom import ( + MildomIE, + MildomVodIE, + MildomClipIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .minoto import MinotoIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mirrorcouk import MirrorCoUKIE +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, + MLBTVIE, + MLBArticleIE, +) +from .mlssoccer import MLSSoccerIE +from .mocha import MochaVideoIE +from .mojvideo import MojvideoIE +from .monstercat import MonstercatIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE, + MotherlessGalleryIE, + MotherlessUploaderIE, +) +from .motorsport import MotorsportIE +from .moviepilot import MoviepilotIE +from .moview import MoviewPlayIE +from .moviezine import MoviezineIE +from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE +from .museai import MuseAIIE +from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) +from .mx3 import ( + Mx3IE, + Mx3NeoIE, + Mx3VolksmusikIE, +) +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .mzaalo import MzaaloIE +from .n1 import ( + 
N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) +from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, + NaverNowIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, + NBCStationsIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import ( + NebulaIE, + NebulaClassIE, + NebulaSubscriptionsIE, + NebulaChannelIE, +) +from .nekohacker import NekoHackerIE +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .netverse import ( + NetverseIE, + NetversePlaylistIE, + NetverseSearchIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newspicks import NewsPicksIE +from .newsy import NewsyIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfb import ( + NFBIE, + NFBSeriesIE, +) +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, + NFLPlusEpisodeIE, + NFLPlusReplayIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, + NhkRadioNewsPageIE, + NhkRadiruIE, + NhkRadiruLiveIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickRuIE, +) +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, + NicovideoTagURLIE, + NiconicoLiveIE, +) +from .ninaprotocol import NinaProtocolIE +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) +from .niconicochannelplus import ( + NiconicoChannelPlusIE, + NiconicoChannelPlusChannelVideosIE, + NiconicoChannelPlusChannelLivesIE, +) +from .ninegag import NineGagIE +from .ninenews import NineNewsIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .nobelprize import NobelPrizeIE +from .noice import NoicePodcastIE +from .nonktube import NonkTubeIE +from .noodlemagazine import NoodleMagazineIE +from .noovo import NoovoIE +from .nosnl import NOSNLArticleIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nubilesporn import NubilesPornIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, + NYTimesCookingRecipeIE, +) +from 
.nuum import ( + NuumLiveIE, + NuumTabIE, + NuumMediaIE, +) +from .nuvid import NuvidIE +from .nzherald import NZHeraldIE +from .nzonscreen import NZOnScreenIE +from .nzz import NZZIE +from .odkmedia import OnDemandChinaEpisodeIE +from .odnoklassniki import OdnoklassnikiIE +from .oftv import ( + OfTVIE, + OfTVPlaylistIE +) +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import ( + OnDemandKoreaIE, + OnDemandKoreaProgramIE, +) +from .onefootball import OneFootballIE +from .onenewsnz import OneNewsNZIE +from .oneplace import OnePlacePodcastIE +from .onet import ( + OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, + OpenRecMovieIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4StoryIE, + ORFONIE, + ORFRadioIE, + ORFPodcastIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .owncloud import OwnCloudIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parler import ParlerIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonCampaignIE +) +from .pbs import PBSIE, PBSKidsIE +from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .pgatour import PGATourIE +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .piapro import PiaproIE +from .piaulizaportal import PIAULIZAPortalIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) +from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playplustv import PlayPlusTVIE +from .playsuisse import PlaySuisseIE +from .playtvak import PlaytvakIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podbayfm import PodbayFMIE, PodbayFMChannelIE +from .podchaser import PodchaserIE +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioLegacyIE, + PolskieRadioAuditionIE, + PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .pornbox import PornboxIE +from .pornflip import PornFlipIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) 
+from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .pr0gramm import Pr0grammIE +from .prankcast import PrankCastIE, PrankCastPostIE +from .premiershiprugby import PremiershipRugbyIE +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) +from .puls4 import Puls4IE +from .pyvideo import PyvideoIE +from .qdance import QDanceIE +from .qingting import QingTingIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiocomercial import ( + RadioComercialIE, + RadioComercialPlaylistIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiofrance import ( + FranceCultureIE, + RadioFranceIE, + RadioFranceLiveIE, + RadioFrancePodcastIE, + RadioFranceProfileIE, + RadioFranceProgramScheduleIE, +) +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiIE, + RaiCulturaIE, + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, + RaiNewsIE, + RaiSudtirolIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, + RbgTumNewCourseIE, +) +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbee import ParliamentLiveUKIE, RTBFIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import RedditIE +from .redge import RedCDNLivxIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, +) +from .redtube import RedTubeIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rheinmaintv import RheinMainTVIE +from .ridehome import RideHomeIE +from .rinsefm import ( + RinseFMIE, + RinseFMArtistPlaylistIE, +) +from .rmcdecouverte import RMCDecouverteIE +from .rockstargames import RockstarGamesIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, + RokfinSearchIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE +from .rottentomatoes import RottenTomatoesIE +from .rozhlas import ( + RozhlasIE, + RozhlasVltavaIE, + MujRozhlasIE, +) +from .rte import RteIE, RteRadioIE +from .rtlnl import ( + RtlNlIE, + RTLLuTeleVODIE, + RTLLuArticleIE, + RTLLuLiveIE, + RTLLuRadioIE, +) +from .rtl2 import RTL2IE +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) +from .rtp import RTPIE +from .rtrfm import RTRFMIE +from .rts import RTSIE +from .rtvcplay import ( + RTVCPlayIE, + RTVCPlayEmbedIE, + RTVCKalturaIE, +) +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) +from .rtvs import RTVSIE +from .rtvslo import RTVSLOIE +from .rule34video import 
Rule34VideoIE +from .rumble import ( + RumbleEmbedIE, + RumbleIE, + RumbleChannelIE, +) +from .rudovideo import RudoVideoIE +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, + RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .antenna import ( + AntennaGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) +from .s4c import ( + S4CIE, + S4CSeriesIE +) +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .saitosan import SaitosanIE +from .samplefocus import SampleFocusIE +from .sapo import SapoIE +from .sbs import SBSIE +from .sbscokr import ( + SBSCoKrIE, + SBSCoKrAllvodProgramIE, + SBSCoKrProgramsVodIE, +) +from .screen9 import Screen9IE +from .screencast import ScreencastIE +from .screencastify import ScreencastifyIE +from .screencastomatic import ScreencastOMaticIE +from .scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) +from .scte import ( + SCTEIE, + SCTECourseIE, +) +from .scrolller import ScrolllerIE +from .sejmpl import SejmIE +from .senalcolombia import SenalColombiaLiveIE +from .senategov import SenateISVPIE, SenateGovIE +from .sendtonews import SendtoNewsIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE +from .sexu import SexuIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) +from .shahid import ( + ShahidIE, + ShahidShowIE, +) +from .sharevideos import ShareVideosEmbedIE +from .sibnet import SibnetEmbedIE +from .shemaroome import ShemarooMeIE +from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) +from .sina import SinaIE +from .sixplay import SixPlayIE +from .skeb import SkebIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) +from .skylinewebcams import SkylineWebcamsIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .skynewsau import SkyNewsAUIE +from .sky import ( + SkyNewsIE, + SkyNewsStoryIE, + SkySportsIE, + SkySportsNewsIE, +) +from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE +from .slutload import SlutloadIE +from .smotrim import SmotrimIE +from .snotr import SnotrIE +from .sohu import ( + SohuIE, + SohuVIE, +) +from .sonyliv import ( + SonyLIVIE, + SonyLIVSeriesIE, +) +from .soundcloud import ( + SoundcloudEmbedIE, + SoundcloudIE, + SoundcloudSetIE, + SoundcloudRelatedIE, + SoundcloudUserIE, + SoundcloudUserPermalinkIE, + SoundcloudTrackStationIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE, +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkLatIE, + SouthParkNlIE +) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) +from .spiegel import SpiegelIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) +from .stageplus import StagePlusVODConcertIE +from .startrek import StarTrekIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) +from .sport5 import Sport5IE +from .sportbox import SportBoxIE +from .sportdeutschland import SportDeutschlandIE +from .spotify import 
( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) +from .springboardplatform import SpringboardPlatformIE +from .sprout import SproutIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stacommu import ( + StacommuLiveIE, + StacommuVODIE, + TheaterComplexTownVODIE, + TheaterComplexTownPPVIE, +) +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import ( + SteamIE, + SteamCommunityBroadcastIE, +) +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamcz import StreamCZIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE +from .stv import STVPlayerIE +from .substack import SubstackIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swearnet import SwearnetEpisodeIE +from .syvdk import SYVDKIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tass import TassIE +from .tbs import TBSIE +from .tbsjp import ( + TBSJPEpisodeIE, + TBSJPProgramIE, + TBSJPPlaylistIE, +) +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import ( + TeamcocoIE, + ConanClassicIE, +) +from .teamtreehouse import TeamTreeHouseIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecaribe import TelecaribePlayIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE +from .telemb import TeleMBIE +from .telemundo import TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tempo import TempoIE, IVXPlayerIE +from .tencent import ( + IflixEpisodeIE, + IflixSeriesIE, + VQQSeriesIE, + VQQVideoIE, + WeTvEpisodeIE, + WeTvSeriesIE, +) +from .tennistv import TennisTVIE +from .tenplay import ( + TenPlayIE, + TenPlaySeasonIE, +) +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theguardian import ( + TheGuardianPodcastIE, + TheGuardianPodcastPlaylistIE, +) +from .theholetv import TheHoleTvIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisoldhouse import ThisOldHouseIE +from .thisvid import ( + ThisVidIE, + ThisVidMemberIE, + ThisVidPlaylistIE, +) +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, + TikTokLiveIE, + DouyinIE, +) +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + 
ToggleIE, + MeWatchIE, +) +from .toggo import ( + ToggoIE, +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .triller import ( + TrillerIE, + TrillerUserIE, + TrillerShortIE, +) +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trtcocuk import TrtCocukVideoIE +from .trtworld import TrtWorldIE +from .trueid import TrueIDIE +from .trunews import TruNewsIE +from .truth import TruthIE +from .trutv import TruTVIE +from .tube8 import Tube8IE +from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInStationIE, + TuneInPodcastIE, + TuneInPodcastEpisodeIE, + TuneInShortenerIE, +) +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv24ua import ( + TV24UAVideoIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tviplayer import TVIPlayerIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnoe import TVNoeIE +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPStreamIE, + TVPVODSeriesIE, + TVPVODVideoIE, +) +from .tvplay import ( + TVPlayIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterSpacesIE, + TwitterShortenerIE, +) +from .txxx import ( + TxxxIE, + PornTopIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .drooble import DroobleIE +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .unsupported import KnownDRMIE, KnownPiracyIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veo import VeoIE +from .veoh import ( + VeohIE, + VeohUserIE +) +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( 
+ ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .viddler import ViddlerIE +from .videa import VideaIE +from .videocampus_sachsen import ( + VideocampusSachsenIE, + ViMPPlaylistIE, +) +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videoken import ( + VideoKenIE, + VideoKenPlayerIE, + VideoKenPlaylistIE, + VideoKenCategoryIE, + VideoKenTopicIE, +) +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .vidly import VidlyIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoProIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .viously import ViouslyIE +from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, + ViuOTTIndonesiaIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, + VKPlayIE, + VKPlayLiveIE, +) +from .vocaroo import VocarooIE +from .vodpl import VODPlIE +from .vodplatform import VODPlatformIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .volejtv import VolejTVIE +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import ( + VRTIE, + VrtNUIE, + KetnetIE, + DagelijkseKostIE, +) +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vuclip import VuClipIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wat import WatIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcamerapl import WebcameraplIE +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboVideoIE, + WeiboUserIE, +) +from .weiqitv import WeiqiTVIE +from .weverse import ( + WeverseIE, + WeverseMediaIE, + WeverseMomentIE, + WeverseLiveTabIE, + WeverseMediaTabIE, + WeverseLiveIE, +) +from .wevidi import WeVidiIE +from .weyyak import WeyyakIE +from .whyp import WhypIE +from .wikimedia import WikimediaIE +from .wimbledon import WimbledonIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, + WistiaChannelIE, +) +from .wordpress import ( + WordpressPlaylistEmbedIE, + WordpressMiniAudioPlayerEmbedIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) +from .wrestleuniverse import ( + WrestleUniverseVODIE, + WrestleUniversePPVIE, +) +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .wykop import ( + WykopDigIE, + WykopDigCommentIE, + WykopPostIE, + WykopPostCommentIE, +) +from .xanimu import XanimuIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xinpianchang import XinpianchangIE +from .xminus import XMinusIE +from .xnxx import XNXXIE 
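A similar hedged sketch, again illustrative rather than part of the patch, for the passthrough_module(__name__, '.extractors') call at the top of yt_dlp/extractor/__init__.py: attribute access on the package is forwarded to the flat extractors module, so a class imported in this list and the lookup performed by get_info_extractor() should resolve to the same object:

    # Sketch: passthrough_module() forwards yt_dlp.extractor.<name> to the flat
    # extractors module, and get_info_extractor() appends the 'IE' suffix.
    import yt_dlp.extractor as extractor
    from yt_dlp.extractor import get_info_extractor

    assert extractor.XNXXIE is get_info_extractor('XNXX')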
+from .xstream import XstreamIE +from .xvideos import ( + XVideosIE, + XVideosQuickiesIE +) +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + YandexVideoPreviewIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yappy import ( + YappyIE, + YappyProfileIE, +) +from .yle_areena import YleAreenaIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .zaiko import ( + ZaikoIE, + ZaikoETicketIE, +) +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + BBVTVLiveIE, + BBVTVRecordingsIE, + EinsUndEinsTVIE, + EinsUndEinsTVLiveIE, + EinsUndEinsTVRecordingsIE, + EWETVIE, + EWETVLiveIE, + EWETVRecordingsIE, + GlattvisionTVIE, + GlattvisionTVLiveIE, + GlattvisionTVRecordingsIE, + MNetTVIE, + MNetTVLiveIE, + MNetTVRecordingsIE, + NetPlusTVIE, + NetPlusTVLiveIE, + NetPlusTVRecordingsIE, + OsnatelTVIE, + OsnatelTVLiveIE, + OsnatelTVRecordingsIE, + QuantumTVIE, + QuantumTVLiveIE, + QuantumTVRecordingsIE, + SaltTVIE, + SaltTVLiveIE, + SaltTVRecordingsIE, + SAKTVIE, + SAKTVLiveIE, + SAKTVRecordingsIE, + VTXTVIE, + VTXTVLiveIE, + VTXTVRecordingsIE, + WalyTVIE, + WalyTVLiveIE, + WalyTVRecordingsIE, + ZattooIE, + ZattooLiveIE, + ZattooMoviesIE, + ZattooRecordingsIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zeenews import ZeeNewsIE +from .zenporn import ZenPornIE +from .zetland import ZetlandDKArticleIE +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, + ZingMp3ChartHomeIE, + ZingMp3WeekChartIE, + ZingMp3ChartMusicVideoIE, + ZingMp3UserIE, + ZingMp3HubIE, + ZingMp3LiveRadioIE, + ZingMp3PodcastEpisodeIE, + ZingMp3PodcastIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py new file mode 100644 index 0000000..b217422 --- /dev/null +++ b/yt_dlp/extractor/abc.py @@ -0,0 +1,421 @@ +import hashlib +import hmac +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + dict_get, + ExtractorError, + js_to_json, + int_or_none, + parse_iso8601, + str_or_none, + traverse_obj, + try_get, + unescapeHTML, + update_url_query, + url_or_none, +) + + +class ABCIE(InfoExtractor): + IE_NAME = 'abc.net.au' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn)/(?:[^/]+/){1,4}(?P<id>\d{5,})' + + _TESTS = [{ + 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', + 'md5': 'cb3dd03b18455a661071ee1e28344d9f', + 'info_dict': { + 'id': '5868334', + 'ext': 'mp4', + 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', + 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', + }, + 'skip': 'this video has expired', + }, { + 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', + 'md5': '4ebd61bdc82d9a8b722f64f1f4b4d121', + 'info_dict': { + 'id': 'NvqvPeNZsHU', + 'ext': 'mp4', + 'upload_date': '20150816', + 'uploader': 'ABC News
(Australia)', + 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef', + 'uploader_id': 'NewsOnABC', + 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', + }, + 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, + }, { + 'url': 'https://www.abc.net.au/btn/classroom/wwi-centenary/10527914', + 'info_dict': { + 'id': '10527914', + 'ext': 'mp4', + 'title': 'WWI Centenary', + 'description': 'md5:c2379ec0ca84072e86b446e536954546', + } + }, { + 'url': 'https://www.abc.net.au/news/programs/the-world/2020-06-10/black-lives-matter-protests-spawn-support-for/12342074', + 'info_dict': { + 'id': '12342074', + 'ext': 'mp4', + 'title': 'Black Lives Matter protests spawn support for Papuans in Indonesia', + 'description': 'md5:2961a17dc53abc558589ccd0fb8edd6f', + } + }, { + 'url': 'https://www.abc.net.au/btn/newsbreak/btn-newsbreak-20200814/12560476', + 'info_dict': { + 'id': 'tDL8Ld4dK_8', + 'ext': 'mp4', + 'title': 'Fortnite Banned From Apple and Google App Stores', + 'description': 'md5:a6df3f36ce8f816b74af4bd6462f5651', + 'upload_date': '20200813', + 'uploader': 'Behind the News', + 'uploader_id': 'behindthenews', + } + }, { + 'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540', + 'info_dict': { + 'id': '102520540', + 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus', + 'ext': 'mp4', + 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.', + 'thumbnail': 'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + mobj = re.search(r'<a\s+href="(?P<url>[^"]+)"\s+data-duration="\d+"\s+title="Download audio directly">', webpage) + if mobj: + urls_info = mobj.groupdict() + youtube = False + video = False + else: + mobj = re.search(r'<a href="(?P<url>http://www\.youtube\.com/watch\?v=[^"]+)"><span><strong>External Link:</strong>', + webpage) + if mobj is None: + mobj = re.search(r'<iframe width="100%" src="(?P<url>//www\.youtube-nocookie\.com/embed/[^?"]+)', webpage) + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)" id="UnityPlayer">' + + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root', default=None) + if xml_root is None: + # Probably need to authenticate + login_res = self._login(webpage_url, display_id) + if login_res is None: + self.report_warning('Could not login.') + else: + start_page = login_res + # Grab the url from the authenticated page + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root') + + xml_name = self._html_search_regex( + r'<iframe src=".*?\?xml(?:=|URL=xml/)(?P<xml_file>.+?\.xml).*?".*?</iframe>', + start_page, 'xml filename', default=None) + for iframe in re.findall(r'(?s)<iframe(.+?)</iframe>', webpage): + url = self._search_regex( + r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + + if not url: + url = self._og_search_url(webpage) + + mobj = re.match( + self._VALID_URL,
self._proto_relative_url(url.strip())) + + player_id = mobj.group('player_id') + if not display_id: + display_id = player_id + if player_id: + player_page = self._download_webpage( + url, display_id, note='Downloading player page', + errnote='Could not download player page') + video_id = self._search_regex( + r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', + default=None) +class PBSKidsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://pbskids.org/video/molly-of-denali/3030407927', + 'md5': '1ded20a017cc6b53446238f1804ce4c7', + 'info_dict': { + 'id': '3030407927', + 'title': 'Bird in the Hand/Bye-Bye Birdie', + 'channel': 'molly-of-denali', + 'duration': 1540, + 'ext': 'mp4', + 'series': 'Molly of Denali', + 'description': 'md5:d006b2211633685d8ebc8d03b6d5611e', + 'categories': ['Episode'], + 'upload_date': '20190718', + } + }, + { + 'url': 'https://pbskids.org/video/plum-landing/2365205059', + 'md5': '92e5d189851a64ae1d0237a965be71f5', + 'info_dict': { + 'id': '2365205059', + 'title': 'Cooper\'s Favorite Place in Nature', + 'channel': 'plum-landing', + 'duration': 67, + 'ext': 'mp4', + 'series': 'Plum Landing', + 'description': 'md5:657e5fc4356a84ead1c061eb280ff05d', + 'categories': ['Episode'], + 'upload_date': '20140302', + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(meta, { + 'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}), + 'channel': ('show_slug', {str}), + 'description': ('video_obj', 'description', {str}), + 'duration': ('video_obj', 'duration', {int_or_none}), + 'series': ('video_obj', 'program_title', {str}), + 'title': ('video_obj', 'title', {str}), + 'upload_date': ('video_obj', 'air_date', {unified_strdate}), + }) + } diff --git a/yt_dlp/extractor/pearvideo.py b/yt_dlp/extractor/pearvideo.py new file mode 100644 index 0000000..e27e5a7 --- /dev/null +++ b/yt_dlp/extractor/pearvideo.py @@ -0,0 +1,68 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + qualities, + unified_timestamp, + traverse_obj, +) + + +class PearVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)' + _TEST = { + 'url': 'http://www.pearvideo.com/video_1076290', + 'info_dict': { + 'id': '1076290', + 'ext': 'mp4', + 'title': '小浣熊在主人家玻璃上滚石头:没砸', + 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', + 'timestamp': 1494275280, + 'upload_date': '20170508', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + quality = qualities( + ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + + formats = [{ + 'url': mobj.group('url'), + 'format_id': mobj.group('id'), + 'quality': quality(mobj.group('id')), + } for mobj in re.finditer( + r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2', + webpage)] + if not formats: + info = self._download_json( + 'https://www.pearvideo.com/videoStatus.jsp', video_id=video_id, + query={'contId': video_id}, headers={'Referer': url}) + formats = [{ + 'format_id': k, + 'url': v.replace(info['systemTime'], f'cont-{video_id}') if k == 'srcUrl' else v + } for k, v in traverse_obj(info, ('videoInfo', 'videos'), default={}).items() if v] + + title = self._search_regex(
(r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)', + r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'), + webpage, 'title', group='value') + description = self._search_regex( + (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)', + r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'), + webpage, 'description', default=None, + group='value') or self._html_search_meta('Description', webpage) + timestamp = unified_timestamp(self._search_regex( + r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)', + webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py new file mode 100644 index 0000000..939c26d --- /dev/null +++ b/yt_dlp/extractor/peekvids.py @@ -0,0 +1,188 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) + + +class PeekVidsBaseIE(InfoExtractor): + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).group('domain', 'id') + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}', + video_id=video_id, expected=True) + + title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + f'https://www.{domain}/v-alt/{video_id}', video_id, + note='Downloading list of source files') + + formats = [] + for k, v in srcs.items(): + f_url = url_or_none(v) + if not f_url: + continue + + height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None) + if not height: + continue + + formats.append({ + 'url': f_url, + 'format_id': height, + 'height': int_or_none(height), + }) + + if not formats: + formats = [{'url': url} for url in srcs.values()] + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) + + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = (get_element_by_class('detail-video-block', webpage) + or get_element_by_class('detail-block', webpage) or '') + info['description'] = self._html_search_regex( + rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)', + detail, 'description', default=None) + + def cat_tags(name, html): + l = self._html_search_regex( + rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>', + html, name, default='') + return list(filter(None, re.split(r'\s+', l))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PeekVidsIE(PeekVidsBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?P<domain>peekvids\.com)/ + (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) + (?P<id>[^/?&#]*) + ''' + _TESTS = [{ + 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', + 'info_dict': { + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail':
r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1642579329, + 'upload_date': '20220119', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }] + + +class PlayVidsIE(PeekVidsBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?Pplayvids\.com)/(?:embed/|\w\w?/)?(?P[^/?#]*)' + _TESTS = [{ + 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'md5': '2f12e50213dd65f142175da633c4564c', + 'info_dict': { + 'id': '1978030', + 'display_id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1640435839, + 'upload_date': '20211225', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', + 'md5': 'e783986e596cafbf46411a174ab42ba6', + 'info_dict': { + 'id': '762385', + 'display_id': 'bKmGLe3IwjZ', + 'ext': 'mp4', + 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', + 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', + 'timestamp': 1516958544, + 'upload_date': '20180126', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 480, + 'uploader': 'Brazzers', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/v/47iUho33toY', + 'md5': 'b056b5049d34b648c1e86497cf4febce', + 'info_dict': { + 'id': '700621', + 'display_id': '47iUho33toY', + 'ext': 'mp4', + 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', + 'timestamp': 1507052209, + 'upload_date': '20171003', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 332, + 'uploader': 'Cacerenele', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', + 'md5': 'efa09be9f031314b7b7e3bc6510cd0df', + 'info_dict': { + 'id': '1523518', + 'display_id': 'z3_7iwWCmqt', + 'ext': 'mp4', + 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', + 'timestamp': 1607470323, + 'upload_date': '20201208', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 593, + 'uploader': 'yorours', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }] diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py new file mode 100644 index 0000000..730b239 --- /dev/null +++ b/yt_dlp/extractor/peertube.py @@ -0,0 +1,1647 @@ +import functools +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + format_field, + int_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, + OnDemandPagedList, +) + + +class PeerTubeIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + # Taken from https://instances.joinpeertube.org/instances + 0ch\.tv| + 3dctube\.3dcandy\.social| + all\.electric\.kitchen| + alterscope\.fr| + 
anarchy\.tube| + apathy\.tv| + apertatube\.net| + archive\.nocopyrightintended\.tv| + archive\.reclaim\.tv| + area51\.media| + astrotube-ufe\.obspm\.fr| + astrotube\.obspm\.fr| + audio\.freediverse\.com| + azxtube\.youssefc\.tn| + bark\.video| + battlepenguin\.video| + bava\.tv| + bee-tube\.fr| + beetoons\.tv| + biblion\.refchat\.net| + biblioteca\.theowlclub\.net| + bideoak\.argia\.eus| + bideoteka\.eus| + birdtu\.be| + bitcointv\.com| + bonn\.video| + breeze\.tube| + brioco\.live| + brocosoup\.fr| + canal\.facil\.services| + canard\.tube| + cdn01\.tilvids\.com| + celluloid-media\.huma-num\.fr| + chicago1\.peertube\.support| + cliptube\.org| + cloudtube\.ise\.fraunhofer\.de| + comf\.tube| + comics\.peertube\.biz| + commons\.tube| + communitymedia\.video| + conspiracydistillery\.com| + crank\.recoil\.org| + dalek\.zone| + dalliance\.network| + dangly\.parts| + darkvapor\.nohost\.me| + daschauher\.aksel\.rocks| + digitalcourage\.video| + displayeurope\.video| + ds106\.tv| + dud-video\.inf\.tu-dresden\.de| + dud175\.inf\.tu-dresden\.de| + dytube\.com| + ebildungslabor\.video| + evangelisch\.video| + fair\.tube| + fedi\.video| + fedimovie\.com| + fediverse\.tv| + film\.k-prod\.fr| + flipboard\.video| + foss\.video| + fossfarmers\.company| + fotogramas\.politicaconciencia\.org| + freediverse\.com| + freesoto-u2151\.vm\.elestio\.app| + freesoto\.tv| + garr\.tv| + greatview\.video| + grypstube\.uni-greifswald\.de| + habratube\.site| + ilbjach\.ru| + infothema\.net| + itvplus\.iiens\.net| + johnydeep\.net| + juggling\.digital| + jupiter\.tube| + kadras\.live| + kino\.kompot\.si| + kino\.schuerz\.at| + kinowolnosc\.pl| + kirche\.peertube-host\.de| + kiwi\.froggirl\.club| + kodcast\.com| + kolektiva\.media| + kpop\.22x22\.ru| + kumi\.tube| + la2\.peertube\.support| + la3\.peertube\.support| + la4\.peertube\.support| + lastbreach\.tv| + lawsplaining\.peertube\.biz| + leopard\.tube| + live\.codinglab\.ch| + live\.libratoi\.org| + live\.oldskool\.fi| + live\.solari\.com| + lucarne\.balsamine\.be| + luxtube\.lu| + makertube\.net| + media\.econoalchemist\.com| + media\.exo\.cat| + media\.fsfe\.org| + media\.gzevd\.de| + media\.interior\.edu\.uy| + media\.krashboyz\.org| + media\.mzhd\.de| + media\.smz-ma\.de| + media\.theplattform\.net| + media\.undeadnetwork\.de| + medias\.debrouillonet\.org| + medias\.pingbase\.net| + mediatube\.fermalo\.fr| + melsungen\.peertube-host\.de| + merci-la-police\.fr| + mindlyvideos\.com| + mirror\.peertube\.metalbanana\.net| + mirrored\.rocks| + mix\.video| + mountaintown\.video| + movies\.metricsmaster\.eu| + mtube\.mooo\.com| + mytube\.kn-cloud\.de| + mytube\.le5emeaxe\.fr| + mytube\.madzel\.de| + nadajemy\.com| + nanawel-peertube\.dyndns\.org| + neat\.tube| + nethack\.tv| + nicecrew\.tv| + nightshift\.minnix\.dev| + nolog\.media| + nyltube\.nylarea\.com| + ocfedtest\.hosted\.spacebear\.ee| + openmedia\.edunova\.it| + p2ptv\.ru| + p\.eertu\.be| + p\.lu| + pastafriday\.club| + patriottube\.sonsofliberty\.red| + pcbu\.nl| + peer\.azurs\.fr| + peer\.d0g4\.me| + peer\.lukeog\.com| + peer\.madiator\.cloud| + peer\.raise-uav\.com| + peershare\.togart\.de| + peertube-blablalinux\.be| + peertube-demo\.learning-hub\.fr| + peertube-docker\.cpy\.re| + peertube-eu\.howlround\.com| + peertube-u5014\.vm\.elestio\.app| + peertube-us\.howlround\.com| + peertube\.020\.pl| + peertube\.0x5e\.eu| + peertube\.1984\.cz| + peertube\.2i2l\.net| + peertube\.adjutor\.xyz| + peertube\.adresse\.data\.gouv\.fr| + peertube\.alpharius\.io| + peertube\.am-networks\.fr| + peertube\.anduin\.net| + 
peertube\.anti-logic\.com| + peertube\.arch-linux\.cz| + peertube\.art3mis\.de| + peertube\.artsrn\.ualberta\.ca| + peertube\.askan\.info| + peertube\.astral0pitek\.synology\.me| + peertube\.atsuchan\.page| + peertube\.automat\.click| + peertube\.b38\.rural-it\.org| + peertube\.be| + peertube\.beeldengeluid\.nl| + peertube\.bgzashtita\.es| + peertube\.bike| + peertube\.bildung-ekhn\.de| + peertube\.biz| + peertube\.br0\.fr| + peertube\.bridaahost\.ynh\.fr| + peertube\.bubbletea\.dev| + peertube\.bubuit\.net| + peertube\.cabaal\.net| + peertube\.chatinbit\.com| + peertube\.chaunchy\.com| + peertube\.chir\.rs| + peertube\.christianpacaud\.com| + peertube\.chtisurel\.net| + peertube\.chuggybumba\.com| + peertube\.cipherbliss\.com| + peertube\.cirkau\.art| + peertube\.cloud\.nerdraum\.de| + peertube\.cloud\.sans\.pub| + peertube\.coko\.foundation| + peertube\.communecter\.org| + peertube\.concordia\.social| + peertube\.corrigan\.xyz| + peertube\.cpge-brizeux\.fr| + peertube\.ctseuro\.com| + peertube\.cuatrolibertades\.org| + peertube\.cube4fun\.net| + peertube\.dair-institute\.org| + peertube\.davigge\.com| + peertube\.dc\.pini\.fr| + peertube\.deadtom\.me| + peertube\.debian\.social| + peertube\.delta0189\.xyz| + peertube\.demonix\.fr| + peertube\.designersethiques\.org| + peertube\.desmu\.fr| + peertube\.devol\.it| + peertube\.dk| + peertube\.doesstuff\.social| + peertube\.eb8\.org| + peertube\.education-forum\.com| + peertube\.elforcer\.ru| + peertube\.em\.id\.lv| + peertube\.ethibox\.fr| + peertube\.eu\.org| + peertube\.european-pirates\.eu| + peertube\.eus| + peertube\.euskarabildua\.eus| + peertube\.expi\.studio| + peertube\.familie-berner\.de| + peertube\.familleboisteau\.fr| + peertube\.fedihost\.website| + peertube\.fenarinarsa\.com| + peertube\.festnoz\.de| + peertube\.forteza\.fr| + peertube\.freestorm\.online| + peertube\.functional\.cafe| + peertube\.gaminglinux\.fr| + peertube\.gargantia\.fr| + peertube\.geekgalaxy\.fr| + peertube\.gemlog\.ca| + peertube\.genma\.fr| + peertube\.get-racing\.de| + peertube\.ghis94\.ovh| + peertube\.gidikroon\.eu| + peertube\.giftedmc\.com| + peertube\.grosist\.fr| + peertube\.gruntwerk\.org| + peertube\.gsugambit\.com| + peertube\.hackerfoo\.com| + peertube\.hellsite\.net| + peertube\.helvetet\.eu| + peertube\.histoirescrepues\.fr| + peertube\.home\.x0r\.fr| + peertube\.hyperfreedom\.org| + peertube\.ichigo\.everydayimshuflin\.com| + peertube\.ifwo\.eu| + peertube\.in\.ua| + peertube\.inapurna\.org| + peertube\.informaction\.info| + peertube\.interhop\.org| + peertube\.it| + peertube\.it-arts\.net| + peertube\.jensdiemer\.de| + peertube\.johntheserg\.al| + peertube\.kaleidos\.net| + peertube\.kalua\.im| + peertube\.kcore\.org| + peertube\.keazilla\.net| + peertube\.klaewyss\.fr| + peertube\.kleph\.eu| + peertube\.kodein\.be| + peertube\.kooperatywa\.tech| + peertube\.kriom\.net| + peertube\.kx\.studio| + peertube\.kyriog\.eu| + peertube\.la-famille-muller\.fr| + peertube\.labeuropereunion\.eu| + peertube\.lagvoid\.com| + peertube\.lhc\.net\.br| + peertube\.libresolutions\.network| + peertube\.libretic\.fr| + peertube\.librosphere\.fr| + peertube\.logilab\.fr| + peertube\.lon\.tv| + peertube\.louisematic\.site| + peertube\.luckow\.org| + peertube\.luga\.at| + peertube\.lyceeconnecte\.fr| + peertube\.madixam\.xyz| + peertube\.magicstone\.dev| + peertube\.marienschule\.de| + peertube\.marud\.fr| + peertube\.maxweiss\.io| + peertube\.miguelcr\.me| + peertube\.mikemestnik\.net| + peertube\.mobilsicher\.de| + peertube\.monlycee\.net| + 
peertube\.mxinfo\.fr| + peertube\.naln1\.ca| + peertube\.netzbegruenung\.de| + peertube\.nicolastissot\.fr| + peertube\.nogafam\.fr| + peertube\.normalgamingcommunity\.cz| + peertube\.nz| + peertube\.offerman\.com| + peertube\.ohioskates\.com| + peertube\.onionstorm\.net| + peertube\.opencloud\.lu| + peertube\.otakufarms\.com| + peertube\.paladyn\.org| + peertube\.pix-n-chill\.fr| + peertube\.r2\.enst\.fr| + peertube\.r5c3\.fr| + peertube\.redpill-insight\.com| + peertube\.researchinstitute\.at| + peertube\.revelin\.fr| + peertube\.rlp\.schule| + peertube\.rokugan\.fr| + peertube\.rougevertbleu\.tv| + peertube\.roundpond\.net| + peertube\.rural-it\.org| + peertube\.satoshishop\.de| + peertube\.scyldings\.com| + peertube\.securitymadein\.lu| + peertube\.semperpax\.com| + peertube\.semweb\.pro| + peertube\.sensin\.eu| + peertube\.sidh\.bzh| + peertube\.skorpil\.cz| + peertube\.smertrios\.com| + peertube\.sqweeb\.net| + peertube\.stattzeitung\.org| + peertube\.stream| + peertube\.su| + peertube\.swrs\.net| + peertube\.takeko\.cyou| + peertube\.taxinachtegel\.de| + peertube\.teftera\.com| + peertube\.teutronic-services\.de| + peertube\.ti-fr\.com| + peertube\.tiennot\.net| + peertube\.tmp\.rcp\.tf| + peertube\.tspu\.edu\.ru| + peertube\.tv| + peertube\.tweb\.tv| + peertube\.underworld\.fr| + peertube\.vapronva\.pw| + peertube\.veen\.world| + peertube\.vesdia\.eu| + peertube\.virtual-assembly\.org| + peertube\.viviers-fibre\.net| + peertube\.vlaki\.cz| + peertube\.wiesbaden\.social| + peertube\.wivodaim\.net| + peertube\.wtf| + peertube\.wtfayla\.net| + peertube\.xrcb\.cat| + peertube\.xwiki\.com| + peertube\.zd\.do| + peertube\.zetamc\.net| + peertube\.zmuuf\.org| + peertube\.zoz-serv\.org| + peertube\.zwindler\.fr| + peervideo\.ru| + periscope\.numenaute\.org| + pete\.warpnine\.de| + petitlutinartube\.fr| + phijkchu\.com| + phoenixproject\.group| + piraten\.space| + pirtube\.calut\.fr| + pityu\.flaki\.hu| + play\.mittdata\.se| + player\.ojamajo\.moe| + podlibre\.video| + portal\.digilab\.nfa\.cz| + private\.fedimovie\.com| + pt01\.lehrerfortbildung-bw\.de| + pt\.diaspodon\.fr| + pt\.freedomwolf\.cc| + pt\.gordons\.gen\.nz| + pt\.ilyamikcoder\.com| + pt\.irnok\.net| + pt\.mezzo\.moe| + pt\.na4\.eu| + pt\.netcraft\.ch| + pt\.rwx\.ch| + pt\.sfunk1x\.com| + pt\.thishorsie\.rocks| + pt\.vern\.cc| + ptb\.lunarviews\.net| + ptube\.de| + ptube\.ranranhome\.info| + puffy\.tube| + puppet\.zone| + qtube\.qlyoung\.net| + quantube\.win| + rankett\.net| + replay\.jres\.org| + review\.peertube\.biz| + sdmtube\.fr| + secure\.direct-live\.net| + secure\.scanovid\.com| + seka\.pona\.la| + serv3\.wiki-tube\.de| + skeptube\.fr| + social\.fedimovie\.com| + socpeertube\.ru| + sovran\.video| + special\.videovortex\.tv| + spectra\.video| + stl1988\.peertube-host\.de| + stream\.biovisata\.lt| + stream\.conesphere\.cloud| + stream\.elven\.pw| + stream\.jurnalfm\.md| + stream\.k-prod\.fr| + stream\.litera\.tools| + stream\.nuemedia\.se| + stream\.rlp-media\.de| + stream\.vrse\.be| + studios\.racer159\.com| + styxhexenhammer666\.com| + syrteplay\.obspm\.fr| + t\.0x0\.st| + tbh\.co-shaoghal\.net| + test-fab\.ynh\.fr| + testube\.distrilab\.fr| + tgi\.hosted\.spacebear\.ee| + theater\.ethernia\.net| + thecool\.tube| + thevideoverse\.com| + tilvids\.com| + tinkerbetter\.tube| + tinsley\.video| + trailers\.ddigest\.com| + tube-action-educative\.apps\.education\.fr| + tube-arts-lettres-sciences-humaines\.apps\.education\.fr| + tube-cycle-2\.apps\.education\.fr| + tube-cycle-3\.apps\.education\.fr| + 
tube-education-physique-et-sportive\.apps\.education\.fr| + tube-enseignement-professionnel\.apps\.education\.fr| + tube-institutionnel\.apps\.education\.fr| + tube-langues-vivantes\.apps\.education\.fr| + tube-maternelle\.apps\.education\.fr| + tube-numerique-educatif\.apps\.education\.fr| + tube-sciences-technologies\.apps\.education\.fr| + tube-test\.apps\.education\.fr| + tube1\.perron-service\.de| + tube\.9minuti\.it| + tube\.abolivier\.bzh| + tube\.alado\.space| + tube\.amic37\.fr| + tube\.area404\.cloud| + tube\.arthack\.nz| + tube\.asulia\.fr| + tube\.awkward\.company| + tube\.azbyka\.ru| + tube\.azkware\.net| + tube\.bartrip\.me\.uk| + tube\.belowtoxic\.media| + tube\.bingle\.plus| + tube\.bit-friends\.de| + tube\.bstly\.de| + tube\.chosto\.me| + tube\.cms\.garden| + tube\.communia\.org| + tube\.cyberia\.club| + tube\.cybershock\.life| + tube\.dembased\.xyz| + tube\.dev\.displ\.eu| + tube\.digitalesozialearbeit\.de| + tube\.distrilab\.fr| + tube\.doortofreedom\.org| + tube\.dsocialize\.net| + tube\.e-jeremy\.com| + tube\.ebin\.club| + tube\.elemac\.fr| + tube\.erzbistum-hamburg\.de| + tube\.exozy\.me| + tube\.fdn\.fr| + tube\.fedi\.quebec| + tube\.fediverse\.at| + tube\.felinn\.org| + tube\.flokinet\.is| + tube\.foad\.me\.uk| + tube\.freepeople\.fr| + tube\.friloux\.me| + tube\.froth\.zone| + tube\.fulda\.social| + tube\.futuretic\.fr| + tube\.g1zm0\.de| + tube\.g4rf\.net| + tube\.gaiac\.io| + tube\.geekyboo\.net| + tube\.genb\.de| + tube\.ghk-academy\.info| + tube\.gi-it\.de| + tube\.grap\.coop| + tube\.graz\.social| + tube\.grin\.hu| + tube\.hokai\.lol| + tube\.int5\.net| + tube\.interhacker\.space| + tube\.invisible\.ch| + tube\.io18\.top| + tube\.itsg\.host| + tube\.jeena\.net| + tube\.kh-berlin\.de| + tube\.kockatoo\.org| + tube\.kotur\.org| + tube\.koweb\.fr| + tube\.la-dina\.net| + tube\.lab\.nrw| + tube\.lacaveatonton\.ovh| + tube\.laurent-malys\.fr| + tube\.leetdreams\.ch| + tube\.linkse\.media| + tube\.lokad\.com| + tube\.lucie-philou\.com| + tube\.media-techport\.de| + tube\.morozoff\.pro| + tube\.neshweb\.net| + tube\.nestor\.coop| + tube\.network\.europa\.eu| + tube\.nicfab\.eu| + tube\.nieuwwestbrabant\.nl| + tube\.nogafa\.org| + tube\.novg\.net| + tube\.nox-rhea\.org| + tube\.nuagelibre\.fr| + tube\.numerique\.gouv\.fr| + tube\.nuxnik\.com| + tube\.nx12\.net| + tube\.octaplex\.net| + tube\.oisux\.org| + tube\.okcinfo\.news| + tube\.onlinekirche\.net| + tube\.opportunis\.me| + tube\.oraclefilms\.com| + tube\.org\.il| + tube\.pacapime\.ovh| + tube\.parinux\.org| + tube\.pastwind\.top| + tube\.picasoft\.net| + tube\.pilgerweg-21\.de| + tube\.pmj\.rocks| + tube\.pol\.social| + tube\.ponsonaille\.fr| + tube\.portes-imaginaire\.org| + tube\.public\.apolut\.net| + tube\.pustule\.org| + tube\.pyngu\.com| + tube\.querdenken-711\.de| + tube\.rebellion\.global| + tube\.reseau-canope\.fr| + tube\.rhythms-of-resistance\.org| + tube\.risedsky\.ovh| + tube\.rooty\.fr| + tube\.rsi\.cnr\.it| + tube\.ryne\.moe| + tube\.schleuss\.online| + tube\.schule\.social| + tube\.sekretaerbaer\.net| + tube\.shanti\.cafe| + tube\.shela\.nu| + tube\.skrep\.in| + tube\.sleeping\.town| + tube\.sp-codes\.de| + tube\.spdns\.org| + tube\.systerserver\.net| + tube\.systest\.eu| + tube\.tappret\.fr| + tube\.techeasy\.org| + tube\.thierrytalbert\.fr| + tube\.tinfoil-hat\.net| + tube\.toldi\.eu| + tube\.tpshd\.de| + tube\.trax\.im| + tube\.troopers\.agency| + tube\.ttk\.is| + tube\.tuxfriend\.fr| + tube\.tylerdavis\.xyz| + tube\.ullihome\.de| + tube\.ulne\.be| + tube\.undernet\.uy| + tube\.vrpnet\.org| + 
tube\.wolfe\.casa| + tube\.xd0\.de| + tube\.xn--baw-joa\.social| + tube\.xy-space\.de| + tube\.yapbreak\.fr| + tubedu\.org| + tubulus\.openlatin\.org| + turtleisland\.video| + tututu\.tube| + tv\.adast\.dk| + tv\.adn\.life| + tv\.arns\.lt| + tv\.atmx\.ca| + tv\.based\.quest| + tv\.farewellutopia\.com| + tv\.filmfreedom\.net| + tv\.gravitons\.org| + tv\.io\.seg\.br| + tv\.lumbung\.space| + tv\.pirateradio\.social| + tv\.pirati\.cz| + tv\.santic-zombie\.ru| + tv\.undersco\.re| + tv\.zonepl\.net| + tvox\.ru| + twctube\.twc-zone\.eu| + twobeek\.com| + urbanists\.video| + v\.9tail\.net| + v\.basspistol\.org| + v\.j4\.lc| + v\.kisombrella\.top| + v\.koa\.im| + v\.kyaru\.xyz| + v\.lor\.sh| + v\.mkp\.ca| + v\.posm\.gay| + v\.slaycer\.top| + veedeo\.org| + vhs\.absturztau\.be| + vid\.cthos\.dev| + vid\.kinuseka\.us| + vid\.mkp\.ca| + vid\.nocogabriel\.fr| + vid\.norbipeti\.eu| + vid\.northbound\.online| + vid\.ohboii\.de| + vid\.plantplotting\.co\.uk| + vid\.pretok\.tv| + vid\.prometheus\.systems| + vid\.soafen\.love| + vid\.twhtv\.club| + vid\.wildeboer\.net| + video-cave-v2\.de| + video-liberty\.com| + video\.076\.ne\.jp| + video\.1146\.nohost\.me| + video\.9wd\.eu| + video\.abraum\.de| + video\.ados\.accoord\.fr| + video\.amiga-ng\.org| + video\.anartist\.org| + video\.asgardius\.company| + video\.audiovisuel-participatif\.org| + video\.bards\.online| + video\.barkoczy\.social| + video\.benetou\.fr| + video\.beyondwatts\.social| + video\.bgeneric\.net| + video\.bilecik\.edu\.tr| + video\.blast-info\.fr| + video\.bmu\.cloud| + video\.catgirl\.biz| + video\.causa-arcana\.com| + video\.chasmcity\.net| + video\.chbmeyer\.de| + video\.cigliola\.com| + video\.citizen4\.eu| + video\.clumsy\.computer| + video\.cnnumerique\.fr| + video\.cnr\.it| + video\.cnt\.social| + video\.coales\.co| + video\.comune\.trento\.it| + video\.coyp\.us| + video\.csc49\.fr| + video\.davduf\.net| + video\.davejansen\.com| + video\.dlearning\.nl| + video\.dnfi\.no| + video\.dresden\.network| + video\.drgnz\.club| + video\.dudenas\.lt| + video\.eientei\.org| + video\.ellijaymakerspace\.org| + video\.emergeheart\.info| + video\.eradicatinglove\.xyz| + video\.everythingbagel\.me| + video\.extremelycorporate\.ca| + video\.fabiomanganiello\.com| + video\.fedi\.bzh| + video\.fhtagn\.org| + video\.firehawk-systems\.com| + video\.fox-romka\.ru| + video\.fuss\.bz\.it| + video\.glassbeadcollective\.org| + video\.graine-pdl\.org| + video\.gyt\.is| + video\.hainry\.fr| + video\.hardlimit\.com| + video\.hostux\.net| + video\.igem\.org| + video\.infojournal\.fr| + video\.internet-czas-dzialac\.pl| + video\.interru\.io| + video\.ipng\.ch| + video\.ironsysadmin\.com| + video\.islameye\.com| + video\.jacen\.moe| + video\.jadin\.me| + video\.jeffmcbride\.net| + video\.jigmedatse\.com| + video\.kuba-orlik\.name| + video\.lacalligramme\.fr| + video\.lanceurs-alerte\.fr| + video\.laotra\.red| + video\.lapineige\.fr| + video\.laraffinerie\.re| + video\.lavolte\.net| + video\.liberta\.vip| + video\.libreti\.net| + video\.licentia\.net| + video\.linc\.systems| + video\.linux\.it| + video\.linuxtrent\.it| + video\.liveitlive\.show| + video\.lono\.space| + video\.lrose\.de| + video\.lunago\.net| + video\.lundi\.am| + video\.lycee-experimental\.org| + video\.maechler\.cloud| + video\.marcorennmaus\.de| + video\.mass-trespass\.uk| + video\.matomocamp\.org| + video\.medienzentrum-harburg\.de| + video\.mentality\.rip| + video\.metaversum\.wtf| + video\.midreality\.com| + video\.mttv\.it| + video\.mugoreve\.fr| + video\.mxtthxw\.art| + video\.mycrowd\.ca| 
+ video\.niboe\.info| + video\.nogafam\.es| + video\.nstr\.no| + video\.occm\.cc| + video\.off-investigation\.fr| + video\.olos311\.org| + video\.ordinobsolete\.fr| + video\.osvoj\.ru| + video\.ourcommon\.cloud| + video\.ozgurkon\.org| + video\.pcf\.fr| + video\.pcgaldo\.com| + video\.phyrone\.de| + video\.poul\.org| + video\.publicspaces\.net| + video\.pullopen\.xyz| + video\.r3s\.nrw| + video\.rainevixen\.com| + video\.resolutions\.it| + video\.retroedge\.tech| + video\.rhizome\.org| + video\.rlp-media\.de| + video\.rs-einrich\.de| + video\.rubdos\.be| + video\.sadmin\.io| + video\.sftblw\.moe| + video\.shitposter\.club| + video\.simplex-software\.ru| + video\.slipfox\.xyz| + video\.snug\.moe| + video\.software-fuer-engagierte\.de| + video\.soi\.ch| + video\.sonet\.ws| + video\.surazal\.net| + video\.taskcards\.eu| + video\.team-lcbs\.eu| + video\.techforgood\.social| + video\.telemillevaches\.net| + video\.thepolarbear\.co\.uk| + video\.thinkof\.name| + video\.tii\.space| + video\.tkz\.es| + video\.trankil\.info| + video\.triplea\.fr| + video\.tum\.social| + video\.turbo\.chat| + video\.uriopss-pdl\.fr| + video\.ustim\.ru| + video\.ut0pia\.org| + video\.vaku\.org\.ua| + video\.vegafjord\.me| + video\.veloma\.org| + video\.violoncello\.ch| + video\.voidconspiracy\.band| + video\.wakkeren\.nl| + video\.windfluechter\.org| + video\.ziez\.eu| + videos-passages\.huma-num\.fr| + videos\.aadtp\.be| + videos\.ahp-numerique\.fr| + videos\.alamaisondulibre\.org| + videos\.archigny\.net| + videos\.aroaduntraveled\.com| + videos\.b4tech\.org| + videos\.benjaminbrady\.ie| + videos\.bik\.opencloud\.lu| + videos\.cloudron\.io| + videos\.codingotaku\.com| + videos\.coletivos\.org| + videos\.collate\.social| + videos\.danksquad\.org| + videos\.digitaldragons\.eu| + videos\.dromeadhere\.fr| + videos\.explain-it\.org| + videos\.factsonthegroundshow\.com| + videos\.foilen\.com| + videos\.fsci\.in| + videos\.gamercast\.net| + videos\.gianmarco\.gg| + videos\.globenet\.org| + videos\.grafo\.zone| + videos\.hauspie\.fr| + videos\.hush\.is| + videos\.hyphalfusion\.network| + videos\.icum\.to| + videos\.im\.allmendenetz\.de| + videos\.jacksonchen666\.com| + videos\.john-livingston\.fr| + videos\.knazarov\.com| + videos\.kuoushi\.com| + videos\.laliguepaysdelaloire\.org| + videos\.lemouvementassociatif-pdl\.org| + videos\.leslionsfloorball\.fr| + videos\.librescrum\.org| + videos\.mastodont\.cat| + videos\.metus\.ca| + videos\.miolo\.org| + videos\.offroad\.town| + videos\.openmandriva\.org| + videos\.parleur\.net| + videos\.pcorp\.us| + videos\.pop\.eu\.com| + videos\.rampin\.org| + videos\.rauten\.co\.za| + videos\.ritimo\.org| + videos\.sarcasmstardust\.com| + videos\.scanlines\.xyz| + videos\.shmalls\.pw| + videos\.stadtfabrikanten\.org| + videos\.supertuxkart\.net| + videos\.testimonia\.org| + videos\.thinkerview\.com| + videos\.torrenezzi10\.xyz| + videos\.trom\.tf| + videos\.utsukta\.org| + videos\.viorsan\.com| + videos\.wherelinux\.xyz| + videos\.wikilibriste\.fr| + videos\.yesil\.club| + videos\.yeswiki\.net| + videotube\.duckdns\.org| + vids\.capypara\.de| + vids\.roshless\.me| + vids\.stary\.pc\.pl| + vids\.tekdmn\.me| + vidz\.julien\.ovh| + views\.southfox\.me| + virtual-girls-are\.definitely-for\.me| + viste\.pt| + vnchich\.com| + vnop\.org| + vod\.newellijay\.tv| + voluntarytube\.com| + vtr\.chikichiki\.tube| + vulgarisation-informatique\.fr| + watch\.easya\.solutions| + watch\.goodluckgabe\.life| + watch\.ignorance\.eu| + watch\.jimmydore\.com| + watch\.libertaria\.space| + watch\.nuked\.social| 
+ watch\.ocaml\.org| + watch\.thelema\.social| + watch\.tubelab\.video| + web-fellow\.de| + webtv\.vandoeuvre\.net| + wetubevid\.online| + wikileaks\.video| + wiwi\.video| + wow\.such\.disappointment\.fail| + www\.jvideos\.net| + www\.kotikoff\.net| + www\.makertube\.net| + www\.mypeer\.tube| + www\.nadajemy\.com| + www\.neptube\.io| + www\.rocaguinarda\.tv| + www\.vnshow\.net| + xxivproduction\.video| + yt\.orokoro\.ru| + ytube\.retronerd\.at| + zumvideo\.de| + + # from youtube-dl + peertube\.rainbowswingers\.net| + tube\.stanisic\.nl| + peer\.suiri\.us| + medias\.libox\.fr| + videomensoif\.ynh\.fr| + peertube\.travelpandas\.eu| + peertube\.rachetjay\.fr| + peertube\.montecsys\.fr| + tube\.eskuero\.me| + peer\.tube| + peertube\.umeahackerspace\.se| + tube\.nx-pod\.de| + video\.monsieurbidouille\.fr| + tube\.openalgeria\.org| + vid\.lelux\.fi| + video\.anormallostpod\.ovh| + tube\.crapaud-fou\.org| + peertube\.stemy\.me| + lostpod\.space| + exode\.me| + peertube\.snargol\.com| + vis\.ion\.ovh| + videosdulib\.re| + v\.mbius\.io| + videos\.judrey\.eu| + peertube\.osureplayviewer\.xyz| + peertube\.mathieufamily\.ovh| + www\.videos-libr\.es| + fightforinfo\.com| + peertube\.fediverse\.ru| + peertube\.oiseauroch\.fr| + video\.nesven\.eu| + v\.bearvideo\.win| + video\.qoto\.org| + justporn\.cc| + video\.vny\.fr| + peervideo\.club| + tube\.taker\.fr| + peertube\.chantierlibre\.org| + tube\.ipfixe\.info| + tube\.kicou\.info| + tube\.dodsorf\.as| + videobit\.cc| + video\.yukari\.moe| + videos\.elbinario\.net| + hkvideo\.live| + pt\.tux\.tf| + www\.hkvideo\.live| + FIGHTFORINFO\.com| + pt\.765racing\.com| + peertube\.gnumeria\.eu\.org| + nordenmedia\.com| + peertube\.co\.uk| + tube\.darfweb\.eu| + tube\.kalah-france\.org| + 0ch\.in| + vod\.mochi\.academy| + film\.node9\.org| + peertube\.hatthieves\.es| + video\.fitchfamily\.org| + peertube\.ddns\.net| + video\.ifuncle\.kr| + video\.fdlibre\.eu| + tube\.22decembre\.eu| + peertube\.harmoniescreatives\.com| + tube\.fabrigli\.fr| + video\.thedwyers\.co| + video\.bruitbruit\.com| + peertube\.foxfam\.club| + peer\.philoxweb\.be| + videos\.bugs\.social| + peertube\.malbert\.xyz| + peertube\.bilange\.ca| + libretube\.net| + diytelevision\.com| + peertube\.fedilab\.app| + libre\.video| + video\.mstddntfdn\.online| + us\.tv| + peertube\.sl-network\.fr| + peertube\.dynlinux\.io| + peertube\.david\.durieux\.family| + peertube\.linuxrocks\.online| + peerwatch\.xyz| + v\.kretschmann\.social| + tube\.otter\.sh| + yt\.is\.nota\.live| + tube\.dragonpsi\.xyz| + peertube\.boneheadmedia\.com| + videos\.funkwhale\.audio| + watch\.44con\.com| + peertube\.gcaillaut\.fr| + peertube\.icu| + pony\.tube| + spacepub\.space| + tube\.stbr\.io| + v\.mom-gay\.faith| + tube\.port0\.xyz| + peertube\.simounet\.net| + play\.jergefelt\.se| + peertube\.zeteo\.me| + tube\.danq\.me| + peertube\.kerenon\.com| + tube\.fab-l3\.org| + tube\.calculate\.social| + peertube\.mckillop\.org| + tube\.netzspielplatz\.de| + vod\.ksite\.de| + peertube\.laas\.fr| + tube\.govital\.net| + peertube\.stephenson\.cc| + bistule\.nohost\.me| + peertube\.kajalinifi\.de| + video\.ploud\.jp| + video\.omniatv\.com| + peertube\.ffs2play\.fr| + peertube\.leboulaire\.ovh| + peertube\.tronic-studio\.com| + peertube\.public\.cat| + peertube\.metalbanana\.net| + video\.1000i100\.fr| + peertube\.alter-nativ-voll\.de| + tube\.pasa\.tf| + tube\.worldofhauru\.xyz| + pt\.kamp\.site| + peertube\.teleassist\.fr| + videos\.mleduc\.xyz| + conf\.tube| + media\.privacyinternational\.org| + pt\.forty-two\.nl| + 
video\.halle-leaks\.de| + video\.grosskopfgames\.de| + peertube\.schaeferit\.de| + peertube\.jackbot\.fr| + tube\.extinctionrebellion\.fr| + peertube\.f-si\.org| + video\.subak\.ovh| + videos\.koweb\.fr| + peertube\.zergy\.net| + peertube\.roflcopter\.fr| + peertube\.floss-marketing-school\.com| + vloggers\.social| + peertube\.iriseden\.eu| + videos\.ubuntu-paris\.org| + peertube\.mastodon\.host| + armstube\.com| + peertube\.s2s\.video| + peertube\.lol| + tube\.open-plug\.eu| + open\.tube| + peertube\.ch| + peertube\.normandie-libre\.fr| + peertube\.slat\.org| + video\.lacaveatonton\.ovh| + peertube\.uno| + peertube\.servebeer\.com| + peertube\.fedi\.quebec| + tube\.h3z\.jp| + tube\.plus200\.com| + peertube\.eric\.ovh| + tube\.metadocs\.cc| + tube\.unmondemeilleur\.eu| + gouttedeau\.space| + video\.antirep\.net| + nrop\.cant\.at| + tube\.ksl-bmx\.de| + tube\.plaf\.fr| + tube\.tchncs\.de| + video\.devinberg\.com| + hitchtube\.fr| + peertube\.kosebamse\.com| + yunopeertube\.myddns\.me| + peertube\.varney\.fr| + peertube\.anon-kenkai\.com| + tube\.maiti\.info| + tubee\.fr| + videos\.dinofly\.com| + toobnix\.org| + videotape\.me| + voca\.tube| + video\.heromuster\.com| + video\.lemediatv\.fr| + video\.up\.edu\.ph| + balafon\.video| + video\.ivel\.fr| + thickrips\.cloud| + pt\.laurentkruger\.fr| + video\.monarch-pass\.net| + peertube\.artica\.center| + video\.alternanet\.fr| + indymotion\.fr| + fanvid\.stopthatimp\.net| + video\.farci\.org| + v\.lesterpig\.com| + video\.okaris\.de| + tube\.pawelko\.net| + peertube\.mablr\.org| + tube\.fede\.re| + pytu\.be| + evertron\.tv| + devtube\.dev-wiki\.de| + raptube\.antipub\.org| + video\.selea\.se| + peertube\.mygaia\.org| + video\.oh14\.de| + peertube\.livingutopia\.org| + peertube\.the-penguin\.de| + tube\.thechangebook\.org| + tube\.anjara\.eu| + pt\.pube\.tk| + video\.samedi\.pm| + mplayer\.demouliere\.eu| + widemus\.de| + peertube\.me| + peertube\.zapashcanon\.fr| + video\.latavernedejohnjohn\.fr| + peertube\.pcservice46\.fr| + peertube\.mazzonetto\.eu| + video\.irem\.univ-paris-diderot\.fr| + video\.livecchi\.cloud| + alttube\.fr| + video\.coop\.tools| + video\.cabane-libre\.org| + peertube\.openstreetmap\.fr| + videos\.alolise\.org| + irrsinn\.video| + video\.antopie\.org| + scitech\.video| + tube2\.nemsia\.org| + video\.amic37\.fr| + peertube\.freeforge\.eu| + video\.arbitrarion\.com| + video\.datsemultimedia\.com| + stoptrackingus\.tv| + peertube\.ricostrongxxx\.com| + docker\.videos\.lecygnenoir\.info| + peertube\.togart\.de| + tube\.postblue\.info| + videos\.domainepublic\.net| + peertube\.cyber-tribal\.com| + video\.gresille\.org| + peertube\.dsmouse\.net| + cinema\.yunohost\.support| + tube\.theocevaer\.fr| + repro\.video| + tube\.4aem\.com| + quaziinc\.com| + peertube\.metawurst\.space| + videos\.wakapo\.com| + video\.ploud\.fr| + video\.freeradical\.zone| + tube\.valinor\.fr| + refuznik\.video| + pt\.kircheneuenburg\.de| + peertube\.asrun\.eu| + peertube\.lagob\.fr| + videos\.side-ways\.net| + 91video\.online| + video\.valme\.io| + video\.taboulisme\.com| + videos-libr\.es| + tv\.mooh\.fr| + nuage\.acostey\.fr| + video\.monsieur-a\.fr| + peertube\.librelois\.fr| + videos\.pair2jeux\.tube| + videos\.pueseso\.club| + peer\.mathdacloud\.ovh| + media\.assassinate-you\.net| + vidcommons\.org| + ptube\.rousset\.nom\.fr| + tube\.cyano\.at| + videos\.squat\.net| + video\.iphodase\.fr| + peertube\.makotoworkshop\.org| + peertube\.serveur\.slv-valbonne\.fr| + vault\.mle\.party| + hostyour\.tv| + videos\.hack2g2\.fr| + libre\.tube| + 
pire\.artisanlogiciel\.net| + videos\.numerique-en-commun\.fr| + video\.netsyms\.com| + video\.die-partei\.social| + video\.writeas\.org| + peertube\.swarm\.solvingmaz\.es| + tube\.pericoloso\.ovh| + watching\.cypherpunk\.observer| + videos\.adhocmusic\.com| + tube\.rfc1149\.net| + peertube\.librelabucm\.org| + videos\.numericoop\.fr| + peertube\.koehn\.com| + peertube\.anarchmusicall\.net| + tube\.kampftoast\.de| + vid\.y-y\.li| + peertube\.xtenz\.xyz| + diode\.zone| + tube\.egf\.mn| + peertube\.nomagic\.uk| + visionon\.tv| + videos\.koumoul\.com| + video\.rastapuls\.com| + video\.mantlepro\.com| + video\.deadsuperhero\.com| + peertube\.musicstudio\.pro| + peertube\.we-keys\.fr| + artitube\.artifaille\.fr| + peertube\.ethernia\.net| + tube\.midov\.pl| + peertube\.fr| + watch\.snoot\.tube| + peertube\.donnadieu\.fr| + argos\.aquilenet\.fr| + tube\.nemsia\.org| + tube\.bruniau\.net| + videos\.darckoune\.moe| + tube\.traydent\.info| + dev\.videos\.lecygnenoir\.info| + peertube\.nayya\.org| + peertube\.live| + peertube\.mofgao\.space| + video\.lequerrec\.eu| + peertube\.amicale\.net| + aperi\.tube| + tube\.ac-lyon\.fr| + video\.lw1\.at| + www\.yiny\.org| + videos\.pofilo\.fr| + tube\.lou\.lt| + choob\.h\.etbus\.ch| + tube\.hoga\.fr| + peertube\.heberge\.fr| + video\.obermui\.de| + videos\.cloudfrancois\.fr| + betamax\.video| + video\.typica\.us| + tube\.piweb\.be| + video\.blender\.org| + peertube\.cat| + tube\.kdy\.ch| + pe\.ertu\.be| + peertube\.social| + videos\.lescommuns\.org| + tv\.datamol\.org| + videonaute\.fr| + dialup\.express| + peertube\.nogafa\.org| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.bootlicker\.party| + skeptikon\.fr| + video\.blueline\.mg| + tube\.homecomputing\.fr| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + tube\.kher\.nl| + peertube\.qtg\.fr| + video\.migennes\.net| + tube\.p2p\.legal| + troll\.tv| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.cemea\.org| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + sikke\.fi| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + videos\.benpro\.fr| + peertube\.parleur\.net| + peertube\.heraut\.eu| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + tube\.mochi\.academy| + media\.zat\.im| + video\.colibris-outilslibres\.org| + tube\.svnet\.fr| + peertube\.video| + peertube2\.cpy\.re| + peertube3\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re| + canard\.tube + )''' + _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' + _VALID_URL = r'''(?x) + (?: + peertube:(?P<host>[^:]+):| + https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/ + ) + (?P<id>%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _EMBED_REGEX = [r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{cls._UUID_RE})'''] + _TESTS = [{ + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '8563064d245a4be5705bddb22bb00a28', + 'info_dict': { + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'ext': 'mp4', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1538391166, + 'upload_date': '20181001', +
'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'A propos de PeerTube', + 'channel_id': '2215', + 'channel_url': 'https://framatube.org/video-channels/joinpeertube', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], + } + }, { + 'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e', + 'info_dict': { + 'id': '122d093a-1ede-43bd-bd34-59d2931ffc5e', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + } + }, { + 'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd', + 'info_dict': { + 'id': '3fbif9S3WmtTP8gGsC5HBd', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + }, + }, { + 'url': 'https://peertube2.cpy.re/api/v1/videos/3fbif9S3WmtTP8gGsC5HBd', + 'info_dict': { + 'id': '3fbif9S3WmtTP8gGsC5HBd', + 'ext': 'mp4', + 'title': 'E2E tests', + 'uploader_id': '37855', + 'timestamp': 1589276219, + 'upload_date': '20200512', + 'uploader': 'chocobozzz', + }, + }, { + # Issue #26002 + 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', + 'info_dict': { + 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', + 'ext': 'mp4', + 'title': 'Dot matrix printer shell demo', + 'uploader_id': '3', + 'timestamp': 1587401293, + 'upload_date': '20200420', + 'uploader': 'Drew DeVault', + } + }, { + 'url': 'https://peertube.debian.social/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://vod.ksite.de/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://vod.ksite.de/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://peertube.tv/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:framatube.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|w)/(?P<id>%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'meta property="og:platform" content="PeerTube"', + '<title>PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @classmethod + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) + if embeds: + return embeds + + peertube_url = cls._extract_peertube_url(webpage, url) + if peertube_url: + return [peertube_url] + + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return + subtitles = {} + for e in data: + language_id = try_get(e,
lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._call_api( + host, video_id, '', note='Downloading video JSON') + + title = video['name'] + + formats = [] + files = video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) + formats.append(f) + + description = video.get('description') + if description and len(description) >= 250: + # description is shortened + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) + + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) or description + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + # only trust an explicit boolean from the API; otherwise leave the rating unset + if isinstance(nsfw, bool): + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + 'subtitles': subtitles, + 'webpage_url': webpage_url, + } + + +class PeerTubePlaylistIE(InfoExtractor): + IE_NAME = 'PeerTube:Playlist' + _TYPES = { + 'a': 'accounts', + 'c': 'video-channels', + 'w/p': 'video-playlists', + } + _VALID_URL = r'''(?x) + https?://(?P<host>%s)/(?P<type>(?:%s))/ + (?P<id>[^/]+) +
''' % (PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys())) + _TESTS = [{ + 'url': 'https://peertube.debian.social/w/p/hFdJoTuyhNJVa1cDWd1d12', + 'info_dict': { + 'id': 'hFdJoTuyhNJVa1cDWd1d12', + 'description': 'Diversas palestras do Richard Stallman no Brasil.', + 'title': 'Richard Stallman no Brasil', + 'timestamp': 1599676222, + }, + 'playlist_mincount': 9, + }, { + 'url': 'https://peertube2.cpy.re/a/chocobozzz/videos', + 'info_dict': { + 'id': 'chocobozzz', + 'timestamp': 1553874564, + 'title': 'chocobozzz', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://framatube.org/c/bf54d359-cfad-4935-9d45-9d6be93f63e8/videos', + 'info_dict': { + 'id': 'bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'timestamp': 1519917377, + 'title': 'Les vidéos de Framasoft', + }, + 'playlist_mincount': 345, + }, { + 'url': 'https://peertube2.cpy.re/c/blender_open_movies@video.blender.org/videos', + 'info_dict': { + 'id': 'blender_open_movies@video.blender.org', + 'timestamp': 1542287810, + 'title': 'Official Blender Open Movies', + }, + 'playlist_mincount': 11, + }] + _API_BASE = 'https://%s/api/v1/%s/%s%s' + _PAGE_SIZE = 30 + + def call_api(self, host, name, path, base, **kwargs): + return self._download_json( + self._API_BASE % (host, base, name, path), name, **kwargs) + + def fetch_page(self, host, id, type, page): + page += 1 + video_data = self.call_api( + host, id, + f'/videos?sort=-createdAt&start={self._PAGE_SIZE * (page - 1)}&count={self._PAGE_SIZE}&nsfw=both', + type, note=f'Downloading page {page}').get('data', []) + for video in video_data: + shortUUID = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID']) + video_title = video.get('name') or try_get(video, lambda x: x['video']['name']) + yield self.url_result( + f'https://{host}/w/{shortUUID}', PeerTubeIE.ie_key(), + video_id=shortUUID, video_title=video_title) + + def _extract_playlist(self, host, type, id): + info = self.call_api(host, id, '', type, note='Downloading playlist information', fatal=False) + + playlist_title = info.get('displayName') + playlist_description = info.get('description') + playlist_timestamp = unified_timestamp(info.get('createdAt')) + channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName') + channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id') + thumbnail = format_field(info, 'thumbnailPath', f'https://{host}%s') + + entries = OnDemandPagedList(functools.partial( + self.fetch_page, host, id, type), self._PAGE_SIZE) + + return self.playlist_result( + entries, id, playlist_title, playlist_description, + timestamp=playlist_timestamp, channel=channel, channel_id=channel_id, thumbnail=thumbnail) + + def _real_extract(self, url): + type, host, id = self._match_valid_url(url).group('type', 'host', 'id') + type = self._TYPES[type] + return self._extract_playlist(host, type, id) diff --git a/yt_dlp/extractor/peertv.py b/yt_dlp/extractor/peertv.py new file mode 100644 index 0000000..a709e21 --- /dev/null +++ b/yt_dlp/extractor/peertv.py @@ -0,0 +1,52 @@ +from .common import InfoExtractor +from ..utils import js_to_json + + +class PeerTVIE(InfoExtractor): + IE_NAME = 'peer.tv' + _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.peer.tv/de/841', + 'info_dict': { + 'id': '841', + 'ext': 'mp4', + 'title': 'Die Brunnenburg', + 'description': 'md5:4395f6142b090338340ab88a3aae24ed', + }, + }, { + 'url': 'https://www.peer.tv/it/404', + 'info_dict': { + 'id': '404', + 'ext': 'mp4', + 'title': 'Cascate di 
ghiaccio in Val Gardena', + 'description': 'md5:e8e5907f236171842674e8090e3577b8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key') + + js = self._download_webpage(f'https://player.peer.tv/js/{video_key}/', video_id, + headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id') + + session_id = self._search_regex(r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', js, 'session id') + + player_webpage = self._download_webpage( + f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1', + video_id, note='Downloading player webpage') + + m3u8_url = self._search_regex(r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url') + m3u8_url = self._parse_json(m3u8_url, video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '), + 'formats': formats, + 'description': self._html_search_meta(('og:description', 'description'), webpage), + 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage) + } diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py new file mode 100644 index 0000000..7864299 --- /dev/null +++ b/yt_dlp/extractor/peloton.py @@ -0,0 +1,215 @@ +import json +import re +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + float_or_none, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PelotonIE(InfoExtractor): + IE_NAME = 'peloton' + _NETRC_MACHINE = 'peloton' + _VALID_URL = r'https?://members\.onepeloton\.com/classes/player/(?P<id>[a-f0-9]+)' + _TESTS = [{ + 'url': 'https://members.onepeloton.com/classes/player/0e9653eb53544eeb881298c8d7a87b86', + 'info_dict': { + 'id': '0e9653eb53544eeb881298c8d7a87b86', + 'title': '20 min Chest & Back Strength', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'description': 'md5:fcd5be9b9eda0194b470e13219050a66', + 'creator': 'Chase Tucker', + 'release_timestamp': 1556141400, + 'timestamp': 1556141400, + 'upload_date': '20190424', + 'duration': 1389, + 'categories': ['Strength'], + 'tags': ['Workout Mat', 'Light Weights', 'Medium Weights'], + 'is_live': False, + 'chapters': 'count:1', + 'subtitles': {'en': [{ + 'url': r're:^https?://.+', + 'ext': 'vtt' + }]}, + }, 'params': { + 'skip_download': 'm3u8', + }, + '_skip': 'Account needed' + }, { + 'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8', + 'info_dict': { + 'id': '26603d53d6bb4de1b340514864a6a6a8', + 'title': '30 min Earth Day Run', + 'ext': 'm4a', + 'thumbnail': r're:https://.+\.jpg', + 'description': 'md5:adc065a073934d7ee0475d217afe0c3d', + 'creator': 'Selena Samuela', + 'release_timestamp': 1587567600, + 'timestamp': 1587567600, + 'upload_date': '20200422', + 'duration': 1802, + 'categories': ['Running'], + 'is_live': False, + 'chapters': 'count:3' + }, 'params': { + 'skip_download': 'm3u8', + }, + '_skip': 'Account needed' + }] + + _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s' + + def _start_session(self, video_id): + self._download_webpage('https://api.onepeloton.com/api/started_client_session', video_id, note='Starting session') + + def _login(self, 
video_id): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + try: + self._download_json( + 'https://api.onepeloton.com/auth/login', video_id, note='Logging in', + data=json.dumps({ + 'username_or_email': username, + 'password': password, + 'with_pubsub': False + }).encode(), + headers={'Content-Type': 'application/json', 'User-Agent': 'web'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + json_string = self._webpage_read_content(e.cause.response, None, video_id) + res = self._parse_json(json_string, video_id) + raise ExtractorError(res['message'], expected=res['message'] == 'Login failed') + else: + raise + + def _get_token(self, video_id): + try: + subscription = self._download_json( + 'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token', + data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + json_string = self._webpage_read_content(e.cause.response, None, video_id) + res = self._parse_json(json_string, video_id) + raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached') + else: + raise + return subscription['token'] + + def _real_extract(self, url): + video_id = self._match_id(url) + try: + self._start_session(video_id) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self._login(video_id) + self._start_session(video_id) + else: + raise + + metadata = self._download_json('https://api.onepeloton.com/api/ride/%s/details?stream_source=multichannel' % video_id, video_id) + ride_data = metadata.get('ride') + if not ride_data: + raise ExtractorError('Missing stream metadata') + token = self._get_token(video_id) + + is_live = False + if ride_data.get('content_format') == 'audio': + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), urllib.parse.quote(token)) + formats = [{ + 'url': url, + 'ext': 'm4a', + 'format_id': 'audio', + 'vcodec': 'none', + }] + subtitles = {} + else: + if ride_data.get('vod_stream_url'): + url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % ( + ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]), + ride_data['vod_stream_url'], + urllib.parse.quote(urllib.parse.quote(token))) + elif ride_data.get('live_stream_url'): + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), urllib.parse.quote(token)) + is_live = True + else: + raise ExtractorError('Missing video URL') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + + if metadata.get('instructor_cues'): + subtitles['cues'] = [{ + 'data': json.dumps(metadata.get('instructor_cues')), + 'ext': 'json' + }] + + category = ride_data.get('fitness_discipline_display_name') + chapters = [{ + 'start_time': segment.get('start_time_offset'), + 'end_time': segment.get('start_time_offset') + segment.get('length'), + 'title': segment.get('name') + } for segment in traverse_obj(metadata, ('segments', 'segment_list'))] + + return { + 'id': video_id, + 'title': ride_data.get('title'), + 'formats': formats, + 'thumbnail': url_or_none(ride_data.get('image_url')), + 'description': str_or_none(ride_data.get('description')), + 'creator': traverse_obj(ride_data, ('instructor', 'name')), + 'release_timestamp': 
ride_data.get('original_air_time'), + 'timestamp': ride_data.get('original_air_time'), + 'subtitles': subtitles, + 'duration': float_or_none(ride_data.get('length')), + 'categories': [category] if category else None, + 'tags': traverse_obj(ride_data, ('equipment_tags', ..., 'name')), + 'is_live': is_live, + 'chapters': chapters + } + + +class PelotonLiveIE(InfoExtractor): + IE_NAME = 'peloton:live' + IE_DESC = 'Peloton Live' + _VALID_URL = r'https?://members\.onepeloton\.com/player/live/(?P<id>[a-f0-9]+)' + _TEST = { + 'url': 'https://members.onepeloton.com/player/live/eedee2d19f804a9788f53aa8bd38eb1b', + 'info_dict': { + 'id': '32edc92d28044be5bf6c7b6f1f8d1cbc', + 'title': '30 min HIIT Ride: Live from Home', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.png', + 'description': 'md5:f0d7d8ed3f901b7ee3f62c1671c15817', + 'creator': 'Alex Toussaint', + 'release_timestamp': 1587736620, + 'timestamp': 1587736620, + 'upload_date': '20200424', + 'duration': 2014, + 'categories': ['Cycling'], + 'is_live': False, + 'chapters': 'count:3' + }, + 'params': { + 'skip_download': 'm3u8', + }, + '_skip': 'Account needed' + } + + def _real_extract(self, url): + workout_id = self._match_id(url) + peloton = self._download_json(f'https://api.onepeloton.com/api/peloton/{workout_id}', workout_id) + + if peloton.get('ride_id'): + if not peloton.get('is_live') or peloton.get('is_encore') or peloton.get('status') != 'PRE_START': + return self.url_result('https://members.onepeloton.com/classes/player/%s' % peloton['ride_id']) + else: + raise ExtractorError('Ride has not started', expected=True) + else: + raise ExtractorError('Missing video ID') diff --git a/yt_dlp/extractor/performgroup.py b/yt_dlp/extractor/performgroup.py new file mode 100644 index 0000000..f4d7f22 --- /dev/null +++ b/yt_dlp/extractor/performgroup.py @@ -0,0 +1,77 @@ +from .common import InfoExtractor +from ..utils import int_or_none + + +class PerformGroupIE(InfoExtractor): + _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})' + _TESTS = [{ + # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html + 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', + 'md5': '259cb03d142e2e52471e8837ecacb29f', + 'info_dict': { + 'id': 'xgrwobuzumes1lwjxtcdpwgxd', + 'ext': 'mp4', + 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', + 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', + 'timestamp': 1511533477, + 'upload_date': '20171124', + } + }] + + def _call_api(self, service, auth_token, content_id, referer_url): + return self._download_json( + 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + content_id, headers={ + 'Referer': referer_url, + 'Origin': 'http://player.performgroup.com', + }, query={ + '_fmt': 'json', + }) + + def _real_extract(self, url): + player_id, auth_token = self._match_valid_url(url).groups() + bootstrap = self._call_api('bootstrap', auth_token, player_id, url) + video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] + video_id = video['uuid'] + vod = self._call_api('vod', auth_token, video_id, url) + media = vod['videos']['video'][0]['media'] + + formats = [] + hls_url = media.get('hls', {}).get('url') + if hls_url: + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + hds_url = media.get('hds', {}).get('url') + if 
hds_url: + formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) + + for c in media.get('content', []): + c_url = c.get('url') + if not c_url: + continue + tbr = int_or_none(c.get('bitrate'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': c_url, + 'tbr': tbr, + 'width': int_or_none(c.get('width')), + 'height': int_or_none(c.get('height')), + 'filesize': int_or_none(c.get('fileSize')), + 'vcodec': c.get('type'), + 'fps': int_or_none(c.get('videoFrameRate')), + 'vbr': int_or_none(c.get('videoRate'), 1000), + 'abr': int_or_none(c.get('audioRate'), 1000), + }) + + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': int_or_none(video.get('publishedTime'), 1000), + 'formats': formats, + } diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py new file mode 100644 index 0000000..d2351df --- /dev/null +++ b/yt_dlp/extractor/periscope.py @@ -0,0 +1,188 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + unescapeHTML, +) +from ..utils.traversal import traverse_obj + + +class PeriscopeBaseIE(InfoExtractor): + _M3U8_HEADERS = { + 'Referer': 'https://www.periscope.tv/' + } + + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) + + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast.get('status') or 'Periscope Broadcast' + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_medium', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': title, + 'timestamp': parse_iso8601(broadcast.get('created_at')) or int_or_none( + broadcast.get('created_at_ms'), scale=1000), + 'release_timestamp': int_or_none(broadcast.get('scheduled_start_ms'), scale=1000), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'concurrent_view_count': int_or_none(broadcast.get('total_watching')), + 'tags': broadcast.get('tags'), + 'live_status': { + 'running': 'is_live', + 'not_started': 'is_upcoming', + }.get(traverse_obj(broadcast, ('state', {str.lower}))) or 'was_live' + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + for f in m3u8_formats: + f.setdefault('http_headers', {}).update(self._M3U8_HEADERS) + return m3u8_formats + + +class 
PeriscopeIE(PeriscopeBaseIE):
+    IE_DESC = 'Periscope'
+    IE_NAME = 'periscope'
+    _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1']
+    # Alive example URLs can be found here https://www.periscope.tv/
+    _TESTS = [{
+        'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+        'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+        'info_dict': {
+            'id': '56102209',
+            'ext': 'mp4',
+            'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+            'timestamp': 1438978559,
+            'upload_date': '20150807',
+            'uploader': 'Bec Boop',
+            'uploader_id': '1465763',
+        },
+        'skip': 'Expires in 24 hours',
+    }, {
+        'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pscp.tv/w/1ZkKzPbMVggJv',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        token = self._match_id(url)
+
+        stream = self._call_api(
+            'accessVideoPublic', {'broadcast_id': token}, token)
+
+        broadcast = stream['broadcast']
+        info = self._parse_broadcast_data(broadcast, token)
+
+        state = broadcast.get('state').lower()
+        width = int_or_none(broadcast.get('width'))
+        height = int_or_none(broadcast.get('height'))
+
+        def add_width_and_height(f):
+            for key, val in (('width', width), ('height', height)):
+                if not f.get(key):
+                    f[key] = val
+
+        video_urls = set()
+        formats = []
+        for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
+            video_url = stream.get(format_id + '_url')
+            if not video_url or video_url in video_urls:
+                continue
+            video_urls.add(video_url)
+            if format_id != 'rtmp':
+                m3u8_formats = self._extract_pscp_m3u8_formats(
+                    video_url, token, format_id, state, width, height, False)
+                formats.extend(m3u8_formats)
+                continue
+            rtmp_format = {
+                'url': video_url,
+                'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+            }
+            # the local helper already closes over width/height; the static
+            # _add_width_and_height would require them as explicit arguments
+            add_width_and_height(rtmp_format)
+            formats.append(rtmp_format)
+
+        info['formats'] = formats
+        return info
+
+
+class PeriscopeUserIE(PeriscopeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$'
+    IE_DESC = 'Periscope user videos'
+    IE_NAME = 'periscope:user'
+
+    _TEST = {
+        'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+        'info_dict': {
+            'id': 'LularoeHusbandMike',
+            'title': 'LULAROE HUSBAND MIKE',
+            'description': 'md5:6cf4ec8047768098da58e446e82c82f0',
+        },
+        # Periscope only shows videos in the last 24 hours, so it's possible to
+        # get 0 videos
+        'playlist_mincount': 0,
+    }
+
+    def _real_extract(self, url):
+        user_name = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_name)
+
+        data_store = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-store=(["\'])(?P<data>.+?)\1',
+                webpage, 'data store', default='{}', group='data')),
+            user_name)
+
+        user = list(data_store['UserCache']['users'].values())[0]['user']
+        user_id = user['id']
+        session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id']
+
+        broadcasts = self._call_api(
+            'getUserBroadcastsPublic',
+            {'user_id': user_id, 'session_id': session_id},
+            user_name)['broadcasts']
+
+        broadcast_ids = [
+            broadcast['id'] for broadcast in broadcasts if broadcast.get('id')]
+
+        title = user.get('display_name') or user.get('username') or user_name
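+        # Illustrative note: the data-store parsed above is the web app's
+        # bootstrap JSON, so user_id and session_id here are plain scraped
+        # values, not authenticated credentials. A hypothetical payload has
+        # roughly this shape:
+        #   {'UserCache': {'users': {'42': {'user': {'id': '42',
+        #        'display_name': 'Some Broadcaster'}}}},
+        #    'SessionToken': {'public': {'broadcastHistory': {
+        #        'token': {'session_id': 'opaque-session-token'}}}}}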
description = user.get('description') + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) + for broadcast_id in broadcast_ids] + + return self.playlist_result(entries, user_id, title, description) diff --git a/yt_dlp/extractor/pgatour.py b/yt_dlp/extractor/pgatour.py new file mode 100644 index 0000000..36c2c62 --- /dev/null +++ b/yt_dlp/extractor/pgatour.py @@ -0,0 +1,47 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + + +class PGATourIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1', + 'info_dict': { + 'id': '6322447785112', + 'ext': 'mp4', + 'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1', + 'uploader_id': '6116716431001', + 'upload_date': '20230312', + 'timestamp': 1678653136, + 'duration': 20.011, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:7', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday', + 'info_dict': { + 'id': '6322506425112', + 'ext': 'mp4', + 'title': 'Follow THE PLAYERS trophy on Championship Sunday', + 'description': 'md5:4d29e4bdfa03694a0ebfd08950398568', + 'uploader_id': '6082840763001', + 'upload_date': '20230313', + 'timestamp': 1678739835, + 'duration': 123.435, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:8', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, is_tourcast = self._match_valid_url(url).group('id', 'tc') + + # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js + account_id = '6116716431001' if is_tourcast else '6082840763001' + player_id = 'Vsd5Umu8r' if is_tourcast else 'FWIBYMBPj' + + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE) diff --git a/yt_dlp/extractor/philharmoniedeparis.py b/yt_dlp/extractor/philharmoniedeparis.py new file mode 100644 index 0000000..e8494a0 --- /dev/null +++ b/yt_dlp/extractor/philharmoniedeparis.py @@ -0,0 +1,97 @@ +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get + + +class PhilharmonieDeParisIE(InfoExtractor): + IE_DESC = 'Philharmonie de Paris' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/(?:doc/CIMU/|player\.aspx\?id=)| + philharmoniedeparis\.fr/fr/live/concert/| + otoplayer\.philharmoniedeparis\.fr/fr/embed/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1129666-danses-symphoniques', + 'md5': '24bdb7e86c200c107680e1f7770330ae', + 'info_dict': { + 'id': '1129666', + 'ext': 'mp4', + 'title': 'Danses symphoniques. Orchestre symphonique Divertimento - Zahia Ziouani. 
Bizet, de Falla, Stravinski, Moussorgski, Saint-Saëns', + }, + }, { + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1032066-akademie-fur-alte-musik-berlin-rias-kammerchor-rene-jacobs-passion-selon-saint-jean-de-johann', + 'info_dict': { + 'id': '1032066', + 'title': 'Akademie für alte Musik Berlin, Rias Kammerchor, René Jacobs : Passion selon saint Jean de Johann Sebastian Bach', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1030324-orchestre-philharmonique-de-radio-france-myung-whun-chung-renaud-capucon-pascal-dusapin-johannes', + 'only_matching': True, + }, { + 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', + 'only_matching': True, + }, { + 'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'only_matching': True, + }, { + 'url': 'https://otoplayer.philharmoniedeparis.fr/fr/embed/1098406?lang=fr-FR', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + 'https://otoplayer.philharmoniedeparis.fr/fr/config/%s.json' % video_id, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) + + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: + continue + format_urls.add(format_url) + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats and not self.get_param('ignore_no_formats'): + return + return { + 'title': title, + 'formats': formats, + 'thumbnail': files.get('thumbnail'), + } + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + }) + return info + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + if entry is None: + continue + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) + + return self.playlist_result(entries, video_id, config.get('title')) diff --git a/yt_dlp/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py new file mode 100644 index 0000000..5fa133a --- /dev/null +++ b/yt_dlp/extractor/phoenix.py @@ -0,0 +1,130 @@ +import re + +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + try_get, + unified_timestamp, + urljoin, +) + + +class PhoenixIE(ZDFBaseIE): + IE_NAME = 'phoenix.de' + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613902500, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'series': 'corona nachgehakt', 
+ 'episode': 'Wohin führt der Protest in der Pandemie?', + }, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article = self._download_json( + 'https://www.phoenix.de/response/id/%s' % article_id, article_id, + 'Downloading article JSON') + + video = article['absaetze'][0] + title = video.get('titel') or article.get('subtitel') + + if video.get('typ') == 'video-youtube': + video_id = video['id'] + return self.url_result( + video_id, ie=YoutubeIE.ie_key(), video_id=video_id, + video_title=title) + + video_id = compat_str(video.get('basename') or video.get('content')) + + details = self._download_json( + 'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php', + video_id, 'Downloading details JSON', query={ + 'ak': 'web', + 'ptmd': 'true', + 'id': video_id, + 'profile': 'player2', + }) + + title = title or details['title'] + content_id = details['tracking']['nielsen']['content']['assetid'] + + info = self._extract_ptmd( + 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id, + content_id, None, url) + + duration = int_or_none(try_get( + details, lambda x: x['tracking']['nielsen']['content']['length'])) + timestamp = unified_timestamp(details.get('editorialDate')) + series = try_get( + details, lambda x: x['tracking']['nielsen']['content']['program'], + compat_str) + episode = title if details.get('contentType') == 'episode' else None + + thumbnails = [] + teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} + for thumbnail_key, thumbnail_url in teaser_images.items(): + thumbnail_url = urljoin(url, thumbnail_url) + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + return merge_dicts(info, { + 'id': content_id, + 'title': title, + 'description': details.get('leadParagraph'), + 'duration': duration, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'uploader': details.get('tvService'), + 'series': series, + 'episode': episode, + }) diff --git a/yt_dlp/extractor/photobucket.py b/yt_dlp/extractor/photobucket.py new file mode 100644 index 0000000..71e9a48 --- /dev/null +++ b/yt_dlp/extractor/photobucket.py @@ -0,0 +1,43 @@ +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class PhotobucketIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _TEST = { + 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', + 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', + 'info_dict': { + 'id': 'zpsc0c3b9fa', + 'ext': 'mp4', + 'timestamp': 1367669341, + 'upload_date': '20130504', + 'uploader': 'rachaneronas', + 'title': 'Tired of Link Building? Try BacklinkMyDomain.com!', + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_extension = mobj.group('ext') + + webpage = self._download_webpage(url, video_id) + + # Extract URL, uploader, and title from webpage + self.report_extraction(video_id) + info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', + webpage, 'info json') + info = json.loads(info_json) + url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + return { + 'id': video_id, + 'url': url, + 'uploader': info['username'], + 'timestamp': info['creationDate'], + 'title': info['title'], + 'ext': video_extension, + 'thumbnail': info['thumbUrl'], + } diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py new file mode 100644 index 0000000..3ae985d --- /dev/null +++ b/yt_dlp/extractor/piapro.py @@ -0,0 +1,121 @@ +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + ExtractorError, + parse_duration, + parse_filesize, + str_to_int, + unified_timestamp, + urlencode_postdata, +) + + +class PiaproIE(InfoExtractor): + _NETRC_MACHINE = 'piapro' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>[\w-]+)/?' 
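Aside on the PhotobucketIE pattern above: the page inlines its media metadata in a `Pb.Data.Shared.put(...)` script call, so extraction reduces to a regex capture, `json.loads`, and a percent-decode of the `file=` parameter. A minimal standalone sketch of that flow; the page fragment, names and URL are invented for illustration:

```python
import json
import re
import urllib.parse

# Hypothetical page fragment in the shape PhotobucketIE scrapes
PAGE = ('<script>Pb.Data.Shared.put(Pb.Data.Shared.MEDIA, {"username": "demo_user",'
        ' "title": "Demo clip", "linkcodes":'
        ' {"html": "embed flashvars=file=http%3A%2F%2Fexample.com%2Fvideo%2Fdemo.mp4"}});</script>')

# 1. capture the raw JSON argument of the Pb.Data.Shared.put(...) call
info = json.loads(re.search(
    r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', PAGE).group(1))

# 2. the direct media URL is percent-encoded inside the embed snippet
media_url = urllib.parse.unquote(
    re.search(r'file=(.+?\.mp4)', info['linkcodes']['html']).group(1))

print(info['username'], media_url)  # demo_user http://example.com/video/demo.mp4
```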
+    _TESTS = [{
+        'url': 'https://piapro.jp/t/NXYR',
+        'md5': 'f7c0f760913fb1d44a1c45a4af793909',
+        'info_dict': {
+            'id': 'NXYR',
+            'ext': 'mp3',
+            'uploader': 'wowaka',
+            'uploader_id': 'wowaka',
+            'title': '裏表ラバーズ',
+            'description': 'http://www.nicovideo.jp/watch/sm8082467',
+            'duration': 189.0,
+            'timestamp': 1251785475,
+            'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
+            'upload_date': '20090901',
+            'view_count': int,
+        }
+    }, {
+        'note': 'There are break lines in description, mandating (?s) flag',
+        'url': 'https://piapro.jp/t/9cSd',
+        'md5': '952bb6d1e8de95050206408a87790676',
+        'info_dict': {
+            'id': '9cSd',
+            'ext': 'mp3',
+            'title': '青に溶けた風船 / 初音ミク',
+            'description': 'md5:d395a9bd151447631a5a1460bc7f9132',
+            'uploader': 'シアン・キノ',
+            'duration': 229.0,
+            'timestamp': 1644030039,
+            'upload_date': '20220205',
+            'view_count': int,
+            'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
+            'uploader_id': 'cyankino',
+        }
+    }, {
+        'url': 'https://piapro.jp/content/hcw0z3a169wtemz6',
+        'only_matching': True
+    }, {
+        'url': 'https://piapro.jp/t/-SO-',
+        'only_matching': True
+    }]
+
+    _login_status = False
+
+    def _perform_login(self, username, password):
+        login_ok = True
+        login_form_strs = {
+            '_username': username,
+            '_password': password,
+            '_remember_me': 'on',
+            'login': 'ログイン'
+        }
+        self._request_webpage('https://piapro.jp/login/', None)
+        urlh = self._request_webpage(
+            'https://piapro.jp/login/exe', None,
+            note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata(login_form_strs))
+        if urlh is False:
+            login_ok = False
+        else:
+            parts = compat_urlparse.urlparse(urlh.url)
+            if parts.path != '/':
+                login_ok = False
+        if not login_ok:
+            self.report_warning(
+                'unable to log in: bad username or password')
+        self._login_status = login_ok
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        category_id = self._search_regex(r'categoryId=(.+)">', webpage, 'category ID')
+        if category_id not in ('1', '2', '21', '22', '23', '24', '25'):
+            raise ExtractorError('The URL does not contain audio.', expected=True)
+
+        str_duration, str_filesize = self._search_regex(
+            r'サイズ:</span>(.+?)/\(([0-9,]+?[KMG]?B)\)', webpage, 'duration and size',
+            group=(1, 2), default=(None, None))
+        str_viewcount = self._search_regex(r'閲覧数:</span>([0-9,]+)\s+', webpage, 'view count', fatal=False)
+
+        uploader_id, uploader = self._search_regex(
+            r'<a\s+class="cd_user-name"\s+href="/(.*)">([^<]+)さん<', webpage, 'uploader',
+            group=(1, 2), default=(None, None))
+        content_id = self._search_regex(r'contentId\:\'(.+)\'', webpage, 'content ID')
+        create_date = self._search_regex(r'createDate\:\'(.+)\'', webpage, 'timestamp')
+
+        player_webpage = self._download_webpage(
+            f'https://piapro.jp/html5_player_popup/?id={content_id}&cdate={create_date}',
+            video_id, note='Downloading player webpage')
+
+        return {
+            'id': video_id,
+            'title': self._html_search_regex(r'<h1\s+class="cd_works-title">(.+?)</h1>', webpage, 'title', fatal=False),
+            'description': self._html_search_regex(r'(?s)<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False),
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'timestamp': unified_timestamp(create_date, False),
+            'duration': parse_duration(str_duration),
+            'view_count': str_to_int(str_viewcount),
+            'thumbnail': self._html_search_meta('twitter:image', webpage),
+
+            'filesize_approx': parse_filesize(str_filesize.replace(',', '')),
+            'url': self._search_regex(r'mp3:\s*\'(.*?)\'\}', player_webpage,
'url'), + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/yt_dlp/extractor/piaulizaportal.py b/yt_dlp/extractor/piaulizaportal.py new file mode 100644 index 0000000..1eb6d92 --- /dev/null +++ b/yt_dlp/extractor/piaulizaportal.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + time_seconds, + traverse_obj, +) + + +class PIAULIZAPortalIE(InfoExtractor): + IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM' + _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', + 'info_dict': { + 'id': '005f18b7-e810-5618-cb82-0987c5755d44', + 'title': 'プレゼンテーションプレイヤーのサンプル', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', + 'info_dict': { + 'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', + 'title': '【確認用】視聴サンプルページ(ULIZA)', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0))) + if expires and expires <= time_seconds(): + raise ExtractorError('The link is expired.', video_id=video_id, expected=True) + + webpage = self._download_webpage(url, video_id) + + player_data = self._download_webpage( + self._search_regex( + r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"', + webpage, 'player data url'), + video_id, headers={'Referer': 'https://ulizaportal.jp/'}, + note='Fetching player data', errnote='Unable to fetch player data') + + formats = self._extract_m3u8_formats( + self._search_regex( + r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, + 'm3u8 url', default=None), + video_id, fatal=False) + m3u8_type = self._search_regex( + r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None) + + return { + 'id': video_id, + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'live_status': { + 'video': 'is_live', + 'dvr': 'was_live', # short-term archives + }.get(m3u8_type, 'not_live'), # VOD or long-term archives + } diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py new file mode 100644 index 0000000..d415ba2 --- /dev/null +++ b/yt_dlp/extractor/picarto.py @@ -0,0 +1,152 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + str_or_none, + traverse_obj, +) + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'skip': 'Stream is offline', + } + + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + 
getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] + + if metadata.get('online') == 0: + raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] + + cdn_data = self._download_json( + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') + + formats = [] + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: + continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) + + mature = metadata.get('adult') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + + return { + 'id': channel_id, + 'title': title.strip(), + 'is_live': True, + 'channel': channel_id, + 'channel_id': metadata.get('id'), + 'channel_url': 'https://picarto.tv/%s' % channel_id, + 'age_limit': age_limit, + 'formats': formats, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', + 'info_dict': { + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', + 'ext': 'mp4', + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + 'skip': 'The VOD does not exist', + }, { + 'url': 'https://picarto.tv/ArtofZod/videos/772650', + 'md5': '00067a0889f1f6869cc512e3e79c521b', + 'info_dict': { + 'id': '772650', + 'ext': 'mp4', + 'title': 'Art of Zod - Drawing and Painting', + 'thumbnail': r're:^https?://.*\.jpg', + 'channel': 'ArtofZod', + 'age_limit': 18, + } + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', video_id, query={ + 'query': f'''{{ + video(id: "{video_id}") {{ + id + title + adult + file_name + video_recording_image_url + channel {{ + name + }} + }} +}}''' + })['data']['video'] + + file_name = data['file_name'] + netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc + + formats = self._extract_m3u8_formats( + f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + **traverse_obj(data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'thumbnail': 'video_recording_image_url', + 'channel': ('channel', 'name', {str}), + 'age_limit': ('adult', {lambda x: 18 if x else 0}), + }), + 'formats': formats, + } diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py new file mode 100644 index 0000000..97a9bf5 --- /dev/null +++ b/yt_dlp/extractor/piksel.py @@ -0,0 +1,174 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + dict_get, + ExtractorError, + int_or_none, + join_nonempty, + parse_iso8601, + traverse_obj, + try_get, + unescapeHTML, + urljoin, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?: + player\. 
+ (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)'] + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/ums2867l', + 'md5': '34e34c8d89dc2559976a6079db531e85', + 'info_dict': { + 'id': 'ums2867l', + 'ext': 'mp4', + 'title': 'GX-005 with Caption', + 'timestamp': 1481335659, + 'upload_date': '20161210' + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204' + } + }, + { + # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ + 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', + 'only_matching': True, + } + ] + + def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.com', fatal=True): + url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5') + response = traverse_obj( + self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {} + failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API' + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + + def _real_extract(self, url): + ref_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, display_id) + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query, url)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] + title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + + formats = [] + + def process_asset_file(asset_file): + if not asset_file: + return + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + return + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + formats.append({ + 'format_id': join_nonempty('http', tbr), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + 
asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + transform_source=transform_source, fatal=False)) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + if caption_url: + subtitles.setdefault(caption.get('locale', 'en'), []).append({ + 'url': caption_url}) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + 'subtitles': subtitles, + '_format_sort_fields': ('tbr', ), # Incomplete resolution information + } diff --git a/yt_dlp/extractor/pinkbike.py b/yt_dlp/extractor/pinkbike.py new file mode 100644 index 0000000..e4e1caa --- /dev/null +++ b/yt_dlp/extractor/pinkbike.py @@ -0,0 +1,93 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + remove_start, + str_to_int, + unified_strdate, +) + + +class PinkbikeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.pinkbike.com/video/402811/', + 'md5': '4814b8ca7651034cd87e3361d5c2155a', + 'info_dict': { + 'id': '402811', + 'ext': 'mp4', + 'title': 'Brandon Semenuk - RAW 100', + 'description': 'Official release: www.redbull.ca/rupertwalker', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 100, + 'upload_date': '20150406', + 'uploader': 'revelco', + 'location': 'Victoria, British Columbia, Canada', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.pinkbike.com/video/%s' % video_id, video_id) + + formats = [] + for _, format_id, src in re.findall( + r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + }) + + title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') + description = self._html_search_regex( + r'(?s)id="media-description"[^>]*>(.+?)<', + webpage, 'description', default=None) or remove_start( + self._og_search_description(webpage), title + '. 
') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) + + uploader = self._search_regex( + r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage, + 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class="fullTime"[^>]+title="([^"]+)"', + webpage, 'upload date', fatal=False)) + + location = self._html_search_regex( + r'(?s)<dt>Location</dt>\s*<dd>(.+?)<', + webpage, 'location', fatal=False) + + def extract_count(webpage, label): + return str_to_int(self._search_regex( + r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label, + webpage, label, fatal=False)) + + view_count = extract_count(webpage, 'Views') + comment_count = extract_count(webpage, 'Comments') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader': uploader, + 'location': location, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats + } diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py new file mode 100644 index 0000000..8361fbb --- /dev/null +++ b/yt_dlp/extractor/pinterest.py @@ -0,0 +1,248 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + str_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'''(?x) + https?://(?:[^/]+\.)?pinterest\.(?: + com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx| + dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu| + co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)''' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + info = { + 'title': strip_or_none(traverse_obj(data, 'title', 'grid_title', default='')), + 'description': traverse_obj(data, 'seo_description', 'description'), + 'timestamp': unified_timestamp(data.get('created_at')), + 'thumbnails': thumbnails, + 'uploader': traverse_obj(data, ('closeup_attribution', 'full_name')), + 'uploader_id': str_or_none(traverse_obj(data, ('closeup_attribution', 'id'))), + 'repost_count': int_or_none(data.get('repin_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': traverse_obj(data, ('pin_join', 'visual_annotation'), expected_type=list), + 'tags': traverse_obj(data, 'hashtags', expected_type=list), + } + + urls = [] + formats = [] + duration = None + domain = data.get('domain', '') + if domain.lower() != 'uploaded by user' and traverse_obj(data, ('embed', 'src')): + if not info['title']: + info['title'] = None + return { + '_type': 'url_transparent', + 'url': 
data['embed']['src'], + **info, + } + + elif extract_formats: + video_list = traverse_obj( + data, ('videos', 'video_list'), + ('story_pin_data', 'pages', ..., 'blocks', ..., 'video', 'video_list'), + expected_type=dict, get_all=False, default={}) + for format_id, format_dict in video_list.items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url or format_url in urls: + continue + urls.append(format_url) + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + + return { + 'id': video_id, + 'formats': formats, + 'duration': duration, + 'webpage_url': f'https://www.pinterest.com/pin/{video_id}/', + 'extractor_key': PinterestIE.ie_key(), + 'extractor': PinterestIE.IE_NAME, + **info, + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + # formats found in data['videos'] + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:e29801cab7d741ea8c741bc50c8d00ab', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'repost_count': int, + 'comment_count': int, + 'categories': list, + 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # formats found in data['story_pin_data'] + 'url': 'https://www.pinterest.com/pin/1084663891475263837/', + 'md5': '069ac19919ab9e1e13fa60de46290b03', + 'info_dict': { + 'id': '1084663891475263837', + 'ext': 'mp4', + 'title': 'Gadget, Cool products, Amazon product, technology, Kitchen gadgets', + 'description': 'md5:d0a4b6ae996ff0c6eed83bc869598d13', + 'uploader': 'CoolCrazyGadgets', + 'uploader_id': '1084664028912989237', + 'upload_date': '20211003', + 'timestamp': 1633246654.0, + 'duration': 14.9, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # vimeo.com embed + 'url': 'https://www.pinterest.ca/pin/441282463481903715/', + 'info_dict': { + 'id': '111691128', + 'ext': 'mp4', + 'title': 'Tonite Let\'s All Make Love In London (1967)', + 'description': 'md5:8190f37b3926807809ec57ec21aa77b2', + 'uploader': 'Vimeo', + 'uploader_id': '473792960706651251', + 'upload_date': '20180120', + 'timestamp': 1516409040, + 'duration': 3404, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'uploader_url': 'https://vimeo.com/willardandrade', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = 
r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = self._match_valid_url(url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/yt_dlp/extractor/pixivsketch.py b/yt_dlp/extractor/pixivsketch.py new file mode 100644 index 0000000..850c6f2 --- /dev/null +++ b/yt_dlp/extractor/pixivsketch.py @@ -0,0 +1,118 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class PixivSketchBaseIE(InfoExtractor): + def _call_api(self, video_id, path, referer, note='Downloading JSON metadata'): + response = self._download_json(f'https://sketch.pixiv.net/api/{path}', video_id, note=note, headers={ + 'Referer': referer, + 'X-Requested-With': referer, + }) + errors = traverse_obj(response, ('errors', ..., 'message')) + if errors: + raise ExtractorError(' '.join(f'{e}.' for e in errors)) + return response.get('data') or {} + + +class PixivSketchIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<uploader_id>[a-zA-Z0-9_-]+)/lives/(?P<id>\d+)/?' 
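One detail of PinterestCollectionIE above worth calling out is its cursor-style pagination: each `BoardFeed` response carries an opaque `bookmark` that must be echoed back to fetch the next page. A self-contained sketch of that loop using only the standard library; the endpoint shape and field names follow the extractor above, so treat this as an illustration rather than a supported public API:

```python
import json
import urllib.parse
import urllib.request

def iter_board_pins(board_id, page_size=250):
    """Yield pin dicts from a Pinterest-style BoardFeed, page by page."""
    bookmark = None
    while True:
        options = {'board_id': board_id, 'page_size': page_size}
        if bookmark:
            options['bookmarks'] = [bookmark]
        query = urllib.parse.urlencode({'data': json.dumps({'options': options})})
        with urllib.request.urlopen(
                f'https://www.pinterest.com/resource/BoardFeedResource/get/?{query}') as resp:
            feed = json.load(resp)['resource_response']
        for item in feed.get('data') or []:
            if isinstance(item, dict) and item.get('type') == 'pin':
                yield item
        bookmark = feed.get('bookmark')
        if not bookmark:  # no cursor means the last page was reached
            break

# usage (board id is hypothetical):
# for pin in iter_board_pins('585890301462791043'):
#     print(pin['id'])
```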
+ _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya/lives/3654620468641830507', + 'info_dict': { + 'id': '7370666691623196569', + 'title': 'まにあえクリスマス!', + 'uploader': 'ぬふちゃ', + 'uploader_id': 'nuhutya', + 'channel_id': '9844815', + 'age_limit': 0, + 'timestamp': 1640351536, + }, + 'skip': True, + }, { + # these two (age_limit > 0) requires you to login on website, but it's actually not required for download + 'url': 'https://sketch.pixiv.net/@namahyou/lives/4393103321546851377', + 'info_dict': { + 'id': '4907995960957946943', + 'title': 'クリスマスなんて知らん🖕', + 'uploader': 'すゃもり', + 'uploader_id': 'suya2mori2', + 'channel_id': '31169300', + 'age_limit': 15, + 'timestamp': 1640347640, + }, + 'skip': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki/lives/3553803162487249670', + 'info_dict': { + 'id': '1593420639479156945', + 'title': 'おまけ本作業(リョナ有)', + 'uploader': 'おぶい / Obui', + 'uploader_id': 'oving', + 'channel_id': '17606', + 'age_limit': 18, + 'timestamp': 1640330263, + }, + 'skip': True, + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + data = self._call_api(video_id, f'lives/{video_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + raise ExtractorError(f'This live is offline. Use https://sketch.pixiv.net/@{uploader_id} for ongoing live.', expected=True) + + m3u8_url = traverse_obj(data, ('owner', 'hls_movie', 'url')) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': data.get('name'), + 'formats': formats, + 'uploader': traverse_obj(data, ('user', 'name'), ('owner', 'user', 'name')), + 'uploader_id': traverse_obj(data, ('user', 'unique_name'), ('owner', 'user', 'unique_name')), + 'channel_id': str(traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))), + 'age_limit': 18 if data.get('is_r18') else 15 if data.get('is_r15') else 0, + 'timestamp': unified_timestamp(data.get('created_at')), + 'is_live': True + } + + +class PixivSketchUserIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch:user' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<id>[a-zA-Z0-9_-]+)/?' 
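The `_call_api` helper in PixivSketchBaseIE above treats the API envelope uniformly: failures arrive as a list of `{'message': ...}` objects under `errors`, and the payload sits under `data`. A minimal sketch of that unwrapping, with hypothetical payloads:

```python
import json

def unwrap_sketch_response(raw):
    """Unwrap a PixivSketch-style API envelope; raise on reported errors."""
    response = json.loads(raw)
    errors = [e.get('message') for e in response.get('errors') or []
              if isinstance(e, dict)]
    if any(errors):
        raise RuntimeError(' '.join(f'{e}.' for e in errors if e))
    return response.get('data') or {}

print(unwrap_sketch_response('{"data": {"is_broadcasting": true}}'))
# {'is_broadcasting': True}

try:
    unwrap_sketch_response('{"errors": [{"message": "Login required"}]}')
except RuntimeError as err:
    print(err)  # Login required.
```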
+ _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@namahyou', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return super(PixivSketchUserIE, cls).suitable(url) and not PixivSketchIE.suitable(url) + + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._call_api(user_id, f'lives/users/@{user_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + try: + self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure') + except ExtractorError as ex: + if ex.cause and ex.cause.code == 401: + self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies') + raise ExtractorError('This user is offline', expected=True) + + return self.url_result(f'https://sketch.pixiv.net/@{user_id}/lives/{data["id"]}') diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py new file mode 100644 index 0000000..d67f600 --- /dev/null +++ b/yt_dlp/extractor/pladform.py @@ -0,0 +1,135 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_qs, + xpath_text, + qualities, +) + + +class PladformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + out\.pladform\.ru/player| + static\.pladform\.ru/player\.swf + ) + \?.*\bvideoid=| + video\.pladform\.ru/catalog/video/videoid/ + ) + (?P<id>\d+) + ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1'] + _TESTS = [{ + 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282', + 'info_dict': { + 'id': '6216d548e755edae6e8280667d774791', + 'ext': 'mp4', + 'timestamp': 1406117012, + 'title': 'Гарик Мартиросян и Гарик Харламов - Кастинг на концерт ко Дню милиции', + 'age_limit': 0, + 'upload_date': '20140723', + 'thumbnail': str, + 'view_count': int, + 'description': str, + 'uploader_id': '12082', + 'uploader': 'Comedy Club', + 'duration': 367, + }, + 'expected_warnings': ['HTTP Error 404: Not Found'] + }, { + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', + 'info_dict': { + 'id': '3777899', + 'ext': 'mp4', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3190, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + qs = parse_qs(url) + pl = qs.get('pl', ['1'])[0] + + video = self._download_xml( + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }, fatal=False) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) + + if not video: + targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').url + if targetUrl == url: + raise ExtractorError('Can\'t parse page') + return self.url_result(targetUrl) + + if video.tag == 'error': + fail(video.text) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = 
[] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', fatal=True) + description = self._search_regex( + r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/yt_dlp/extractor/planetmarathi.py b/yt_dlp/extractor/planetmarathi.py new file mode 100644 index 0000000..a4b612a --- /dev/null +++ b/yt_dlp/extractor/planetmarathi.py @@ -0,0 +1,71 @@ +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_strdate, +) + + +class PlanetMarathiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)' + _TESTS = [{ + 'url': 'https://www.planetmarathi.com/titles/ek-unad-divas', + 'playlist_mincount': 2, + 'info_dict': { + 'id': 'ek-unad-divas', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas', + 'ext': 'mp4', + 'title': 'ek unad divas', + 'alt_title': 'चित्रपट', + 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881', + 'episode_number': 1, + 'duration': 5539, + 'upload_date': '20210829', + }, + }] # Trailer skipped + }, { + 'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'baap-beep-baap-season-1', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1', + 'ext': 'mp4', + 'title': 'Manohar Kanhere', + 'alt_title': 'मनोहर कान्हेरे', + 'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6', + 'season_number': 1, + 'episode_number': 1, + 'duration': 29, + 'upload_date': '20210829', + }, + }] # Trailers, Episodes, other Character profiles skipped + }] + + def _real_extract(self, url): + id = self._match_id(url) + entries = [] + json_data = self._download_json(f'https://www.planetmarathi.com/api/v1/titles/{id}/assets', id)['assets'] + for asset in json_data: + asset_title = asset['mediaAssetName']['en'] + if asset_title == 'Movie': + asset_title = id.replace('-', ' ') + asset_id = f'{asset["sk"]}_{id}'.replace('#', '-') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id) + entries.append({ + 'id': asset_id, + 'title': asset_title, + 'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']), + 'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']), + 'season_number': asset.get('mediaAssetSeason'), + 'episode_number': asset.get('mediaAssetIndexForAssetType'), + 
'duration': asset.get('mediaAssetDurationInSeconds'), + 'upload_date': unified_strdate(asset.get('created')), + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result(entries, playlist_id=id) diff --git a/yt_dlp/extractor/platzi.py b/yt_dlp/extractor/platzi.py new file mode 100644 index 0000000..166b98c --- /dev/null +++ b/yt_dlp/extractor/platzi.py @@ -0,0 +1,213 @@ +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class PlatziBaseIE(InfoExtractor): + _LOGIN_URL = 'https://platzi.com/login/' + _NETRC_MACHINE = 'platzi' + + def _perform_login(self, username, password): + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username, + 'password': password, + }) + + urlh = self._request_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + # login succeeded + if 'platzi.com/login' not in urlh.url: + return + + login_error = self._webpage_read_content( + urlh, self._LOGIN_URL, None, 'Downloading login error page') + + login = self._parse_json( + self._search_regex( + r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), + None) + + for kind in ('error', 'password', 'nonFields'): + error = str_or_none(login.get('%sError' % kind)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class PlatziIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P<id>\d+)-[^/?\#&]+ + ''' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + data = self._parse_json( + self._search_regex( + # client_data may contain "};" so that we have to try more + # strict regex first + (r'client_data\s*=\s*({.+?})\s*;\s*\n', + r'client_data\s*=\s*({.+?})\s*;'), + webpage, 'client data'), + lecture_id) + + material = data['initialState']['material'] + desc = material['description'] + title = desc['title'] + + formats = [] + for server_id, server in material['videos'].items(): + if not isinstance(server, dict): + continue + for format_id in ('hls', 'dash'): + format_url = url_or_none(server.get(format_id)) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, lecture_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % 
server_id, + fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, lecture_id, mpd_id=format_id, + note='Downloading %s MPD manifest' % server_id, + fatal=False)) + + content = str_or_none(desc.get('content')) + description = (clean_html(compat_b64decode(content).decode('utf-8')) + if content else None) + duration = int_or_none(material.get('duration'), invscale=60) + + return { + 'id': lecture_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + } + + +class PlatziCourseIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/(?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://platzi.com/clases/next-js/', + 'info_dict': { + 'id': '1311', + 'title': 'Curso de Next.js', + }, + 'playlist_count': 22, + }, { + 'url': 'https://courses.platzi.com/classes/communication-codestream/', + 'info_dict': { + 'id': '1367', + 'title': 'Codestream Course', + }, + 'playlist_count': 14, + }] + + @classmethod + def suitable(cls, url): + return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + webpage = self._download_webpage(url, course_name) + + props = self._parse_json( + self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), + course_name)['initialProps'] + + entries = [] + for chapter_num, chapter in enumerate(props['concepts'], 1): + if not isinstance(chapter, dict): + continue + materials = chapter.get('materials') + if not materials or not isinstance(materials, list): + continue + chapter_title = chapter.get('title') + chapter_id = str_or_none(chapter.get('id')) + for material in materials: + if not isinstance(material, dict): + continue + if material.get('material_type') != 'video': + continue + video_url = urljoin(url, material.get('url')) + if not video_url: + continue + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'title': str_or_none(material.get('name')), + 'id': str_or_none(material.get('id')), + 'ie_key': PlatziIE.ie_key(), + 'chapter': chapter_title, + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + }) + + course_id = compat_str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: x['course']['name'], compat_str) + + return self.playlist_result(entries, course_id, course_title) diff --git a/yt_dlp/extractor/playplustv.py b/yt_dlp/extractor/playplustv.py new file mode 100644 index 0000000..a4439c8 --- /dev/null +++ b/yt_dlp/extractor/playplustv.py @@ -0,0 +1,100 @@ +import json + +from .common import InfoExtractor +from ..networking import PUTRequest +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, clean_html, int_or_none + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, 
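+                  # 'resource' is appended to the base path (e.g. 'Profiles' ->
+                  # /api/media/v2/getProfiles, 'Media' -> .../getMedia);
+                  # video_id is used for progress logging only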
video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _perform_login(self, username, password): + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': username, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError(self._parse_json( + e.cause.response.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_initialize(self): + if not self._token: + self.raise_login_required(method='password') + + def _real_extract(self, url): + project_id, media_id = self._match_valid_url(url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py new file mode 100644 index 0000000..7c5cad1 --- /dev/null +++ b/yt_dlp/extractor/playsuisse.py @@ -0,0 +1,234 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + traverse_obj, + update_url_query, + urlencode_postdata, +) + + +class PlaySuisseIE(InfoExtractor): + _NETRC_MACHINE = 'playsuisse' + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)' + _TESTS = [ + { + # Old URL + 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'only_matching': True, + }, + { + # episode in a series + 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', + 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'info_dict': { + 'id': '763211', + 'ext': 'mp4', + 'title': 'Knochen', + 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'duration': 3344, + 'series': 'Wilder', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Knochen', + 'episode_number': 1, + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + } + }, { + # film + 'url': 'https://www.playsuisse.ch/watch/808675', + 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'info_dict': { + 'id': '808675', + 'ext': 'mp4', + 'title': 'Der Läufer', + 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', + 'duration': 5280, + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + } + }, { + # series 
(treated as a playlist) + 'url': 'https://www.playsuisse.ch/detail/1115687', + 'info_dict': { + 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', + 'id': '1115687', + 'series': 'They all came out to Montreux', + 'title': 'They all came out to Montreux', + }, + 'playlist': [{ + 'info_dict': { + 'description': 'md5:f2462744834b959a31adc6292380cda2', + 'duration': 3180, + 'episode': 'Folge 1', + 'episode_number': 1, + 'id': '1112663', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 1', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27', + 'duration': 2935, + 'episode': 'Folge 2', + 'episode_number': 2, + 'id': '1112661', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 2', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:14a93a3356b2492a8f786ab2227ef602', + 'duration': 2994, + 'episode': 'Folge 3', + 'episode_number': 3, + 'id': '1112664', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 3', + 'ext': 'mp4' + } + }], + } + ] + + _GRAPHQL_QUERY = ''' + query AssetWatch($assetId: ID!) { + assetV2(id: $assetId) { + ...Asset + episodes { + ...Asset + } + } + } + fragment Asset on AssetV2 { + id + name + description + duration + episodeNumber + seasonNumber + seriesName + medias { + type + url + } + thumbnail16x9 { + ...ImageDetails + } + thumbnail2x3 { + ...ImageDetails + } + thumbnail16x9WithTitle { + ...ImageDetails + } + thumbnail2x3WithTitle { + ...ImageDetails + } + } + fragment ImageDetails on AssetImage { + id + url + }''' + _LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com' + _LOGIN_PATH = 'B2C_1A__SignInV2' + _ID_TOKEN = None + + def _perform_login(self, username, password): + login_page = self._download_webpage( + 'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page', + query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'}) + settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None) + + csrf_token = settings['csrf'] + query = {'tx': settings['transId'], 'p': self._LOGIN_PATH} + + status = traverse_obj(self._download_json( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in', + query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({ + 'request_type': 'RESPONSE', + 'signInName': username, + 'password': password + }), expected_status=400), ('status', {int_or_none})) + if status == 400: + raise ExtractorError('Invalid username or password', expected=True) + + urlh = self._request_webpage( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed', + None, 'Downloading ID token', query={ + 'rememberMe': 'false', + 'csrf_token': csrf_token, + **query, + 'diags': '', + }) + + self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0)) + if not self._ID_TOKEN: + raise ExtractorError('Login failed') + + def _get_media_data(self, media_id): + # NOTE In the web app, the "locale" header is used to switch between languages, + # However this doesn't seem to take effect when passing the header here. 
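+        # Standard GraphQL-over-HTTP POST: operationName/query/variables in a
+        # JSON body, with the watch-page id passed as the $assetId variable.
+        # Illustrative wire format:
+        #   {"operationName": "AssetWatch",
+        #    "query": "query AssetWatch($assetId: ID!) { ... }",
+        #    "variables": {"assetId": "763211"}}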
+ response = self._download_json( + 'https://www.playsuisse.ch/api/graphql', + media_id, data=json.dumps({ + 'operationName': 'AssetWatch', + 'query': self._GRAPHQL_QUERY, + 'variables': {'assetId': media_id} + }).encode('utf-8'), + headers={'Content-Type': 'application/json', 'locale': 'de'}) + + return response['data']['assetV2'] + + def _real_extract(self, url): + if not self._ID_TOKEN: + self.raise_login_required(method='password') + + media_id = self._match_id(url) + media_data = self._get_media_data(media_id) + info = self._extract_single(media_data) + if media_data.get('episodes'): + info.update({ + '_type': 'playlist', + 'entries': map(self._extract_single, media_data['episodes']), + }) + return info + + def _extract_single(self, media_data): + thumbnails = traverse_obj(media_data, lambda k, _: k.startswith('thumbnail')) + + formats, subtitles = [], {} + for media in traverse_obj(media_data, 'medias', default=[]): + if not media.get('url') or media.get('type') != 'HLS': + continue + f, subs = self._extract_m3u8_formats_and_subtitles( + update_url_query(media['url'], {'id_token': self._ID_TOKEN}), + media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) + formats.extend(f) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': media_data['id'], + 'title': media_data.get('name'), + 'description': media_data.get('description'), + 'thumbnails': thumbnails, + 'duration': int_or_none(media_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + 'series': media_data.get('seriesName'), + 'season_number': int_or_none(media_data.get('seasonNumber')), + 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, + 'episode_number': int_or_none(media_data.get('episodeNumber')), + } diff --git a/yt_dlp/extractor/playtvak.py b/yt_dlp/extractor/playtvak.py new file mode 100644 index 0000000..c418f88 --- /dev/null +++ b/yt_dlp/extractor/playtvak.py @@ -0,0 +1,185 @@ +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) + + +class PlaytvakIE(InfoExtractor): + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'description': 'md5:4436e61b7df227a093778efb7e373571', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 
'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, + } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info_url = self._html_search_regex( + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + + parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + item = None + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': + item = i + break + if not item: + raise ExtractorError('No suitable stream found') + + quality = qualities(('low', 'middle', 'high')) + + formats = [] + for fmt in item['video']: + video_url = fmt.get('file') + if not video_url: + continue + + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ('mp4', 'webm'): + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + preference = -10 + elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests + continue + else: # Other formats not supported yet + continue + + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) + + title = 
item['title'] + is_live = item['type'] == 'stream' + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': item.get('image'), + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + } diff --git a/yt_dlp/extractor/playwire.py b/yt_dlp/extractor/playwire.py new file mode 100644 index 0000000..1057bff --- /dev/null +++ b/yt_dlp/extractor/playwire.py @@ -0,0 +1,72 @@ +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _EMBED_REGEX = [r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1'] + + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 145.94, + }, + }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing + 'url': 'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' 
in a_format['url'] else 0 + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/yt_dlp/extractor/pluralsight.py b/yt_dlp/extractor/pluralsight.py new file mode 100644 index 0000000..809b656 --- /dev/null +++ b/yt_dlp/extractor/pluralsight.py @@ -0,0 +1,491 @@ +import collections +import json +import os +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_qs, + qualities, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PluralsightBaseIE(InfoExtractor): + _API_BASE = 'https://app.pluralsight.com' + + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + + +class PluralsightIE(PluralsightBaseIE): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' 
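+    # The player URL carries all clip coordinates (author, course, module
+    # name and clip index) as query parameters, so _real_extract() reads them
+    # via parse_qs() instead of the URL path.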
+ _LOGIN_URL = 'https://app.pluralsight.com/id/' + + _NETRC_MACHINE = 'pluralsight' + + _TESTS = [{ + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + }, { + 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', + 'only_matching': True, + }, { + # available without pluralsight account + 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, + }] + + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + + def _perform_login(self, username, password): + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username, + 'Password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error = self._search_regex( + r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): + BLOCKED = 'Your account has been blocked due to suspicious activity' + if BLOCKED in response: + raise ExtractorError( + 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' 
+ % MUST_AGREE, expected=True) + + raise ExtractorError('Unable to log in') + + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + + def _real_extract(self, url): + qs = parse_qs(url) + + author = qs.get('author', [None])[0] + name = qs.get('name', [None])[0] + clip_idx = qs.get('clip', [None])[0] + course_name = qs.get('course', [None])[0] + + if any(not f for f in (author, name, clip_idx, course_name,)): + raise ExtractorError('Invalid URL', expected=True) + + display_id = '%s-%s' % (name, clip_idx) + + course = self._download_course(course_name, url, display_id) + + collection = course['modules'] + + clip = None + + for module_ in collection: + if name in (module_.get('moduleName'), module_.get('name')): + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + clip_index = clip_.get('index') + if clip_index is None: + continue + if compat_str(clip_index) == clip_idx: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, + } + + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + + ALLOWED_QUALITIES = ( + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), + ) + + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/ytdl-org/youtube-dl/issues/7766) + widescreen = course.get('supportsWideScreenVideoFormats') is True + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + 
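+                # AllowedQuality is a namedtuple, but its 'qualities' field is
+                # a plain list, so this append mutates the tuple entries in
+                # place, adding the widescreen variant to every allowed ext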
allowed_quality.qualities.append(best_quality) + + # In order to minimize the number of calls to ViewClip API and reduce + # the probability of being throttled or banned by Pluralsight we will request + # only single format until formats listing was explicitly requested. + if self.get_param('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self.get_param('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self.get_param('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, (best_quality, )), ) + allowed_qualities = guess_allowed_qualities() + + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + clip_post = { + 'author': author, + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), + } + format_id = '%s-%s' % (ext, quality) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + + # Pluralsight tracks multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see + # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead + # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). + # To somewhat reduce the probability of these consequences + # we will sleep random amount of time before each call to ViewClip. + self._sleep( + random.randint(5, 10), display_id, + '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + + if not viewclip: + continue + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? 
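+        # extract_subtitles() is the InfoExtractor wrapper that only invokes
+        # _get_subtitles() above when subtitles were actually requested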
+ subtitles = self.extract_subtitles( + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) + + return { + 'id': clip_id, + 'title': title, + 'duration': duration, + 'creator': author, + 'formats': formats, + 'subtitles': subtitles, + } + + +class PluralsightCourseIE(PluralsightBaseIE): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' + _TESTS = [{ + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + }, { + # available without pluralsight account + 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + # TODO: PSM cookie + + course = self._download_course(course_id, url, course_id) + + title = course['title'] + course_name = course['name'] + course_data = course['modules'] + description = course.get('description') or course.get('shortDescription') + + entries = [] + for num, module in enumerate(course_data, 1): + author = module.get('author') + module_name = module.get('name') + if not author or not module_name: + continue + for clip in module.get('clips', []): + clip_index = int_or_none(clip.get('index')) + if clip_index is None: + continue + clip_url = update_url_query( + '%s/player' % self._API_BASE, query={ + 'mode': 'live', + 'course': course_name, + 'author': author, + 'name': module_name, + 'clip': clip_index, + }) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) + + return self.playlist_result(entries, course_id, title, description) diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py new file mode 100644 index 0000000..5898d92 --- /dev/null +++ b/yt_dlp/extractor/plutotv.py @@ -0,0 +1,195 @@ +import re +import uuid + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + try_get, + url_or_none, +) + + +class PlutoTVIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'''(?x) + https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand + /(?P<video_type>movies|series) + /(?P<series_or_movie_slug>[^/]+) + (?: + (?:/seasons?/(?P<season_no>\d+))? + (?:/episode/(?P<episode_slug>[^/]+))? + )? 
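+                            # the season/episode parts are optional: bare series
+                            # and season URLs match too and are extracted as
+                            # playlists by _real_extract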
+ /?(?:$|[#?])''' + + _INFO_URL = 'https://service-vod.clusters.pluto.tv/v3/vod/slugs/' + _INFO_QUERY_PARAMS = { + 'appName': 'web', + 'appVersion': 'na', + 'clientID': compat_str(uuid.uuid1()), + 'clientModelNumber': 'na', + 'serverSideAds': 'false', + 'deviceMake': 'unknown', + 'deviceModel': 'web', + 'deviceType': 'web', + 'deviceVersion': 'unknown', + 'sid': compat_str(uuid.uuid1()), + } + _TESTS = [ + { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/2/episode/its-in-the-cards-2009-2-3', + 'md5': 'ebcdd8ed89aaace9df37924f722fd9bd', + 'info_dict': { + 'id': '5de6c598e9379ae4912df0a8', + 'ext': 'mp4', + 'title': 'It\'s In The Cards', + 'episode': 'It\'s In The Cards', + 'description': 'The teams face off against each other in a 3-on-2 soccer showdown. Strategy comes into play, though, as each team gets to select their opposing teams’ two defenders.', + 'series': 'I Love Money', + 'season_number': 2, + 'episode_number': 3, + 'duration': 3600, + } + }, { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/1/', + 'playlist_count': 11, + 'info_dict': { + 'id': '5de6c582e9379ae4912dedbd', + 'title': 'I Love Money - Season 1', + } + }, { + 'url': 'https://pluto.tv/on-demand/series/i-love-money/', + 'playlist_count': 26, + 'info_dict': { + 'id': '5de6c582e9379ae4912dedbd', + 'title': 'I Love Money', + } + }, { + 'url': 'https://pluto.tv/on-demand/movies/arrival-2015-1-1', + 'md5': '3cead001d317a018bf856a896dee1762', + 'info_dict': { + 'id': '5e83ac701fa6a9001bb9df24', + 'ext': 'mp4', + 'title': 'Arrival', + 'description': 'When mysterious spacecraft touch down across the globe, an elite team - led by expert translator Louise Banks (Academy Award® nominee Amy Adams) – races against time to decipher their intent.', + 'duration': 9000, + } + }, { + 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1', + 'only_matching': True, + }, { + 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1', + 'only_matching': True, + }, + { + 'url': 'https://pluto.tv/en/on-demand/movies/attack-of-the-killer-tomatoes-1977-1-1-ptv1', + 'md5': '7db56369c0da626a32d505ec6eb3f89f', + 'info_dict': { + 'id': '5b190c7bb0875c36c90c29c4', + 'ext': 'mp4', + 'title': 'Attack of the Killer Tomatoes', + 'description': 'A group of scientists band together to save the world from mutated tomatoes that KILL! 
(1978)', + 'duration': 5700, + } + } + ] + + def _to_ad_free_formats(self, video_id, formats, subtitles): + ad_free_formats, ad_free_subtitles, m3u8_urls = [], {}, set() + for fmt in formats: + res = self._download_webpage( + fmt.get('url'), video_id, note='Downloading m3u8 playlist', + fatal=False) + if not res: + continue + first_segment_url = re.search( + r'^(https?://.*/)0\-(end|[0-9]+)/[^/]+\.ts$', res, + re.MULTILINE) + if first_segment_url: + m3u8_urls.add( + compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8')) + continue + first_segment_url = re.search( + r'^(https?://.*/).+\-0+[0-1]0\.ts$', res, + re.MULTILINE) + if first_segment_url: + m3u8_urls.add( + compat_urlparse.urljoin(first_segment_url.group(1), 'master.m3u8')) + continue + + for m3u8_url in m3u8_urls: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + ad_free_formats.extend(fmts) + ad_free_subtitles = self._merge_subtitles(ad_free_subtitles, subs) + if ad_free_formats: + formats, subtitles = ad_free_formats, ad_free_subtitles + else: + self.report_warning('Unable to find ad-free formats') + return formats, subtitles + + def _get_video_info(self, video_json, slug, series_name=None): + video_id = video_json.get('_id', slug) + formats, subtitles = [], {} + for video_url in try_get(video_json, lambda x: x['stitched']['urls'], list) or []: + if video_url.get('type') != 'hls': + continue + url = url_or_none(video_url.get('url')) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + formats, subtitles = self._to_ad_free_formats(video_id, formats, subtitles) + + info = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': video_json.get('name'), + 'description': video_json.get('description'), + 'duration': float_or_none(video_json.get('duration'), scale=1000), + } + if series_name: + info.update({ + 'series': series_name, + 'episode': video_json.get('name'), + 'season_number': int_or_none(video_json.get('season')), + 'episode_number': int_or_none(video_json.get('number')), + }) + return info + + def _real_extract(self, url): + mobj = self._match_valid_url(url).groupdict() + info_slug = mobj['series_or_movie_slug'] + video_json = self._download_json(self._INFO_URL + info_slug, info_slug, query=self._INFO_QUERY_PARAMS) + + if mobj['video_type'] == 'series': + series_name = video_json.get('name', info_slug) + season_number, episode_slug = mobj.get('season_number'), mobj.get('episode_slug') + + videos = [] + for season in video_json['seasons']: + if season_number is not None and season_number != int_or_none(season.get('number')): + continue + for episode in season['episodes']: + if episode_slug is not None and episode_slug != episode.get('slug'): + continue + videos.append(self._get_video_info(episode, episode_slug, series_name)) + if not videos: + raise ExtractorError('Failed to find any videos to extract') + if episode_slug is not None and len(videos) == 1: + return videos[0] + playlist_title = series_name + if season_number is not None: + playlist_title += ' - Season %d' % season_number + return self.playlist_result(videos, + playlist_id=video_json.get('_id', info_slug), + playlist_title=playlist_title) + return self._get_video_info(video_json, info_slug) diff --git a/yt_dlp/extractor/podbayfm.py b/yt_dlp/extractor/podbayfm.py new file mode 100644 index 
0000000..2a26fd2 --- /dev/null +++ b/yt_dlp/extractor/podbayfm.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call + + +def result_from_props(props, episode_id=None): + return { + 'id': props.get('podcast_id') or episode_id, + 'title': props.get('title'), + 'url': props['mediaURL'], + 'ext': 'mp3', + 'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']), + 'timestamp': props.get('timestamp'), + 'duration': int_or_none(props.get('duration')), + } + + +class PodbayFMIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400', + 'md5': '98b41285dcf7989d105a4ed0404054cf', + 'info_dict': { + 'id': '1647338400', + 'title': 'Part One: Kissinger', + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1647338400, + 'duration': 5001, + 'upload_date': '20220315', + }, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + data = self._search_nextjs_data(webpage, episode_id) + return result_from_props(data['props']['pageProps']['episode'], episode_id) + + +class PodbayFMChannelIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards', + 'info_dict': { + 'id': 'behind-the-bastards', + 'title': 'Behind the Bastards', + }, + }] + _PAGE_SIZE = 10 + + def _fetch_page(self, channel_id, pagenum): + return self._download_json( + f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}', + channel_id)['podcast'] + + @staticmethod + def _results_from_page(channel_id, page): + return [{ + **result_from_props(e), + 'extractor': PodbayFMIE.IE_NAME, + 'extractor_key': PodbayFMIE.ie_key(), + # somehow they use timestamps as the episode identifier + 'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}', + } for e in page['episodes']] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + first_page = self._fetch_page(channel_id, 0) + entries = OnDemandPagedList( + lambda pagenum: self._results_from_page( + channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page), + self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id, first_page.get('title')) diff --git a/yt_dlp/extractor/podchaser.py b/yt_dlp/extractor/podchaser.py new file mode 100644 index 0000000..fc2d407 --- /dev/null +++ b/yt_dlp/extractor/podchaser.py @@ -0,0 +1,97 @@ +import functools +import json + +from .common import InfoExtractor +from ..utils import ( + OnDemandPagedList, + float_or_none, + str_or_none, + str_to_int, + traverse_obj, + unified_timestamp, +) + + +class PodchaserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?podchaser\.com/podcasts/[\w-]+-(?P<podcast_id>\d+)(?:/episodes/[\w-]+-(?P<id>\d+))?' + _PAGE_SIZE = 100 + _TESTS = [{ + 'url': 'https://www.podchaser.com/podcasts/cum-town-36924/episodes/ep-285-freeze-me-off-104365585', + 'info_dict': { + 'id': '104365585', + 'title': 'Ep. 
285 – freeze me off', + 'description': 'cam ahn', + 'thumbnail': r're:^https?://.*\.jpg$', + 'ext': 'mp3', + 'categories': ['Comedy'], + 'tags': ['comedy', 'dark humor'], + 'series': 'Cum Town', + 'duration': 3708, + 'timestamp': 1636531259, + 'upload_date': '20211110', + 'average_rating': 4.0 + } + }, { + 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', + 'info_dict': { + 'id': '28853', + 'title': 'The Bone Zone', + 'description': 'Podcast by The Bone Zone', + }, + 'playlist_count': 275 + }, { + 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', + 'info_dict': { + 'id': '699349', + 'title': 'Sean Carroll\'s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas', + 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1' + }, + 'playlist_mincount': 225 + }] + + @staticmethod + def _parse_episode(episode, podcast): + return { + 'id': str(episode.get('id')), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'url': episode.get('audio_url'), + 'thumbnail': episode.get('image_url'), + 'duration': str_to_int(episode.get('length')), + 'timestamp': unified_timestamp(episode.get('air_date')), + 'average_rating': float_or_none(episode.get('rating')), + 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), + 'tags': traverse_obj(podcast, ('tags', ..., 'text')), + 'series': podcast.get('title'), + } + + def _call_api(self, path, *args, **kwargs): + return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) + + def _fetch_page(self, podcast_id, podcast, page): + json_response = self._call_api( + 'list/episode', podcast_id, + headers={'Content-Type': 'application/json;charset=utf-8'}, + data=json.dumps({ + 'start': page * self._PAGE_SIZE, + 'count': self._PAGE_SIZE, + 'sort_order': 'SORT_ORDER_RECENT', + 'filters': { + 'podcast_id': podcast_id + }, + 'options': {} + }).encode()) + + for episode in json_response['entities']: + yield self._parse_episode(episode, podcast) + + def _real_extract(self, url): + podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id') + podcast = self._call_api(f'podcasts/{podcast_id}', episode_id or podcast_id) + if not episode_id: + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), + str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) + + episode = self._call_api(f'episodes/{episode_id}', episode_id) + return self._parse_episode(episode, podcast) diff --git a/yt_dlp/extractor/podomatic.py b/yt_dlp/extractor/podomatic.py new file mode 100644 index 0000000..37b6869 --- /dev/null +++ b/yt_dlp/extractor/podomatic.py @@ -0,0 +1,74 @@ +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PodomaticIE(InfoExtractor): + _WORKING = False + IE_NAME = 'podomatic' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' + + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. 
When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + channel = mobj.group('channel') or mobj.group('channel_2') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, 'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + if not video_url: + video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int_or_none(data.get('length'), 1000) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py new file mode 100644 index 0000000..0911893 --- /dev/null +++ b/yt_dlp/extractor/pokemon.py @@ -0,0 +1,136 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + js_to_json, + merge_dicts, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', + 'info_dict': { + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', + 'ext': 'mp4', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', + 'info_dict': { + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', + 'ext': 'mp4', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', 
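+            # last-resort fallback: scrape the visible <h1 class="us-title"> heading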
webpage, 'title') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } + + +class PokemonWatchIE(InfoExtractor): + _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})' + _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' + _TESTS = [{ + 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', + 'md5': '62833938a31e61ab49ada92f524c42ff', + 'info_dict': { + 'id': '8309a40969894a8e8d5bc1311e9c5667', + 'ext': 'mp4', + 'title': 'Lillier and the Staff!', + 'description': 'md5:338841b8c21b283d24bdc9b568849f04', + } + }, { + 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', + 'only_matching': True + }, { + 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', + 'only_matching': True + }] + + def _extract_media(self, channel_array, video_id): + for channel in channel_array: + for media in channel.get('media'): + if media.get('id') == video_id: + return media + return None + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = { + '_type': 'url', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + # API call can be avoided entirely if we are listing formats + if self.get_param('listformats', False): + return info + + webpage = self._download_webpage(url, video_id) + build_vars = self._parse_json(self._search_regex( + r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), + video_id, transform_source=js_to_json) + region = build_vars.get('region') + channel_array = self._download_json(self._API_URL.format(region), video_id) + video_data = self._extract_media(channel_array, video_id) + + if video_data is None: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + info['_type'] = 'url_transparent' + images = video_data.get('images') + + return merge_dicts(info, { + 'title': video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': images.get('medium') or images.get('small'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('season')), + 'episode': video_data.get('title'), + 'episode_number': int_or_none(video_data.get('episode')), + }) diff --git a/yt_dlp/extractor/pokergo.py b/yt_dlp/extractor/pokergo.py new file mode 100644 index 0000000..5c7baad --- /dev/null +++ b/yt_dlp/extractor/pokergo.py @@ -0,0 +1,106 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get, +) + + +class PokerGoBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pokergo' + _AUTH_TOKEN = None + _PROPERTY_ID = '1dfb3940-7d53-4980-b0b0-f28b369a000d' + + def _perform_login(self, username, password): + if self._AUTH_TOKEN: + return + self.report_login() + PokerGoBaseIE._AUTH_TOKEN = self._download_json( + f'https://subscription.pokergo.com/properties/{self._PROPERTY_ID}/sign-in', None, + headers={'authorization': f'Basic {base64.b64encode(f"{username}:{password}".encode()).decode()}'}, + data=b'')['meta']['token'] + if not self._AUTH_TOKEN: + raise ExtractorError('Unable to get Auth 
Token.', expected=True) + + def _real_initialize(self): + if not self._AUTH_TOKEN: + self.raise_login_required(method='password') + + +class PokerGoIE(PokerGoBaseIE): + _VALID_URL = r'https?://(?:www\.)?pokergo\.com/videos/(?P<id>[^&$#/?]+)' + + _TESTS = [{ + 'url': 'https://www.pokergo.com/videos/2a70ec4e-4a80-414b-97ec-725d9b72a7dc', + 'info_dict': { + 'id': 'aVLOxDzY', + 'ext': 'mp4', + 'title': 'Poker After Dark | Season 12 (2020) | Cry Me a River | Episode 2', + 'description': 'md5:c7a8c29556cbfb6eb3c0d5d622251b71', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/aVLOxDzY/poster.jpg?width=720', + 'timestamp': 1608085715, + 'duration': 2700.12, + 'season_number': 12, + 'episode_number': 2, + 'series': 'poker after dark', + 'upload_date': '20201216', + 'season': 'Season 12', + 'episode': 'Episode 2', + 'display_id': '2a70ec4e-4a80-414b-97ec-725d9b72a7dc', + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{id}', id, + headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] + v_id = data_json['source'] + + thumbnails = [{ + 'url': image['url'], + 'id': image.get('label'), + 'width': image.get('width'), + 'height': image.get('height') + } for image in data_json.get('images') or [] if image.get('url')] + series_json = next(dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == id) or {} + + return { + '_type': 'url_transparent', + 'display_id': id, + 'title': data_json.get('title'), + 'description': data_json.get('description'), + 'duration': data_json.get('duration'), + 'thumbnails': thumbnails, + 'season_number': series_json.get('season'), + 'episode_number': series_json.get('episode_number'), + 'series': try_get(series_json, lambda x: x['tag']['name']), + 'url': f'https://cdn.jwplayer.com/v2/media/{v_id}' + } + + +class PokerGoCollectionIE(PokerGoBaseIE): + _VALID_URL = r'https?://(?:www\.)?pokergo\.com/collections/(?P<id>[^&$#/?]+)' + + _TESTS = [{ + 'url': 'https://www.pokergo.com/collections/19ffe481-5dae-481a-8869-75cc0e3c4700', + 'playlist_mincount': 13, + 'info_dict': { + 'id': '19ffe481-5dae-481a-8869-75cc0e3c4700', + }, + }] + + def _entries(self, id): + data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{id}?include=entities', + id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] + for video in data_json.get('collection_video') or []: + video_id = video.get('id') + if video_id: + yield self.url_result( + f'https://www.pokergo.com/videos/{video_id}', + ie=PokerGoIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py new file mode 100644 index 0000000..1524a1f --- /dev/null +++ b/yt_dlp/extractor/polsatgo.py @@ -0,0 +1,86 @@ +from uuid import uuid4 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + url_or_none, + ExtractorError, +) + + +class PolsatGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polsat(?:box)?go\.pl/.+/(?P<id>[0-9a-fA-F]+)(?:[/#?]|$)' + _TESTS = [{ + 'url': 'https://polsatgo.pl/wideo/seriale/swiat-wedlug-kiepskich/5024045/sezon-1/5028300/swiat-wedlug-kiepskich-odcinek-88/4121', + 'info_dict': { + 'id': '4121', + 'ext': 'mp4', + 'title': 'Świat według Kiepskich - Odcinek 88', + 
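+            # mapped from displayInfo['ageGroup'] in _real_extract() below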
'age_limit': 12, + }, + }] + + def _extract_formats(self, sources, video_id): + for source in sources or []: + if not source.get('id'): + continue + url = url_or_none(self._call_api( + 'drm', video_id, 'getPseudoLicense', + {'mediaId': video_id, 'sourceId': source['id']}).get('url')) + if not url: + continue + yield { + 'url': url, + 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('navigation', video_id, 'prePlayData', {'mediaId': video_id})['mediaItem'] + + formats = list(self._extract_formats( + try_get(media, lambda x: x['playback']['mediaSources']), video_id)) + + return { + 'id': video_id, + 'title': media['displayInfo']['title'], + 'formats': formats, + 'age_limit': int_or_none(media['displayInfo']['ageGroup']) + } + + def _call_api(self, endpoint, media_id, method, params): + rand_uuid = str(uuid4()) + res = self._download_json( + f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, + note=f'Downloading {method} JSON metadata', + data=json.dumps({ + 'method': method, + 'id': '2137', + 'jsonrpc': '2.0', + 'params': { + **params, + 'userAgentData': { + 'deviceType': 'mobile', + 'application': 'native', + 'os': 'android', + 'build': 10003, + 'widevine': False, + 'portal': 'pg', + 'player': 'cpplayer', + }, + 'deviceId': { + 'type': 'other', + 'value': rand_uuid, + }, + 'clientId': rand_uuid, + 'cpid': 1, + }, + }).encode('utf-8'), + headers={'Content-type': 'application/json'}) + if not res.get('result'): + if res['error']['code'] == 13404: + raise ExtractorError('This video is either unavailable in your region or is DRM protected', expected=True) + raise ExtractorError(f'Solorz said: {res["error"]["message"]} - {res["error"]["data"]["userMessage"]}') + return res['result'] diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py new file mode 100644 index 0000000..e0b22ff --- /dev/null +++ b/yt_dlp/extractor/polskieradio.py @@ -0,0 +1,610 @@ +import itertools +import json +import math +import re +import urllib.parse + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + InAdvancePagedList, + determine_ext, + extract_attributes, + int_or_none, + js_to_json, + parse_iso8601, + strip_or_none, + traverse_obj, + unescapeHTML, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PolskieRadioBaseExtractor(InfoExtractor): + def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file']) + if media_url in media_urls: + continue + media_urls.add(media_url) + entry = base_data.copy() + entry.update({ + 'id': compat_str(media['id']), + 'url': media_url, + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + }) + entry_title = urllib.parse.unquote(media['desc']) + if entry_title: + entry['title'] = entry_title + yield entry + + +class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): + # legacy sites + IE_NAME = 'polskieradio:legacy' + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', + 'info_dict': { + 'id': '2534482', + 'title': 'Żagaryści. Poezja jak spoiwo', + 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695', + }, + 'playlist': [{ + 'md5': 'd07559829f61d5a93a75755987ded760', + 'info_dict': { + 'id': '2516679', + 'ext': 'mp3', + 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c', + 'timestamp': 1592654400, + 'upload_date': '20200620', + 'duration': 1430, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + # PR4 audition - other frontend + 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301', + 'info_dict': { + 'id': '2610977', + 'ext': 'mp3', + 'title': 'Pogłos 29 października godz. 23:01', + }, + }, { + 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, playlist_id) + if PolskieRadioIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioIE, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', + webpage, 'content', default=None) + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', default=None)) + + thumbnail_url = self._og_search_thumbnail(webpage, default=None) + + title = self._og_search_title(webpage).strip() + + description = strip_or_none(self._og_search_description(webpage, default=None)) + description = description.replace('\xa0', ' ') if description is not None else None + + if not content: + return { + 'id': playlist_id, + 'url': self._proto_relative_url( + self._search_regex( + r"source:\s*'(//static\.prsa\.pl/[^']+)'", + webpage, 'audition record url')), + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + } + + entries = self._extract_webpage_player_entries(content, playlist_id, { + 'title': title, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + }) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioIE(PolskieRadioBaseExtractor): + # new next.js sites + _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)' + _TESTS = [{ + # articleData, attachments + 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. 
Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '7a85d429-5356-4def-a347-925e4ae7406b', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + }, + }], + }, { + # post, legacy html players + 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager', + 'info_dict': { + 'id': '2589163', + 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?', + 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2577880', + 'ext': 'mp3', + 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a', + 'duration': 321, + }, + }], + }, { + # data, legacy + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0', + }, + 'playlist_count': 3, + }, { + 'url': 'https://trojka.polskieradio.pl/artykul/1632955', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'https://trojka.polskieradio.pl/artykul/1634903', + 'only_matching': True, + }, { + 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + article_data = traverse_obj( + self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False) + + title = strip_or_none(article_data['title']) + + description = strip_or_none(article_data.get('lead')) + + entries = [{ + 'url': entry['file'], + 'ext': determine_ext(entry.get('fileName')), + 'id': self._search_regex( + r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'), + 'title': strip_or_none(entry.get('description')) or title, + } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )] + + if not entries: + # some legacy articles have no json attachments, but players in body + entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, { + 'title': title, + }) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioAuditionIE(InfoExtractor): + # new next.js sites + IE_NAME = 'polskieradio:audition' + _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)' + _TESTS = [{ + # articles, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5102', + 'info_dict': { + 'id': '5102', + 'title': 'Historia żywa', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_mincount': 38, + }, { + # episodes, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5769', + 'info_dict': { + 'id': '5769', + 'title': 'AgroFakty', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_mincount': 269, + }, { + # both episodes and articles, PR3 + 'url': 'https://trojka.polskieradio.pl/audycja/8906', + 'info_dict': { + 'id': '8906', + 'title': 'Trójka budzi', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_mincount': 722, + }, { + # some articles were "promoted to main page" and thus link to old frontend + 'url': 'https://trojka.polskieradio.pl/audycja/305', + 'info_dict': { + 'id': '305', + 'title': 'Co w mowie piszczy?', + 'thumbnail': 
r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_count': 1523, + }] + + def _call_lp3(self, path, query, video_id, note): + return self._download_json( + f'https://lp3test.polskieradio.pl/{path}', video_id, note, + query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}) + + def _entries(self, playlist_id, has_episodes, has_articles): + for i in itertools.count(0) if has_episodes else []: + page = self._call_lp3( + 'AudioArticle/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 10, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading episode list page {i + 1}') + if not traverse_obj(page, 'data'): + break + for episode in page['data']: + yield { + 'id': str(episode['id']), + 'url': episode['file'], + 'title': episode.get('title'), + 'duration': int_or_none(episode.get('duration')), + 'timestamp': parse_iso8601(episode.get('datePublic')), + } + + for i in itertools.count(0) if has_articles else []: + page = self._call_lp3( + 'Article/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 9, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading article list page {i + 1}') + if not traverse_obj(page, 'data'): + break + for article in page['data']: + yield { + '_type': 'url_transparent', + 'id': str(article['id']), + 'url': article['url'], + 'title': article.get('shortTitle'), + 'description': traverse_obj(article, ('description', 'lead')), + 'timestamp': parse_iso8601(article.get('datePublic')), + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + page_props = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id), + ('props', 'pageProps', ('data', None)), get_all=False) + + has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios')) + has_articles = bool(traverse_obj(page_props, 'articles')) + + return self.playlist_result( + self._entries(playlist_id, has_episodes, has_articles), playlist_id, + title=traverse_obj(page_props, ('details', 'name')), + description=traverse_obj(page_props, ('details', 'description', 'lead')), + thumbnail=traverse_obj(page_props, ('details', 'photo'))) + + +class PolskieRadioCategoryIE(InfoExtractor): + # legacy sites + IE_NAME = 'polskieradio:category' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + # billennium tabs + 'url': 'https://www.polskieradio.pl/8/2385', + 'info_dict': { + 'id': '2385', + 'title': 'Droga przez mąkę', + }, + 'playlist_mincount': 111, + }, { + 'url': 'https://www.polskieradio.pl/10/4930', + 'info_dict': { + 'id': '4930', + 'title': 'Teraz K-pop!', + }, + 'playlist_mincount': 392, + }, { + # post back pages, audio content directly without articles + 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa', + 'info_dict': { + 'id': '7376', + 'title': 'Nowa mowa', + }, + 'playlist_mincount': 244, + }, { + 'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458', + 'info_dict': { + 'id': '175458', + 'title': 'Krzysztof Dziuba', + }, + 'playlist_mincount': 420, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, 
url): + return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url) + + def _entries(self, url, page, category_id): + content = page + is_billennium_tabs = 'onclick="TB_LoadTab(' in page + is_post_back = 'onclick="__doPostBack(' in page + pagination = page if is_billennium_tabs else None + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + content): + entry = extract_attributes(a_entry) + if entry.get('href'): + yield self.url_result( + urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title')) + for a_entry in re.findall(r'<span data-media=({[^ ]+})', content): + yield traverse_obj(self._parse_json(a_entry, category_id), { + 'url': 'file', + 'id': 'uid', + 'duration': 'length', + 'title': ('title', {urllib.parse.unquote}), + 'description': ('desc', {urllib.parse.unquote}), + }) + if is_billennium_tabs: + params = self._search_json( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(', + pagination, 'next page params', category_id, default=None, close_objects=1, + contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x))) + if not params: + break + tab_content = self._download_json( + 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent', + category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'}, + data=json.dumps(dict(zip(( + 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode', + 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate', + 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber' + ), params))).encode())['d'] + content, pagination = tab_content['Content'], tab_content.get('PagerContent') + elif is_post_back: + target = self._search_regex( + r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)', + content, 'pagination postback target', group='target', default=None) + if not target: + break + content = self._download_webpage( + url, category_id, f'Downloading page {page_num}', + data=urllib.parse.urlencode({ + **self._hidden_inputs(content), + '__EVENTTARGET': target, + '__EVENTARGUMENT': 'Next', + }).encode()) + else: + next_url = urljoin(url, self._search_regex( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content, 'next page url', group='url', default=None)) + if not next_url: + break + content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}') + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, category_id) + if PolskieRadioAuditionIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id) + title = self._html_search_regex( + r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) + + +class PolskieRadioPlayerIE(InfoExtractor): + IE_NAME = 'polskieradio:player' + _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P[^/]+)' + + _BASE_URL = 'https://player.polskieradio.pl' + _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js' + _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje' + + _TESTS = [{ + 'url': 
'https://player.polskieradio.pl/anteny/trojka', + 'info_dict': { + 'id': '3', + 'ext': 'm4a', + 'title': 'Trójka', + }, + 'params': { + 'format': 'bestaudio', + 'skip_download': 'endless stream', + }, + }] + + def _get_channel_list(self, channel_url='no_channel'): + player_code = self._download_webpage( + self._PLAYER_URL, channel_url, + note='Downloading js player') + channel_list = js_to_json(self._search_regex( + r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list')) + return self._parse_json(channel_list, channel_url) + + def _real_extract(self, url): + channel_url = self._match_id(url) + channel_list = self._get_channel_list(channel_url) + + channel = next((c for c in channel_list if c.get('url') == channel_url), None) + + if not channel: + raise ExtractorError('Channel not found') + + station_list = self._download_json(self._STATIONS_API_URL, channel_url, + note='Downloading stream url list', + headers={ + 'Accept': 'application/json', + 'Referer': url, + 'Origin': self._BASE_URL, + }) + station = next((s for s in station_list + if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None) + if not station: + raise ExtractorError('Station not found even though we extracted channel') + + formats = [] + for stream_url in station['Streams']: + stream_url = self._proto_relative_url(stream_url) + if stream_url.endswith('/playlist.m3u8'): + formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True)) + elif stream_url.endswith('/manifest.f4m'): + formats.extend(self._extract_mpd_formats(stream_url, channel_url)) + elif stream_url.endswith('/Manifest'): + formats.extend(self._extract_ism_formats(stream_url, channel_url)) + else: + formats.append({ + 'url': stream_url, + }) + + return { + 'id': compat_str(channel['id']), + 'formats': formats, + 'title': channel.get('name') or channel.get('streamName'), + 'display_id': channel_url, + 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png', + 'is_live': True, + } + + +class PolskieRadioPodcastBaseExtractor(InfoExtractor): + _API_BASE = 'https://apipodcasts.polskieradio.pl/api' + + def _parse_episode(self, data): + return { + 'id': data['guid'], + 'formats': [{ + 'url': data['url'], + 'filesize': int_or_none(data.get('fileSize')), + }], + 'title': data['title'], + 'description': data.get('description'), + 'duration': int_or_none(data.get('length')), + 'timestamp': parse_iso8601(data.get('publishDate')), + 'thumbnail': url_or_none(data.get('image')), + 'series': data.get('podcastTitle'), + 'episode': data['title'], + } + + +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast:list' + _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/podcast/8/', + 'info_dict': { + 'id': '8', + 'title': 'Śniadanie w Trójce', + 'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef', + 'uploader': 'Beata Michniewicz', + }, + 'playlist_mincount': 714, + }] + _PAGE_SIZE = 10 + + def _call_api(self, podcast_id, page): + return self._download_json( + f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}', + podcast_id, f'Downloading page {page}') + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._call_api(podcast_id, 1) + + def get_page(page_num): + page_data = self._call_api(podcast_id, page_num + 1) if page_num else data + yield from (self._parse_episode(ep) for ep in page_data['items']) + + return { + '_type': 'playlist', + 
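# InAdvancePagedList below calls get_page lazily, one _PAGE_SIZE-item API page at a time;
+            # the total page count is math.ceil(itemCount / _PAGE_SIZE)
+            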
'entries': InAdvancePagedList( + get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE), + 'id': str(data['id']), + 'title': data.get('title'), + 'description': data.get('description'), + 'uploader': data.get('announcer'), + } + + +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast' + _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32', + 'info_dict': { + 'id': '6eafe403-cb8f-4756-b896-4455c3713c32', + 'ext': 'mp3', + 'title': 'Theresa May rezygnuje. Co dalej z brexitem?', + 'description': 'md5:e41c409a29d022b70ef0faa61dbded60', + 'episode': 'Theresa May rezygnuje. Co dalej z brexitem?', + 'duration': 2893, + 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg', + 'series': 'Raport o stanie świata', + }, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._download_json( + f'{self._API_BASE}/audio', + podcast_id, 'Downloading podcast metadata', + data=json.dumps({ + 'guids': [podcast_id], + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + return self._parse_episode(data[0]) diff --git a/yt_dlp/extractor/popcorntimes.py b/yt_dlp/extractor/popcorntimes.py new file mode 100644 index 0000000..ddc5ec8 --- /dev/null +++ b/yt_dlp/extractor/popcorntimes.py @@ -0,0 +1,91 @@ +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import int_or_none + + +class PopcorntimesIE(InfoExtractor): + _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', + 'md5': '93f210991ad94ba8c3485950a2453257', + 'info_dict': { + 'id': 'A1XCFvz', + 'display_id': 'haensel-und-gretel-opera-fantasy', + 'ext': 'mp4', + 'title': 'Hänsel und Gretel', + 'description': 'md5:1b8146791726342e7b22ce8125cf6945', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'John Paul', + 'release_date': '19541009', + 'duration': 4260, + 'tbr': 5380, + 'width': 720, + 'height': 540, + }, + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'

([^<]+)', webpage, 'title', + default=None) or self._html_search_meta( + 'ya:ovs:original_name', webpage, 'title', fatal=True) + + loc = self._search_regex( + r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', + group='value') + + loc_b64 = '' + for c in loc: + c_ord = ord(c) + if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): + upper = ord('Z') if c_ord <= ord('Z') else ord('z') + c_ord += 13 + if upper < c_ord: + c_ord -= 26 + loc_b64 += chr(c_ord) + + video_url = compat_b64decode(loc_b64).decode('utf-8') + + description = self._html_search_regex( + r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/yt_dlp/extractor/popcorntv.py b/yt_dlp/extractor/popcorntv.py new file mode 100644 index 0000000..7798462 --- /dev/null +++ b/yt_dlp/extractor/popcorntv.py @@ -0,0 +1,72 @@ +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r']+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)]+itemprop=["\']description[^>]*>(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + duration = 
int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/yt_dlp/extractor/porn91.py b/yt_dlp/extractor/porn91.py new file mode 100644 index 0000000..7d16a16 --- /dev/null +++ b/yt_dlp/extractor/porn91.py @@ -0,0 +1,95 @@ +import urllib.parse +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + remove_end, + unified_strdate, + ExtractorError, +) + + +class Porn91IE(InfoExtractor): + IE_NAME = '91porn' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P\w+)' + + _TESTS = [{ + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': 'd869db281402e0ef4ddef3c38b866f86', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'description': 'md5:1ff241f579b07ae936a54e810ad2e891', + 'ext': 'mp4', + 'duration': 431, + 'upload_date': '20150520', + 'comment_count': int, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c', + 'md5': 'f8fd50540468a6d795378cd778b40226', + 'info_dict': { + 'id': '7ef0cf3d362c699ab91c', + 'title': '真实空乘,冲上云霄第二部', + 'description': 'md5:618bf9652cafcc66cd277bd96789baea', + 'ext': 'mp4', + 'duration': 248, + 'upload_date': '20221119', + 'comment_count': int, + 'view_count': int, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('91porn.com', 'language', 'cn_CN') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) + + if '视频不存在,可能已经被删除或者被举报为不良内容!' 
in webpage: + raise ExtractorError('91 Porn says: Video does not exist', expected=True) + + daily_limit = self._search_regex( + r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False) + if daily_limit: + raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True) + + video_link_url = self._search_regex( + r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link') + video_link_url = self._search_regex( + r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link') + + formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id) + + return { + 'id': video_id, + 'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(self._search_regex( + r'(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False)), + 'description': self._html_search_regex( + r'\s*([^<]+)', webpage, 'description', fatal=False), + 'duration': parse_duration(self._search_regex( + r'时长:\s*]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)), + 'comment_count': int_or_none(self._search_regex( + r'留言:\s*]*>\s*(\d+)\s*', webpage, 'comment count', fatal=False)), + 'view_count': int_or_none(self._search_regex( + r'热度:\s*]*>\s*(\d+)\s*', webpage, 'view count', fatal=False)), + 'age_limit': 18, + } + + def _get_formats_and_subtitle(self, video_link_url, video_id): + ext = determine_ext(video_link_url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4') + else: + formats = [{'url': video_link_url, 'ext': ext}] + subtitles = {} + + return formats, subtitles diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py new file mode 100644 index 0000000..c381382 --- /dev/null +++ b/yt_dlp/extractor/pornbox.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..compat import functools +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + qualities, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PornboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://pornbox.com/application/watch-page/212108', + 'md5': '3ff6b6e206f263be4c5e987a3162ac6e', + 'info_dict': { + 'id': '212108', + 'ext': 'mp4', + 'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49', + 'uploader': 'Lily Strong', + 'timestamp': 1665871200, + 'upload_date': '20221015', + 'age_limit': 18, + 'availability': 'needs_auth', + 'duration': 1505, + 'cast': ['Lily Strong', 'John Strong'], + 'tags': 'count:11', + 'description': 'md5:589c7f33e183aa8aa939537300efb859', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$' + } + }, { + 'url': 'https://pornbox.com/application/watch-page/216045', + 'info_dict': { + 'id': '216045', + 'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2', + 'description': 'md5:3e631dcaac029f15ed434e402d1b06c7', + 'uploader': 'VK Studio', + 'timestamp': 1618264800, + 'upload_date': '20210412', + 'age_limit': 18, + 'availability': 'premium_only', + 'duration': 2710, + 'cast': 'count:3', + 'tags': 'count:29', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$', + 'subtitles': 'count:6' + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True + }, + 'expected_warnings': [ + 'You are either not logged in or do not have access to this scene', + 'No 
video formats found', 'Requested format is not available'] + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id) + + subtitles = {country_code: [{ + 'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}', + 'ext': 'srt' + }] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))} + + is_free_scene = traverse_obj( + public_data, ('price', 'is_available_for_free', {bool}), default=False) + + metadata = { + 'id': video_id, + **traverse_obj(public_data, { + 'title': ('scene_name', {str.strip}), + 'description': ('small_description', {str.strip}), + 'uploader': 'studio', + 'duration': ('runtime', {parse_duration}), + 'cast': (('models', 'male_models'), ..., 'model_name'), + 'thumbnail': ('player_poster', {url_or_none}), + 'tags': ('niches', ..., 'niche'), + }), + 'age_limit': 18, + 'timestamp': parse_iso8601(traverse_obj( + public_data, ('studios', 'release_date'), 'publish_date')), + 'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene), + 'subtitles': subtitles, + } + + if not public_data.get('is_purchased') or not is_free_scene: + self.raise_login_required( + 'You are either not logged in or do not have access to this scene', metadata_available=True) + return metadata + + media_id = traverse_obj(public_data, ( + 'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False) + if not media_id: + self.raise_no_formats('Could not find stream id', video_id=video_id) + + stream_data = self._download_json( + f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls') + + get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k']) + metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], { + 'url': 'src', + 'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'format_id': ('quality', {str_or_none}), + 'quality': ('quality', {get_quality}), + 'width': ('size', {lambda x: int(x[:-1])}), + })) + + return metadata diff --git a/yt_dlp/extractor/pornflip.py b/yt_dlp/extractor/pornflip.py new file mode 100644 index 0000000..51a9cf3 --- /dev/null +++ b/yt_dlp/extractor/pornflip.py @@ -0,0 +1,77 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601 +) + + +class PornFlipIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:(embed|sv|v)/)?(?P[^/]+)' + _TESTS = [ + { + 'url': 'https://www.pornflip.com/dzv9Mtw1qj2/sv/brazzers-double-dare-two-couples-fucked-jenna-reid-maya-bijou', + 'info_dict': { + 'id': 'dzv9Mtw1qj2', + 'ext': 'mp4', + 'title': 'Brazzers - Double Dare Two couples fucked Jenna Reid Maya Bijou', + 'description': 'md5:d2b69e6cc743c5fd158e162aa7f05821', + 'duration': 476, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'timestamp': 1617846819, + 'upload_date': '20210408', + 'uploader': 'Brazzers', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.pornflip.com/v/IrJEC40i21L', + 'only_matching': True, + }, + { + 'url': 'https://www.pornflip.com/Z3jzbChC5-P/sexintaxi-e-sereyna-gomez-czech-naked-couple', + 'only_matching': True, + }, + { + 'url': 'https://www.pornflip.com/embed/bLcDFxnrZnU', + 'only_matching': True, + }, + ] + _HOST = 'www.pornflip.com' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 
'https://{}/sv/{}'.format(self._HOST, video_id), video_id, headers={'host': self._HOST}) + description = self._html_search_regex(r'&p\[summary\]=(.*?)\s*&p', webpage, 'description', fatal=False) + duration = self._search_regex(r'"duration":\s+"([^"]+)",', webpage, 'duration', fatal=False) + view_count = self._search_regex(r'"interactionCount":\s+"([^"]+)"', webpage, 'view_count', fatal=False) + title = self._html_search_regex(r'id="mediaPlayerTitleLink"[^>]*>(.+)', webpage, 'title', fatal=False) + uploader = self._html_search_regex(r'class="title-chanel"[^>]*>[^<]*]*>([^<]+)<', webpage, 'uploader', fatal=False) + upload_date = self._search_regex(r'"uploadDate":\s+"([^"]+)",', webpage, 'upload_date', fatal=False) + likes = self._html_search_regex( + r'class="btn btn-up-rating[^>]*>[^<]*]*>[^<]*[^>]*]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'like_count', fatal=False) + dislikes = self._html_search_regex( + r'class="btn btn-down-rating[^>]*>[^<]*]*>[^<]*[^>]*]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'dislike_count', fatal=False) + mpd_url = self._search_regex(r'"([^"]+userscontent.net/dash/[0-9]+/manifest.mpd[^"]*)"', webpage, 'mpd_url').replace('&', '&') + formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash') + + return { + 'age_limit': 18, + 'description': description, + 'dislike_count': int_or_none(dislikes), + 'duration': parse_duration(duration), + 'formats': formats, + 'id': video_id, + 'like_count': int_or_none(likes), + 'timestamp': parse_iso8601(upload_date), + 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'uploader': uploader, + 'view_count': int_or_none(view_count), + } diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py new file mode 100644 index 0000000..29a3e43 --- /dev/null +++ b/yt_dlp/extractor/pornhub.py @@ -0,0 +1,825 @@ +import functools +import itertools +import math +import operator +import re + +from .common import InfoExtractor +from .openload import PhantomJSwrapper +from ..compat import compat_str +from ..networking import Request +from ..networking.exceptions import HTTPError +from ..utils import ( + NO_DEFAULT, + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + merge_dicts, + orderedSet, + remove_quotes, + remove_start, + str_to_int, + update_url_query, + url_or_none, + urlencode_postdata, +) + + +class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + _PORNHUB_HOST_RE = r'(?:(?Ppornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)' + + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.url + if isinstance(url_or_request, Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + def _real_initialize(self): + self._logged_in = False + + def _set_age_cookies(self, host): + self._set_cookie(host, 'age_verified', '1') + self._set_cookie(host, 'accessAgeDisclaimerPH', '1') + self._set_cookie(host, 'accessAgeDisclaimerUK', '1') + self._set_cookie(host, 'accessPH', '1') + + def _login(self, host): + if 
self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'id="profileMenuDropdown"', + r'class="ph-icon-logout"')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + + +class PornHubIE(PornHubBaseIE): + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[^/]+\.)? + %s + /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P[\da-z]+) + ''' % PornHubBaseIE._PORNHUB_HOST_RE + _EMBED_REGEX = [r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)'] + _TESTS = [{ + 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', + 'md5': 'a6391306d050e4547f62b3f485dd9ba9', + 'info_dict': { + 'id': '648719015', + 'ext': 'mp4', + 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', + 'uploader': 'Babes', + 'upload_date': '20130628', + 'timestamp': 1372447216, + 'duration': 361, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'cast': list, + }, + }, { + # non-ASCII title + 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'upload_date': '20150213', + 'timestamp': 1423804862, + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 
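# the stream itself is never fetched for this fixture; only the metadata is checked
+            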
'skip_download': True, + }, + 'skip': 'This video has been disabled', + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a', + 'info_dict': { + 'id': 'ph601dc30bae19a', + 'uploader': 'Projekt Melody', + 'uploader_id': 'projekt-melody', + 'upload_date': '20210205', + 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)', + 'thumbnail': r're:https?://.+', + }, + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', + 'only_matching': True, + }, { + # removed at the request of cam4.com + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + # geo restricted + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }, { + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }] + + def _extract_count(self, pattern, webpage, name): + return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') + + self._login(host) + self._set_age_cookies(host) + + def dl_webpage(platform): + self._set_cookie(host, 'platform', platform) + return self._download_webpage( + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) + + webpage = dl_webpage('pc') + + error_msg = self._html_search_regex( + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), + webpage, 'error message', default=None, group='error') + if error_msg: + error_msg = re.sub(r'\s+', ' ', error_msg) + raise ExtractorError( + 'PornHub said: %s' % error_msg, + expected=True, video_id=video_id) + + if any(re.search(p, webpage) for p in ( + r'class=["\']geoBlocked["\']', + r'>\s*This content is unavailable in your country')): + self.raise_geo_restricted() + + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. 
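+        # Fallback order: the twitter:title meta tag, then the <h1 class="title">
+        # element, the data-video-title attribute and the shareTitle JS variable,
+        # each capturing into the named group 'title'.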
+ title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)]+class=["\']title["\'][^>]*>(?P.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title') + + video_urls = [] + video_urls_set = set() + subtitles = {} + + flashvars = self._parse_json( + self._search_regex( + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + video_id) + if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) + else: + thumbnail, duration = [None] * 2 + + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): + assignments = self._search_regex( + pattern, webpage, 'encoded url', default=default) + if not assignments: + return {} + + assignments = assignments.split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + return js_vars + + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + default=None) + if js_vars: + for key, format_url in js_vars.items(): + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): + add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') + add_video_url(js_vars['mediastring']) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + upload_date = None + formats = [] + + def add_format(format_url, height=None): + ext = 
determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return + if not height: + height = int_or_none(self._search_regex( + r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height', + default=None)) + formats.append({ + 'url': format_url, + 'format_id': format_field(height, None, '%dp'), + 'height': height, + }) + + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) + + model_profile = self._search_json( + r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False) + video_uploader = self._html_search_regex( + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + webpage, 'uploader', default=None) or model_profile.get('username') + + def extract_vote_count(kind, name): + return self._extract_count( + (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind, + r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), + webpage, name) + + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view') + like_count = extract_vote_count('Up', 'like') + dislike_count = extract_vote_count('Down', 'dislike') + comment_count = self._extract_count( + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') + + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)] + + info = self._search_json_ld(webpage, video_id, default={}) + # description provided in JSON-LD is irrelevant + info['description'] = None + + return merge_dicts({ + 'id': video_id, + 'uploader': video_uploader, + 'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'), + 'upload_date': upload_date, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'formats': formats, + 'age_limit': 18, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'cast': extract_list('pornstars'), + 'subtitles': subtitles, + }, info) + + +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + + def _extract_entries(self, webpage, host): + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/ytdl-org/youtube-dl/issues/11594). 
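+        # Entries are the view_video.php?viewkey=... anchors (with their title
+        # attributes) found inside that container, deduplicated with orderedSet
+        # and wrapped as url_result entries for PornHubIE.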
+ container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + + return [ + self.url_result( + 'http://www.%s/%s' % (host, video_url), + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + +class PornHubUserIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph', + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious', + 'info_dict': { + 'id': 'liz-vicious', + }, + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/users/russianveet69', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/channels/povd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', + 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, + }, { + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + self._set_age_cookies(mobj.group('host')) + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) + return self.url_result( + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) + + +class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): + @staticmethod + def _has_more(webpage): + return re.search( + r'''(?x) + <li[^>]+\bclass=["\']page_next| + <link[^>]+\brel=["\']next| + <button[^>]+\bid=["\']moreDataBtn + ''', webpage) is not None + + def _entries(self, url, host, item_id): + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, HTTPError) and e.cause.status == 404 + + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): + try: + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num, fallback=True) + else: + raise + except ExtractorError as e: + if is_404(e) and page_num != first_page: + break + raise + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + for e in page_entries: + yield e + if not self._has_more(webpage): + break + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + self._set_age_cookies(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) + + +class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph/videos', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 149, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 40, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'channels/povd/videos', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', + 'only_matching': True, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', + 'only_matching': True, + }, { + # Longest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', + 'only_matching': True, + }, { + # Newest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/search?search=123', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video?page=2', + 'only_matching': True, + }, { + 'url': 
'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', + 'only_matching': True, + }, { + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False + if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) + else super(PornHubPagedVideoListIE, cls).suitable(url)) + + +class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'info_dict': { + 'id': 'jenny-blighe', + }, + 'playlist_mincount': 129, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', + 'only_matching': True, + }, { + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, + }] + + +class PornHubPlaylistIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE + _TESTS = [{ + 'url': 'https://www.pornhub.com/playlist/44121572', + 'info_dict': { + 'id': '44121572', + }, + 'playlist_count': 77, + }, { + 'url': 'https://www.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351?page=2', + 'only_matching': True, + }] + + def _entries(self, url, host, item_id): + webpage = self._download_webpage(url, item_id, 'Downloading page 1') + playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id') + video_count = int_or_none( + self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count')) + token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token') + page_count = math.ceil((video_count - 36) / 40.) 
+ 1 + page_entries = self._extract_entries(webpage, host) + + def download_page(page_num): + note = 'Downloading page {}'.format(page_num) + page_url = 'https://www.{}/playlist/viewChunked'.format(host) + return self._download_webpage(page_url, item_id, note, query={ + 'id': playlist_id, + 'page': page_num, + 'token': token, + }) + + for page_num in range(1, page_count + 1): + if page_num > 1: + webpage = download_page(page_num) + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + for e in page_entries: + yield e + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + self._set_age_cookies(host) + + return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id) diff --git a/yt_dlp/extractor/pornotube.py b/yt_dlp/extractor/pornotube.py new file mode 100644 index 0000000..e0960f4 --- /dev/null +++ b/yt_dlp/extractor/pornotube.py @@ -0,0 +1,83 @@ +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PornotubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science', + 'md5': '60fc5a4f0d93a97968fc7999d98260c9', + 'info_dict': { + 'id': '4964', + 'ext': 'mp4', + 'upload_date': '20141203', + 'title': 'Weird Hot and Wet Science', + 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', + 'categories': ['Adult Humor', 'Blondes'], + 'uploader': 'Alpha Blue Archives', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1417582800, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] + + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] + + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) + + info = self._download_json( + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] + + timestamp = int_or_none(info.get('publishDate'), scale=1000) + uploader = info.get('studios', [{}])[0].get('name') + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': info.get('description'), + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'thumbnail': thumbnail, + 'categories': 
categories, + 'age_limit': 18, + } diff --git a/yt_dlp/extractor/pornovoisines.py b/yt_dlp/extractor/pornovoisines.py new file mode 100644 index 0000000..2e51b4f --- /dev/null +++ b/yt_dlp/extractor/pornovoisines.py @@ -0,0 +1,103 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + + +class PornoVoisinesIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' + + _TEST = { + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', + 'info_dict': { + 'id': '919', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': 'Recherche appartement', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140925', + 'duration': 120, + 'view_count': int, + 'average_rating': float, + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], + 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + # The webpage has a bug - there's no space between "thumb" and src= + thumbnail = self._html_search_regex( + r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') + + upload_date = unified_strdate(self._search_regex( + r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') + view_count = int_or_none(self._search_regex( + r'(\d+) vues', webpage, 'view count', fatal=False)) + average_rating = self._search_regex( + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) + if categories: + categories = [category.strip() for category in categories.split(',')] + + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'categories': categories, + 'age_limit': 18, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/pornoxo.py b/yt_dlp/extractor/pornoxo.py new file mode 100644 index 0000000..049feb4 --- /dev/null +++ b/yt_dlp/extractor/pornoxo.py 
@@ -0,0 +1,55 @@ +from .common import InfoExtractor +from ..utils import ( + str_to_int, +) + + +class PornoXOIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' + _TEST = { + 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', + 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', + 'info_dict': { + 'id': '7564', + 'ext': 'flv', + 'title': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, display_id = mobj.groups() + + webpage = self._download_webpage(url, video_id) + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') + + view_count = str_to_int(self._html_search_regex( + r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) + + categories_str = self._html_search_regex( + r'<meta name="description" content=".*featuring\s*([^"]+)"', + webpage, 'categories', fatal=False) + categories = ( + None if categories_str is None + else categories_str.split(',')) + + video_data.update({ + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), + 'categories': categories, + 'view_count': view_count, + 'age_limit': 18, + }) + + return video_data diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py new file mode 100644 index 0000000..66f8a5f --- /dev/null +++ b/yt_dlp/extractor/pr0gramm.py @@ -0,0 +1,201 @@ +import json +from urllib.parse import unquote + +from .common import InfoExtractor +from ..compat import functools +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + make_archive_id, + mimetype2ext, + str_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class Pr0grammIE(InfoExtractor): + _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)' + _TESTS = [{ + 'url': 'https://pr0gramm.com/new/video/5466437', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'], + 'uploader': 'g11st', + 'uploader_id': '394718', + 'timestamp': 1671590240, + 'upload_date': '20221221', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + '_old_archive_ids': ['pr0grammstatic 5466437'], + }, + }, { + 'url': 'https://pr0gramm.com/new/3052805:comment28391322', + 'info_dict': { + 'id': '3052805', + 'ext': 'mp4', + 'title': 'pr0gramm-3052805 by Hansking1', + 'tags': 'count:15', + 'uploader': 'Hansking1', + 'uploader_id': '385563', + 'timestamp': 1552930408, + 'upload_date': '20190318', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + '_old_archive_ids': ['pr0grammstatic 3052805'], + }, + }, { + # Requires verified account + 'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332', + 'info_dict': { + 'id': '5848332', + 'ext': 'mp4', + 'title': 'pr0gramm-5848332 by erd0pfel', + 'tags': 'count:18', + 'uploader': 'erd0pfel', + 'uploader_id': '349094', + 
'timestamp': 1694489652, + 'upload_date': '20230912', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + '_old_archive_ids': ['pr0grammstatic 5848332'], + }, + }, { + 'url': 'https://pr0gramm.com/top/5895149', + 'info_dict': { + 'id': '5895149', + 'ext': 'mp4', + 'title': 'pr0gramm-5895149 by algoholigSeeManThrower', + 'tags': 'count:19', + 'uploader': 'algoholigSeeManThrower', + 'uploader_id': '457556', + 'timestamp': 1697580902, + 'upload_date': '20231018', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': 'https://thumb.pr0gramm.com/2023/10/18/db47bb3db5e1a1b3.jpg', + '_old_archive_ids': ['pr0grammstatic 5895149'], + }, + }, { + 'url': 'https://pr0gramm.com/static/5466437', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290', + 'only_matching': True, + }] + + BASE_URL = 'https://pr0gramm.com' + + @functools.cached_property + def _is_logged_in(self): + return 'pp' in self._get_cookies(self.BASE_URL) + + @functools.cached_property + def _maximum_flags(self): + # We need to guess the flags for the content otherwise the api will raise an error + # We can guess the maximum allowed flags for the account from the cookies + # Bitflags are (msbf): pol, nsfp, nsfl, nsfw, sfw + flags = 0b10001 + if self._is_logged_in: + flags |= 0b01000 + cookies = self._get_cookies(self.BASE_URL) + if 'me' not in cookies: + self._download_webpage(self.BASE_URL, None, 'Refreshing verification information') + if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')): + flags |= 0b00110 + + return flags + + def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'): + data = self._download_json( + f'https://pr0gramm.com/api/items/{endpoint}', + video_id, note, query=query, expected_status=403) + + error = traverse_obj(data, ('error', {str})) + if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'): + if not self._is_logged_in: + self.raise_login_required() + raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True) + elif error: + message = traverse_obj(data, ('msg', {str})) or error + raise ExtractorError(f'API returned error: {message}', expected=True) + + return data + + @staticmethod + def _create_source_url(path): + return urljoin('https://img.pr0gramm.com', path) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = traverse_obj( + self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}), + ('items', 0, {dict})) + + source = video_info.get('image') + if not source or not source.endswith('mp4'): + self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id) + + metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags') + tags = traverse_obj(metadata, ('tags', ..., 'tag', {str})) + # Sorted by "confidence", higher confidence = earlier in list + confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) + if confidences: + tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] + + formats = traverse_obj(video_info, ('variants', ..., { + 'format_id': ('name', {str}), + 'url': ('path', {self._create_source_url}), + 'ext': ('mimeType', {mimetype2ext}), + 'vcodec': ('codec', {str}), + 
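+            # each entry below maps one field of an API "variants" item through
+            # its {transform}; traverse_obj drops keys that resolve to None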
'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'bitrate': ('bitRate', {float_or_none}), + 'filesize': ('fileSize', {int_or_none}), + })) if video_info.get('variants') else [{ + 'ext': 'mp4', + 'format_id': 'source', + **traverse_obj(video_info, { + 'url': ('image', {self._create_source_url}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }] + + subtitles = {} + for subtitle in traverse_obj(video_info, ('subtitles', lambda _, v: v['language'])): + subtitles.setdefault(subtitle['language'], []).append(traverse_obj(subtitle, { + 'url': ('path', {self._create_source_url}), + 'note': ('label', {str}), + })) + + return { + 'id': video_id, + 'title': f'pr0gramm-{video_id} by {video_info.get("user")}', + 'tags': tags, + 'formats': formats, + 'subtitles': subtitles, + 'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0, + '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)], + **traverse_obj(video_info, { + 'uploader': ('user', {str}), + 'uploader_id': ('userId', {str_or_none}), + 'like_count': ('up', {int}), + 'dislike_count': ('down', {int}), + 'timestamp': ('created', {int}), + 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}) + }), + } diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py new file mode 100644 index 0000000..56cd40d --- /dev/null +++ b/yt_dlp/extractor/prankcast.py @@ -0,0 +1,137 @@ +import json + +from .common import InfoExtractor +from ..utils import float_or_none, parse_iso8601, str_or_none, try_call +from ..utils.traversal import traverse_obj + + +class PrankCastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/showreel/(?P<id>\d+)-(?P<display_id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://prankcast.com/Devonanustart/showreel/1561-Beverly-is-back-like-a-heart-attack-', + 'info_dict': { + 'id': '1561', + 'ext': 'mp3', + 'title': 'Beverly is back like a heart attack!', + 'display_id': 'Beverly-is-back-like-a-heart-attack-', + 'timestamp': 1661391575, + 'uploader': 'Devonanustart', + 'channel_id': '4', + 'duration': 7918, + 'cast': ['Devonanustart', 'Phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank', 'live show'], + 'upload_date': '20220825' + } + }, { + 'url': 'https://prankcast.com/phonelosers/showreel/2048-NOT-COOL', + 'info_dict': { + 'id': '2048', + 'ext': 'mp3', + 'title': 'NOT COOL', + 'display_id': 'NOT-COOL', + 'timestamp': 1665028364, + 'uploader': 'phonelosers', + 'channel_id': '6', + 'duration': 4044, + 'cast': ['phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank', 'live show'], + 'upload_date': '20221006' + } + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + + webpage = self._download_webpage(url, video_id) + json_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_showreel'] + + uploader = json_info.get('user_name') + guests_json = self._parse_json(json_info.get('guests_json') or '{}', video_id) + start_date = parse_iso8601(json_info.get('start_date')) + + return { + 'id': video_id, + 'title': json_info.get('broadcast_title') or self._og_search_title(webpage), + 'display_id': display_id, + 'url': f'{json_info["broadcast_url"]}{json_info["recording_hash"]}.mp3', + 'timestamp': start_date, + 'uploader': uploader, + 'channel_id': str_or_none(json_info.get('user_id')), + 'duration': try_call(lambda: 
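+            # derive the duration from end_date - start_date; try_call
+            # yields None if either timestamp is missing or unparsable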
parse_iso8601(json_info['end_date']) - start_date),
+            'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
+            'description': json_info.get('broadcast_description'),
+            'categories': [json_info.get('broadcast_category')],
+            'tags': try_call(lambda: json_info['broadcast_tags'].split(','))
+        }
+
+
+class PrankCastPostIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/posts/(?P<id>\d+)-(?P<display_id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://prankcast.com/devonanustart/posts/6214-happy-national-rachel-day-',
+        'info_dict': {
+            'id': '6214',
+            'ext': 'mp3',
+            'title': 'Happy National Rachel Day!',
+            'display_id': 'happy-national-rachel-day-',
+            'timestamp': 1704333938,
+            'uploader': 'Devonanustart',
+            'channel_id': '4',
+            'duration': 13175,
+            'cast': ['Devonanustart'],
+            'description': '',
+            'categories': ['prank call'],
+            'upload_date': '20240104'
+        }
+    }, {
+        'url': 'https://prankcast.com/despicabledogs/posts/6217-jake-the-work-crow-',
+        'info_dict': {
+            'id': '6217',
+            'ext': 'mp3',
+            'title': 'Jake the Work Crow!',
+            'display_id': 'jake-the-work-crow-',
+            'timestamp': 1704346592,
+            'uploader': 'despicabledogs',
+            'channel_id': '957',
+            'duration': 263.287,
+            'cast': ['despicabledogs'],
+            'description': 'https://imgur.com/a/vtxLvKU',
+            'categories': [],
+            'upload_date': '20240104'
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
+
+        webpage = self._download_webpage(url, video_id)
+        post = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_posts']
+        content = self._parse_json(post['post_contents_json'], video_id)[0]
+
+        uploader = post.get('user_name')
+        guests_json = traverse_obj(content, ('guests_json', {json.loads}, {dict})) or {}
+
+        return {
+            'id': video_id,
+            'title': post.get('post_title') or self._og_search_title(webpage),
+            'display_id': display_id,
+            'url': content.get('url'),
+            'timestamp': parse_iso8601(content.get('start_date') or content.get('crdate'), ' '),
+            'uploader': uploader,
+            'channel_id': str_or_none(post.get('user_id')),
+            'duration': float_or_none(content.get('duration')),
+            'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
+            'description': post.get('post_body'),
+            # filter(None, ...) drops empty strings from the comma-separated tag list
+            'tags': try_call(lambda: list(filter(None, post['post_tags'].split(',')))),
+            'subtitles': {
+                'live_chat': [{
+                    'url': f'https://prankcast.com/api/private/chat/select-broadcast?id={post["content_id"]}&cache=',
+                    'ext': 'json',
+                }],
+            } if post.get('content_id') else None
+        }
diff --git a/yt_dlp/extractor/premiershiprugby.py b/yt_dlp/extractor/premiershiprugby.py
new file mode 100644
index 0000000..67d41fd
--- /dev/null
+++ b/yt_dlp/extractor/premiershiprugby.py
@@ -0,0 +1,39 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, traverse_obj
+
+
+class PremiershipRugbyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)premiershiprugby\.(?:com)/watch/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.premiershiprugby.com/watch/full-match-harlequins-v-newcastle-falcons',
+        'info_dict': {
+            'id': '0_mbkb7ldt',
+            'title': 'Full Match: Harlequins v Newcastle Falcons',
+            'ext': 'mp4',
+            'thumbnail': 'https://open.http.mp.streamamg.com/p/3000914/sp/300091400/thumbnail/entry_id/0_mbkb7ldt//width/960/height/540/type/1/quality/75',
+            'duration': 6093.0,
+            'tags': ['video'],
+            'categories': ['Full Match', 'Harlequins', 'Newcastle Falcons', 'gallaher
premiership'], + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + json_data = self._download_json( + f'https://article-cms-api.incrowdsports.com/v2/articles/slug/{display_id}', + display_id, query={'clientId': 'PRL'})['data']['article'] + + formats, subs = self._extract_m3u8_formats_and_subtitles( + json_data['heroMedia']['content']['videoLink'], display_id) + + return { + 'id': json_data['heroMedia']['content']['sourceSystemId'], + 'display_id': display_id, + 'title': traverse_obj(json_data, ('heroMedia', 'title')), + 'formats': formats, + 'subtitles': subs, + 'thumbnail': traverse_obj(json_data, ('heroMedia', 'content', 'videoThumbnail')), + 'duration': int_or_none(traverse_obj(json_data, ('heroMedia', 'content', 'metadata', 'msDuration')), scale=1000), + 'tags': json_data.get('tags'), + 'categories': traverse_obj(json_data, ('categories', ..., 'text')), + } diff --git a/yt_dlp/extractor/presstv.py b/yt_dlp/extractor/presstv.py new file mode 100644 index 0000000..26ce74a --- /dev/null +++ b/yt_dlp/extractor/presstv.py @@ -0,0 +1,69 @@ +from .common import InfoExtractor +from ..utils import remove_start + + +class PressTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' + + _TEST = { + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', + 'info_dict': { + 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', + 'ext': 'mp4', + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' + } + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + # extract video URL from webpage + video_url = self._hidden_inputs(webpage)['inpPlayback'] + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + _formats = [ + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') + ] + + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] + + # extract video metadata + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + upload_date = '%04d%02d%02d' % ( + int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), + ) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } diff --git a/yt_dlp/extractor/projectveritas.py b/yt_dlp/extractor/projectveritas.py new file mode 100644 index 0000000..daf1405 --- /dev/null +++ b/yt_dlp/extractor/projectveritas.py @@ -0,0 +1,52 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_strdate, +) + + +class ProjectVeritasIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 
'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/', + 'info_dict': { + 'id': '51910aab-365a-5cf1-88f2-8eb1ca5fd3c6', + 'ext': 'mp4', + 'title': 'Exclusive: Inside The New York and New Jersey Hospitals Battling Coronavirus', + 'upload_date': '20200327', + 'thumbnail': 'md5:6076477fe50b03eb8708be9415e18e1c', + } + }, { + 'url': 'https://www.projectveritas.com/video/ilhan-omar-connected-ballot-harvester-in-cash-for-ballots-scheme-car-is-full/', + 'info_dict': { + 'id': 'c5aab304-a56b-54b1-9f0b-03b77bc5f2f6', + 'ext': 'mp4', + 'title': 'Ilhan Omar connected Ballot Harvester in cash-for-ballots scheme: "Car is full" of absentee ballots', + 'upload_date': '20200927', + 'thumbnail': 'md5:194b8edf0e2ba64f25500ff4378369a4', + } + }] + + def _real_extract(self, url): + id, type = self._match_valid_url(url).group('id', 'type') + api_url = f'https://www.projectveritas.com/page-data/{type}/{id}/page-data.json' + data_json = self._download_json(api_url, id)['result']['data'] + main_data = traverse_obj(data_json, 'video', 'post') + video_id = main_data['id'] + thumbnail = traverse_obj(main_data, ('image', 'ogImage', 'src')) + mux_asset = traverse_obj(main_data, + 'muxAsset', ('body', 'json', 'content', ..., 'data', 'target', 'fields', 'muxAsset'), + get_all=False, expected_type=dict) + if not mux_asset: + raise ExtractorError('No video on the provided url.', expected=True) + playback_id = traverse_obj(mux_asset, 'playbackId', ('en-US', 'playbackId')) + formats = self._extract_m3u8_formats(f'https://stream.mux.com/{playback_id}.m3u8', video_id) + return { + 'id': video_id, + 'title': main_data['title'], + 'upload_date': unified_strdate(main_data.get('date')), + 'thumbnail': thumbnail.replace('//', ''), + 'formats': formats, + } diff --git a/yt_dlp/extractor/prosiebensat1.py b/yt_dlp/extractor/prosiebensat1.py new file mode 100644 index 0000000..46e2e8a --- /dev/null +++ b/yt_dlp/extractor/prosiebensat1.py @@ -0,0 +1,496 @@ +import re + +from hashlib import sha1 +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + merge_dicts, + unified_strdate, +) + + +class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_BYPASS = False + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if not self.get_param('allow_unplayable_formats') and video.get('is_protected') is True: + self.report_drm(clip_id) + + formats = [] + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + protocols = self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') + if server_token: + urls = 
(self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': server_token, + 'video_id': clip_id, + }, fatal=False) or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 
'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + + return { + 'duration': float_or_none(video.get('duration')), + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): + IE_NAME = 'prosiebensat1' + IE_DESC = 'ProSiebenSat.1 Digital' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?:beta\.)? + (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video + ) + /(?P<id>.+) + ''' + + _TESTS = [ + { + # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 + # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction + 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', + 'info_dict': { + 'id': '2104602', + 'ext': 'mp4', + 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', + 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', + 'upload_date': '20131231', + 'duration': 5845.04, + 'series': 'CIRCUS HALLIGALLI', + 'season_number': 2, + 'episode': 'Episode 18 - Staffel 2', + 'episode_number': 18, + }, + }, + { + 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', + 'info_dict': { + 'id': '2570327', + 'ext': 'mp4', + 'title': 'Lady-Umstyling für Audrina', + 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', + 'upload_date': '20131014', + 'duration': 606.76, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Seems to be broken', + }, + { + 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', + 'info_dict': { + 'id': '2429369', + 'ext': 'mp4', + 'title': 'Countdown für die Autowerkstatt', + 'description': 'md5:809fc051a457b5d8666013bc40698817', + 'upload_date': '20140223', + 'duration': 2595.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', + 'info_dict': { + 'id': '2904997', + 'ext': 'mp4', + 'title': 'Sexy laufen in Ugg Boots', + 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', + 'upload_date': '20140122', + 'duration': 245.32, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', + 'info_dict': { + 'id': '2906572', + 'ext': 'mp4', + 'title': 'Im Interview: Kai Wiesinger', + 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', + 'upload_date': '20140203', + 'duration': 522.56, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', + 'info_dict': { + 'id': '2992323', + 'ext': 'mp4', + 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', + 'description': 
'md5:2669cde3febe9bce13904f701e774eb6', + 'upload_date': '20141014', + 'duration': 2410.44, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', + 'info_dict': { + 'id': '3004256', + 'ext': 'mp4', + 'title': 'Schalke: Tönnies möchte Raul zurück', + 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', + 'upload_date': '20140226', + 'duration': 228.96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', + 'info_dict': { + 'id': '2572814', + 'ext': 'mp4', + 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', + 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'timestamp': 1382041620, + 'upload_date': '20131017', + 'duration': 469.88, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', + 'info_dict': { + 'id': '2156342', + 'ext': 'mp4', + 'title': 'Kurztrips zum Valentinstag', + 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', + 'duration': 307.24, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', + 'info_dict': { + 'id': '439664', + 'title': 'Episode 8 - Ganze Folge - Playlist', + 'description': 'md5:63b8963e71f481782aeea877658dec84', + }, + 'playlist_count': 2, + 'skip': 'This video is unavailable', + }, + { + # title in <h2 class="subtitle"> + 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', + 'info_dict': { + 'id': '4895826', + 'ext': 'mp4', + 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', + 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', + 'upload_date': '20170302', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'geo restricted to Germany', + }, + { + # geo restricted to Germany + 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, + { + 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', + 'only_matching': True, + }, + { + 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', + 'only_matching': True, + }, + ] + + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + _CLIENT_NAME = 'kolibri-2.0.19-splec4' + + _ACCESS_ID = 'x_prosiebenmaxx-de' + _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' + _IV = 'Aeluchoc6aevechuipiexeeboowedaok' + + _CLIPID_REGEXES = [ + r'"clip_id"\s*:\s+"(\d+)"', + r'clipid: "(\d+)"', + r'clip[iI]d=(\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', + r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId"\s*:\s*"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', + ] + _TITLE_REGEXES = [ + r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', + 
r'<header class="clearfix">\s*<h3>(.+?)</h3>', + r'<!-- start video -->\s*<h1>(.+?)</h1>', + r'<h1 class="att-name">\s*(.+?)</h1>', + r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', + r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', + r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', + r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', + ] + _DESCRIPTION_REGEXES = [ + r'<p itemprop="description">\s*(.+?)</p>', + r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', + r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', + r'<p class="att-description">\s*(.+?)\s*</p>', + r'<p class="video-description" itemprop="description">\s*(.+?)</p>', + r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', + ] + _UPLOAD_DATE_REGEXES = [ + r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', + r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', + r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', + r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', + ] + _PAGE_TYPE_REGEXES = [ + r'<meta name="page_type" content="([^"]+)">', + r"'itemType'\s*:\s*'([^']*)'", + ] + _PLAYLIST_ID_REGEXES = [ + r'content[iI]d=(\d+)', + r"'itemId'\s*:\s*'([^']*)'", + ] + _PLAYLIST_CLIP_REGEXES = [ + r'(?s)data-qvt=.+?<a href="([^"]+)"', + ] + + def _extract_clip(self, url, webpage): + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) + info = self._extract_video_info(url, clip_id) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate( + self._html_search_meta('og:published_time', webpage, + 'upload date', default=None) + or self._html_search_regex(self._UPLOAD_DATE_REGEXES, + webpage, 'upload date', default=None)) + + json_ld = self._search_json_ld(webpage, clip_id, default={}) + + return merge_dicts(info, { + 'id': clip_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + }, json_ld) + + def _extract_playlist(self, url, webpage): + playlist_id = self._html_search_regex( + self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_type = self._search_regex( + self._PAGE_TYPE_REGEXES, webpage, + 'page type', default='clip').lower() + if page_type == 'clip': + return self._extract_clip(url, webpage) + elif page_type == 'playlist': + 
return self._extract_playlist(url, webpage) + else: + raise ExtractorError( + 'Unsupported page type %s' % page_type, expected=True) diff --git a/yt_dlp/extractor/prx.py b/yt_dlp/extractor/prx.py new file mode 100644 index 0000000..5bb1832 --- /dev/null +++ b/yt_dlp/extractor/prx.py @@ -0,0 +1,428 @@ +import itertools +from .common import InfoExtractor, SearchInfoExtractor +from ..utils import ( + urljoin, + traverse_obj, + int_or_none, + mimetype2ext, + clean_html, + url_or_none, + unified_timestamp, + str_or_none, +) + + +class PRXBaseIE(InfoExtractor): + PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx.org/%s' + + def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'): + return self._download_json( + urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note) + + @staticmethod + def _get_prx_embed_response(response, section): + return traverse_obj(response, ('_embedded', f'prx:{section}')) + + @staticmethod + def _extract_file_link(response): + return url_or_none(traverse_obj( + response, ('_links', 'enclosure', 'href'), expected_type=str)) + + @classmethod + def _extract_image(cls, image_response): + if not isinstance(image_response, dict): + return + return { + 'id': str_or_none(image_response.get('id')), + 'filesize': image_response.get('size'), + 'width': image_response.get('width'), + 'height': image_response.get('height'), + 'url': cls._extract_file_link(image_response) + } + + @classmethod + def _extract_base_info(cls, response): + if not isinstance(response, dict): + return + item_id = str_or_none(response.get('id')) + if not item_id: + return + thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image')) + description = ( + clean_html(response.get('description')) + or response.get('shortDescription')) + return { + 'id': item_id, + 'title': response.get('title') or item_id, + 'thumbnails': [thumbnail_dict] if thumbnail_dict else None, + 'description': description, + 'release_timestamp': unified_timestamp(response.get('releasedAt')), + 'timestamp': unified_timestamp(response.get('createdAt')), + 'modified_timestamp': unified_timestamp(response.get('updatedAt')), + 'duration': int_or_none(response.get('duration')), + 'tags': response.get('tags'), + 'episode_number': int_or_none(response.get('episodeIdentifier')), + 'season_number': int_or_none(response.get('seasonIdentifier')) + } + + @classmethod + def _extract_series_info(cls, series_response): + base_info = cls._extract_base_info(series_response) + if not base_info: + return + account_info = cls._extract_account_info( + cls._get_prx_embed_response(series_response, 'account')) or {} + return { + **base_info, + 'channel_id': account_info.get('channel_id'), + 'channel_url': account_info.get('channel_url'), + 'channel': account_info.get('channel'), + 'series': base_info.get('title'), + 'series_id': base_info.get('id'), + } + + @classmethod + def _extract_account_info(cls, account_response): + base_info = cls._extract_base_info(account_response) + if not base_info: + return + name = account_response.get('name') + return { + **base_info, + 'title': name, + 'channel_id': base_info.get('id'), + 'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'), + 'channel': name, + } + + @classmethod + def _extract_story_info(cls, story_response): + base_info = cls._extract_base_info(story_response) + if not base_info: + return + series = cls._extract_series_info( + cls._get_prx_embed_response(story_response, 'series')) or {} + 
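+        # a story is also linked to an owning account; its channel_* fields
+        # are merged into the result below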
account = cls._extract_account_info( + cls._get_prx_embed_response(story_response, 'account')) or {} + return { + **base_info, + 'series': series.get('series'), + 'series_id': series.get('series_id'), + 'channel_id': account.get('channel_id'), + 'channel_url': account.get('channel_url'), + 'channel': account.get('channel') + } + + def _entries(self, item_id, endpoint, entry_func, query=None): + """ + Extract entries from paginated list API + @param entry_func: Function to generate entry from response item + """ + total = 0 + for page in itertools.count(1): + response = self._call_api(f'{item_id}: page {page}', endpoint, query={ + **(query or {}), + 'page': page, + 'per': 100 + }) + items = self._get_prx_embed_response(response, 'items') + if not response or not items: + break + + yield from filter(None, map(entry_func, items)) + + total += response['count'] + if total >= response['total']: + break + + def _story_playlist_entry(self, response): + story = self._extract_story_info(response) + if not story: + return + story.update({ + '_type': 'url', + 'url': 'https://beta.prx.org/stories/%s' % story['id'], + 'ie_key': PRXStoryIE.ie_key() + }) + return story + + def _series_playlist_entry(self, response): + series = self._extract_series_info(response) + if not series: + return + series.update({ + '_type': 'url', + 'url': 'https://beta.prx.org/series/%s' % series['id'], + 'ie_key': PRXSeriesIE.ie_key() + }) + return series + + +class PRXStoryIE(PRXBaseIE): + _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)' + + _TESTS = [ + { + # Story with season and episode details + 'url': 'https://beta.prx.org/stories/399200', + 'info_dict': { + 'id': '399200', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 1004, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + }, + 'playlist': [{ + 'info_dict': { + 'id': '399200_part1', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 530, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + 'ext': 'mp3', + 'upload_date': '20211222', + 'episode': 'Episode 8', + 'release_date': '20211223', + 'season': 'Season 5', + 'modified_date': '20220104' + } + }, { + 'info_dict': { + 'id': '399200_part2', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 474, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + 'ext': 'mp3', + 'upload_date': '20211222', + 'episode': 'Episode 8', + 'release_date': '20211223', + 'season': 'Season 5', + 'modified_date': '20220104' + } + } + + ] + }, { + # Story with only split audio + 'url': 'https://beta.prx.org/stories/326414', + 'info_dict': { + 
'id': '326414', + 'title': 'Massachusetts v EPA', + 'description': 'md5:744fffba08f19f4deab69fa8d49d5816', + 'timestamp': 1592509124, + 'modified_timestamp': 1592510457, + 'duration': 3088, + 'tags': 'count:0', + 'series': 'Outside/In', + 'series_id': '36252', + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + }, + 'playlist_count': 4 + }, { + # Story with single combined audio + 'url': 'https://beta.prx.org/stories/400404', + 'info_dict': { + 'id': '400404', + 'title': 'Cafe Chill (Episode 2022-01)', + 'thumbnails': 'count:1', + 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539', + 'timestamp': 1641233952, + 'modified_timestamp': 1641234248, + 'duration': 3540, + 'series': 'Café Chill', + 'series_id': '37762', + 'channel_id': '5767', + 'channel_url': 'https://beta.prx.org/accounts/5767', + 'channel': 'C89.5 - KNHC Seattle', + 'ext': 'mp3', + 'tags': 'count:0', + 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg', + 'upload_date': '20220103', + 'modified_date': '20220103' + } + }, { + 'url': 'https://listen.prx.org/stories/399200', + 'only_matching': True + } + ] + + def _extract_audio_pieces(self, audio_response): + return [{ + 'format_id': str_or_none(piece_response.get('id')), + 'format_note': str_or_none(piece_response.get('label')), + 'filesize': int_or_none(piece_response.get('size')), + 'duration': int_or_none(piece_response.get('duration')), + 'ext': mimetype2ext(piece_response.get('contentType')), + 'asr': int_or_none(piece_response.get('frequency'), scale=1000), + 'abr': int_or_none(piece_response.get('bitRate')), + 'url': self._extract_file_link(piece_response), + 'vcodec': 'none' + } for piece_response in sorted( + self._get_prx_embed_response(audio_response, 'items') or [], + key=lambda p: int_or_none(p.get('position')))] + + def _extract_story(self, story_response): + info = self._extract_story_info(story_response) + if not info: + return + audio_pieces = self._extract_audio_pieces( + self._get_prx_embed_response(story_response, 'audio')) + if len(audio_pieces) == 1: + return { + 'formats': audio_pieces, + **info + } + + entries = [{ + **info, + 'id': '%s_part%d' % (info['id'], (idx + 1)), + 'formats': [fmt], + } for idx, fmt in enumerate(audio_pieces)] + return { + '_type': 'multi_video', + 'entries': entries, + **info + } + + def _real_extract(self, url): + story_id = self._match_id(url) + response = self._call_api(story_id, f'stories/{story_id}') + return self._extract_story(response) + + +class PRXSeriesIE(PRXBaseIE): + _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://beta.prx.org/series/36252', + 'info_dict': { + 'id': '36252', + 'title': 'Outside/In', + 'thumbnails': 'count:1', + 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114', + 'timestamp': 1470684964, + 'modified_timestamp': 1582308830, + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + 'series': 'Outside/In', + 'series_id': '36252' + }, + 'playlist_mincount': 39 + }, { + # Blank series + 'url': 'https://beta.prx.org/series/25038', + 'info_dict': { + 'id': '25038', + 'title': '25038', + 'timestamp': 1207612800, + 'modified_timestamp': 1207612800, + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + 'series': '25038', + 'series_id': '25038' + }, + 'playlist_count': 0 + } + ] + + def 
_extract_series(self, series_response): + info = self._extract_series_info(series_response) + return { + '_type': 'playlist', + 'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry), + **info + } + + def _real_extract(self, url): + series_id = self._match_id(url) + response = self._call_api(series_id, f'series/{series_id}') + return self._extract_series(response) + + +class PRXAccountIE(PRXBaseIE): + _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://beta.prx.org/accounts/206', + 'info_dict': { + 'id': '206', + 'title': 'New Hampshire Public Radio', + 'description': 'md5:277f2395301d0aca563c80c70a18ee0a', + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + 'thumbnails': 'count:1' + }, + 'playlist_mincount': 380 + }] + + def _extract_account(self, account_response): + info = self._extract_account_info(account_response) + series = self._entries( + info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry) + stories = self._entries( + info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry) + return { + '_type': 'playlist', + 'entries': itertools.chain(series, stories), + **info + } + + def _real_extract(self, url): + account_id = self._match_id(url) + response = self._call_api(account_id, f'accounts/{account_id}') + return self._extract_account(response) + + +class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor): + IE_DESC = 'PRX Stories Search' + IE_NAME = 'prxstories:search' + _SEARCH_KEY = 'prxstories' + + def _search_results(self, query): + yield from self._entries( + f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query}) + + +class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor): + IE_DESC = 'PRX Series Search' + IE_NAME = 'prxseries:search' + _SEARCH_KEY = 'prxseries' + + def _search_results(self, query): + yield from self._entries( + f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query}) diff --git a/yt_dlp/extractor/puhutv.py b/yt_dlp/extractor/puhutv.py new file mode 100644 index 0000000..4b8e5e9 --- /dev/null +++ b/yt_dlp/extractor/puhutv.py @@ -0,0 +1,233 @@ +from .common import InfoExtractor +from ..compat import compat_str +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1561062602, + 'upload_date': '20190620', + 'release_year': 1976, + 'view_count': int, + 'tags': list, + }, + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + 
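+        # the slug API resolves the display id to a numeric asset id; the
+        # actual stream URLs come from the geo-checked /api/assets/<id>/videos
+        # endpoint, which returns 403 for geo-restricted content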
display_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + show = info.get('title') or {} + title = info.get('name') or show['name'] + if info.get('display_name'): + title = '%s %s' % (title, info['display_name']) + + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.raise_geo_restricted() + raise + + urls = [] + formats = [] + + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url or media_url in urls: + continue + urls.append(media_url) + + playlist = video.get('is_playlist') + if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False + if is_hls: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + + creator = try_get( + show, lambda x: x['producer']['name'], compat_str) + + content = info.get('content') or {} + + images = try_get( + content, lambda x: x['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + tags = [] + for genre in show.get('genres') or []: + if not isinstance(genre, dict): + continue + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) + + subtitles = {} + for subtitle in content.get('subtitles') or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url') or subtitle.get('file')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description') or show.get('description'), + 'season_id': str_or_none(info.get('season_id')), + 'season_number': int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'release_year': int_or_none(show.get('released_at')), + 'timestamp': unified_timestamp(info.get('created_at')), + 'creator': creator, + 'view_count': int_or_none(content.get('watch_count')), + 'duration': float_or_none(content.get('duration_in_ms'), 1000), + 'tags': tags, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [{ + 'url': 
'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + }, + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] + + def _extract_entries(self, seasons): + for season in seasons: + season_id = season.get('id') + if not season_id: + continue + page = 1 + has_more = True + while has_more is True: + season = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + season_id, 'Downloading page %s' % page, query={ + 'page': page, + 'per': 40, + }) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] + + seasons = info.get('seasons') + if seasons: + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) + + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) diff --git a/yt_dlp/extractor/puls4.py b/yt_dlp/extractor/puls4.py new file mode 100644 index 0000000..38c5d11 --- /dev/null +++ b/yt_dlp/extractor/puls4.py @@ -0,0 +1,51 @@ +from .prosiebensat1 import ProSiebenSat1BaseIE +from ..compat import compat_str +from ..utils import parse_duration, unified_strdate + + +class Puls4IE(ProSiebenSat1BaseIE): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)' + _TESTS = [{ + 'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118', + 'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03', + 'info_dict': { + 'id': '118118', + 'ext': 'flv', + 'title': 'Tobias Homberger von myclubs im #2min2miotalk', + 'description': 'md5:f9def7c5e8745d6026d8885487d91955', + 'upload_date': '20160830', + 'uploader': 'PULS_4', + }, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer', + 'only_matching': True, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598', + 'only_matching': True, + }] + _TOKEN = 'puls4' + _SALT = '01!kaNgaiNgah1Ie4AeSha' + _CLIENT_NAME = '' + + def _real_extract(self, url): + path = self._match_id(url) + content_path = self._download_json( + 'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url'] + media = self._download_json( + 'http://www.puls4.com' + content_path, + content_path)['mediaCurrent'] + player_content = media['playerContent'] + info = self._extract_video_info(url, player_content['id']) + info.update({ + 'id': compat_str(media['objectId']), + 'title': player_content['title'], + 'description': media.get('description'), + 'thumbnail': media.get('previewLink'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(player_content.get('duration')), + 'episode': 
player_content.get('episodePartName'), + 'show': media.get('channel'), + 'season_id': player_content.get('seasonId'), + 'uploader': player_content.get('sourceCompany'), + }) + return info diff --git a/yt_dlp/extractor/pyvideo.py b/yt_dlp/extractor/pyvideo.py new file mode 100644 index 0000000..7b25166 --- /dev/null +++ b/yt_dlp/extractor/pyvideo.py @@ -0,0 +1,70 @@ +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class PyvideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' + + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', + }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', + }, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + category = mobj.group('category') + video_id = mobj.group('id') + + entries = [] + + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) + + if data: + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) + + return self.playlist_result(entries, video_id) diff --git a/yt_dlp/extractor/qdance.py b/yt_dlp/extractor/qdance.py new file mode 100644 index 0000000..934ebbf --- /dev/null +++ b/yt_dlp/extractor/qdance.py @@ -0,0 +1,171 @@ +import json +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + jwt_decode_hs256, + str_or_none, + traverse_obj, + try_call, + url_or_none, +) + + +class QDanceIE(InfoExtractor): + _NETRC_MACHINE = 'qdance' + _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P<id>[\w-]+)' + _TESTS = [{ + 'note': 'vod', + 'url': 'https://www.q-dance.com/network/library/146542138', + 'info_dict': { + 'id': '146542138', + 'ext': 'mp4', + 'title': 'Sound Rush [LIVE] | Defqon.1 Weekend Festival 2022 | Friday | RED', + 'display_id': 'sound-rush-live-v3-defqon-1-weekend-festival-2022-friday-red', + 'description': 'Relive Defqon.1 - Primal Energy 2022 with the sounds of Sound Rush LIVE at the RED on Friday! 
🔥', + 'season': 'Defqon.1 Weekend Festival 2022', + 'season_id': '31840632', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1674829540-20220624171509-220624171509_delio_dn201093-2.jpg', + 'availability': 'premium_only', + 'duration': 1829, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'livestream', + 'url': 'https://www.q-dance.com/network/live/149170353', + 'info_dict': { + 'id': '149170353', + 'ext': 'mp4', + 'title': r're:^Defqon\.1 2023 - Friday - RED', + 'display_id': 'defqon-1-2023-friday-red', + 'description': 'md5:3c73fbbd4044e578e696adfc64019163', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1686849069-area-thumbs_red.png', + 'availability': 'subscriber_only', + 'live_status': 'is_live', + 'channel_id': 'qdancenetwork.video_149170353', + }, + 'skip': 'Completed livestream', + }, { + 'note': 'vod with alphanumeric id', + 'url': 'https://www.q-dance.com/network/library/WhDleSIWSfeT3Q9ObBKBeA', + 'info_dict': { + 'id': 'WhDleSIWSfeT3Q9ObBKBeA', + 'ext': 'mp4', + 'title': 'Aftershock I Defqon.1 Weekend Festival 2023 I Sunday I BLUE', + 'display_id': 'naam-i-defqon-1-weekend-festival-2023-i-dag-i-podium', + 'description': 'Relive Defqon.1 Path of the Warrior with Aftershock at the BLUE 🔥', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'duration': 3507, + 'availability': 'premium_only', + 'thumbnail': 'https://images.q-dance.network/1698158361-230625-135716-defqon-1-aftershock.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.q-dance.com/network/library/-uRFKXwmRZGVnve7av9uqA', + 'only_matching': True, + }] + + _access_token = None + _refresh_token = None + + def _call_login_api(self, data, note='Logging in'): + login = self._download_json( + 'https://members.id-t.com/api/auth/login', None, note, headers={ + 'content-type': 'application/json', + 'brand': 'qdance', + 'origin': 'https://www.q-dance.com', + 'referer': 'https://www.q-dance.com/', + }, data=json.dumps(data, separators=(',', ':')).encode(), + expected_status=lambda x: True) + + tokens = traverse_obj(login, ('data', { + '_id-t-accounts-token': ('accessToken', {str}), + '_id-t-accounts-refresh': ('refreshToken', {str}), + '_id-t-accounts-id-token': ('idToken', {str}), + })) + + if not tokens.get('_id-t-accounts-token'): + error = ': '.join(traverse_obj(login, ('error', ('code', 'message'), {str}))) + if 'validation_error' not in error: + raise ExtractorError(f'Q-Dance API said "{error}"') + msg = 'Invalid username or password' if 'email' in data else 'Refresh token has expired' + raise ExtractorError(msg, expected=True) + + for name, value in tokens.items(): + self._set_cookie('.q-dance.com', name, value) + + def _perform_login(self, username, password): + self._call_login_api({'email': username, 'password': password}) + + def _real_initialize(self): + cookies = self._get_cookies('https://www.q-dance.com/') + self._refresh_token = try_call(lambda: cookies['_id-t-accounts-refresh'].value) + self._access_token = try_call(lambda: cookies['_id-t-accounts-token'].value) + if not self._access_token: + self.raise_login_required() + + def _get_auth(self): + if (try_call(lambda: jwt_decode_hs256(self._access_token)['exp']) or 0) <= int(time.time() - 120): + if not self._refresh_token: + raise ExtractorError( + 'Cannot refresh access token, 
login with yt-dlp or refresh cookies in browser') + self._call_login_api({'refreshToken': self._refresh_token}, note='Refreshing access token') + self._real_initialize() + + return {'Authorization': self._access_token} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nuxt_data(webpage, video_id, traverse=('data', 0, 'data')) + + def extract_availability(level): + level = int_or_none(level) or 0 + return self._availability( + needs_premium=(level >= 20), needs_subscription=(level >= 15), needs_auth=True) + + info = traverse_obj(data, { + 'title': ('title', {str.strip}), + 'description': ('description', {str.strip}), + 'display_id': ('slug', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'duration': ('durationInSeconds', {int_or_none}, {lambda x: x or None}), + 'availability': ('subscription', 'level', {extract_availability}), + 'is_live': ('type', {lambda x: x.lower() == 'live'}), + 'artist': ('acts', ..., {str}), + 'series': ('event', 'title', {str.strip}), + 'series_id': ('event', 'id', {str_or_none}), + 'season': ('eventEdition', 'title', {str.strip}), + 'season_id': ('eventEdition', 'id', {str_or_none}), + 'channel_id': ('pubnub', 'channelName', {str}), + }) + + stream = self._download_json( + f'https://dc9h6qmsoymbq.cloudfront.net/api/content/videos/{video_id}/url', + video_id, headers=self._get_auth(), expected_status=401) + + m3u8_url = traverse_obj(stream, ('data', 'url', {url_or_none})) + if not m3u8_url and traverse_obj(stream, ('error', 'code')) == 'unauthorized': + raise ExtractorError('Your account does not have access to this content', expected=True) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, fatal=False, live=True) if m3u8_url else [] + if not formats: + self.raise_no_formats('No active streams found', expected=bool(info.get('is_live'))) + + return { + **info, + 'id': video_id, + 'formats': formats, + } diff --git a/yt_dlp/extractor/qingting.py b/yt_dlp/extractor/qingting.py new file mode 100644 index 0000000..aa690d4 --- /dev/null +++ b/yt_dlp/extractor/qingting.py @@ -0,0 +1,47 @@ +from .common import InfoExtractor + +from ..utils import traverse_obj + + +class QingTingIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|m\.)?(?:qingting\.fm|qtfm\.cn)/v?channels/(?P<channel>\d+)/programs/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.qingting.fm/channels/378005/programs/22257411/', + 'md5': '47e6a94f4e621ed832c316fd1888fb3c', + 'info_dict': { + 'id': '22257411', + 'title': '用了十年才修改,谁在乎教科书?', + 'channel_id': '378005', + 'channel': '睡前消息', + 'uploader': '马督工', + 'ext': 'm4a', + } + }, { + 'url': 'https://m.qtfm.cn/vchannels/378005/programs/23023573/', + 'md5': '2703120b6abe63b5fa90b975a58f4c0e', + 'info_dict': { + 'id': '23023573', + 'title': '【睡前消息488】重庆山火之后,有图≠真相', + 'channel_id': '378005', + 'channel': '睡前消息', + 'uploader': '马督工', + 'ext': 'm4a', + } + }] + + def _real_extract(self, url): + channel_id, pid = self._match_valid_url(url).group('channel', 'id') + webpage = self._download_webpage( + f'https://m.qtfm.cn/vchannels/{channel_id}/programs/{pid}/', pid) + info = self._search_json(r'window\.__initStores\s*=', webpage, 'program info', pid) + return { + 'id': pid, + 'title': traverse_obj(info, ('ProgramStore', 'programInfo', 'title')), + 'channel_id': channel_id, + 'channel': traverse_obj(info, ('ProgramStore', 'channelInfo', 'title')), + 'uploader': traverse_obj(info, ('ProgramStore', 'podcasterInfo', 'podcaster', 'nickname')), + 'url': 
traverse_obj(info, ('ProgramStore', 'programInfo', 'audioUrl')), + 'vcodec': 'none', + 'acodec': 'm4a', + 'ext': 'm4a', + } diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py new file mode 100644 index 0000000..9285825 --- /dev/null +++ b/yt_dlp/extractor/qqmusic.py @@ -0,0 +1,365 @@ +import random +import re +import time + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + strip_jsonp, + unescapeHTML, +) + + +class QQMusicIE(InfoExtractor): + IE_NAME = 'qqmusic' + IE_DESC = 'QQ音乐' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', + 'info_dict': { + 'id': '004295Et37taLD', + 'ext': 'mp3', + 'title': '可惜没如果', + 'release_date': '20141227', + 'creator': '林俊杰', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'release_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 'lyrics not in .lrc format', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', + 'info_dict': { + 'id': '001JyApY11tIp6', + 'ext': 'mp3', + 'title': 'Shadows Over Transylvania', + 'release_date': '19970225', + 'creator': 'Dark Funeral', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }] + + _FORMATS = { + 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, + 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, + 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} + } + + # Reference: m_r_GetRUin() in top_player.js + # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js + @staticmethod + def m_r_get_ruin(): + curMs = int(time.time() * 1000) % 1000 + return int(round(random.random() * 2147483647) * curMs % 1E10) + + def _real_extract(self, url): + mid = self._match_id(url) + + detail_info_page = self._download_webpage( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, + mid, note='Download song detail info', + errnote='Unable to get song detail info', encoding='gbk') + + song_name = self._html_search_regex( + r"songname:\s*'([^']+)'", detail_info_page, 'song name') + + publish_time = self._html_search_regex( + r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page, + 'publish time', default=None) + if publish_time: + publish_time = publish_time.replace('-', '') + + singer = self._html_search_regex( + r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None) + + lrc_content = self._html_search_regex( + r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>', + detail_info_page, 'LRC lyrics', default=None) + if lrc_content: + lrc_content = lrc_content.replace('\\n', '\n') + + thumbnail_url = None + albummid = self._search_regex( + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], + detail_info_page, 'album mid', default=None) + if albummid: + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ + % (albummid[-2:-1], albummid[-1], albummid) + + guid = self.m_r_get_ruin() + + vkey = 
self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)['key']
+
+        formats = []
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'quality': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._check_formats(formats, mid)
+
+        actual_lrc_lyrics = ''.join(
+            line + '\n' for line in re.findall(
+                r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content))
+
+        info_dict = {
+            'id': mid,
+            'formats': formats,
+            'title': song_name,
+            'release_date': publish_time,
+            'creator': singer,
+            'description': lrc_content,
+            'thumbnail': thumbnail_url
+        }
+        if actual_lrc_lyrics:
+            info_dict['subtitles'] = {
+                'origin': [{
+                    'ext': 'lrc',
+                    'data': actual_lrc_lyrics,
+                }]
+            }
+        return info_dict
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    @staticmethod
+    def qq_static_url(category, mid):
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
+
+    def get_singer_all_songs(self, singmid, num):
+        return self._download_webpage(
+            r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+            query={
+                'format': 'json',
+                'inCharset': 'utf8',
+                'outCharset': 'utf-8',
+                'platform': 'yqq',
+                'needNewCode': 0,
+                'singermid': singmid,
+                'order': 'listen',
+                'begin': 0,
+                'num': num,
+                'songstatus': 1,
+            })
+
+    def get_entries_from_page(self, singmid):
+        entries = []
+
+        default_num = 1
+        json_text = self.get_singer_all_songs(singmid, default_num)
+        json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        if json_obj_all_songs['code'] == 0:
+            total = json_obj_all_songs['data']['total']
+            json_text = self.get_singer_all_songs(singmid, total)
+            json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        for item in json_obj_all_songs['data']['list']:
+            if item['musicData'].get('songmid') is not None:
+                songmid = item['musicData']['songmid']
+                entries.append(self.url_result(
+                    r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
+
+        return entries
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
+    _TEST = {
+        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:870ec08f7d8547c29c93010899103751',
+        },
+        'playlist_mincount': 12,
+    }
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        entries = self.get_entries_from_page(mid)
+        singer_page = self._download_webpage(url, mid, 'Download singer page')
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
+        singer_desc = None
+
+        if mid:
+            singer_desc_page = self._download_xml(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+                'Download singer description XML',
+                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
+
+            singer_desc = singer_desc_page.find('./data/info/desc').text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
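+    # Reviewer annotation, not part of the upstream patch: the album API below
+    # returns a song 'list'; each entry's 'songmid' becomes a qqmusic song URL,
+    # e.g. songmid '004295Et37taLD' ->
+    # 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', which QQMusicIE then
+    # resolves to stream URLs via the vkey/guid scheme above.
+    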
_VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', + 'info_dict': { + 'id': '000gXCTb2AhRR1', + 'title': '我们都是这样长大的', + 'description': 'md5:179c5dce203a5931970d306aa9607ea6', + }, + 'playlist_count': 4, + }, { + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:a48823755615508a95080e81b51ba729', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + mid = self._match_id(url) + + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] + + entries = [ + self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album.get('name') + album_detail = album.get('desc') + if album_detail is not None: + album_detail = album_detail.strip() + + return self.playlist_result(entries, mid, album_name, album_detail) + + +class QQMusicToplistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:toplist' + IE_DESC = 'QQ音乐 - 排行榜' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', + 'info_dict': { + 'id': '123', + 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', + 'info_dict': { + 'id': '3', + 'title': '巅峰榜·欧美', + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'info_dict': { + 'id': '106', + 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) + + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] + + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') + return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + 'skip': 'playlist gone', + }, { + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', + 'info_dict': { + 'id': '1374105607', + 'title': '易入人心的华语民谣', + 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, + 
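# Reviewer annotation, not part of the upstream patch: this endpoint answers
+            # with JSONP such as 'callback({"cdlist": [...]});' (callback name
+            # illustrative); strip_jsonp peels off the wrapper so the body
+            # parses as plain JSON.
+            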
transform_source=strip_jsonp) + if not len(list_json.get('cdlist', [])): + if list_json.get('code'): + raise ExtractorError( + 'QQ Music said: error %d in fetching playlist info' % list_json['code'], + expected=True) + raise ExtractorError('Unable to get playlist info') + + cdlist = list_json['cdlist'][0] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] + + list_name = cdlist.get('dissname') + list_description = clean_html(unescapeHTML(cdlist.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/yt_dlp/extractor/r7.py b/yt_dlp/extractor/r7.py new file mode 100644 index 0000000..36f0b52 --- /dev/null +++ b/yt_dlp/extractor/r7.py @@ -0,0 +1,112 @@ +from .common import InfoExtractor +from ..utils import int_or_none + + +class R7IE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P<id>[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'description': 'md5:01812008664be76a6479aa58ec865b72', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) + + title = video['title'] + + formats = [] + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) + + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = 
int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } + + +class R7ArticleIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py new file mode 100644 index 0000000..f013582 --- /dev/null +++ b/yt_dlp/extractor/radiko.py @@ -0,0 +1,261 @@ +import base64 +import random +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + time_seconds, + try_call, + unified_timestamp, + update_url_query, +) +from ..utils.traversal import traverse_obj + + +class RadikoBaseIE(InfoExtractor): + _GEO_BYPASS = False + _FULL_KEY = None + _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ( + 'https://c-rpaa.smartstream.ne.jp', + 'https://si-c-radiko.smartstream.ne.jp', + 'https://tf-f-rpaa-radiko.smartstream.ne.jp', + 'https://tf-c-rpaa-radiko.smartstream.ne.jp', + 'https://si-f-radiko.smartstream.ne.jp', + 'https://rpaa.smartstream.ne.jp', + ) + _HOSTS_FOR_TIME_FREE_FFMPEG_SUPPORTED = ( + 'https://rd-wowza-radiko.radiko-cf.com', + 'https://radiko.jp', + 'https://f-radiko.smartstream.ne.jp', + ) + # Following URL forcibly connects not Time Free but Live + _HOSTS_FOR_LIVE = ( + 'https://c-radiko.smartstream.ne.jp', + ) + + def _negotiate_token(self): + _, auth1_handle = self._download_webpage_handle( + 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', + headers={ + 'x-radiko-app': 'pc_html5', + 'x-radiko-app-version': '0.0.1', + 'x-radiko-device': 'pc', + 'x-radiko-user': 'dummy_user', + }) + auth1_header = auth1_handle.headers + + auth_token = auth1_header['X-Radiko-AuthToken'] + kl = int(auth1_header['X-Radiko-KeyLength']) + ko = int(auth1_header['X-Radiko-KeyOffset']) + raw_partial_key = self._extract_full_key()[ko:ko + kl] + partial_key = base64.b64encode(raw_partial_key).decode() + + area_id = self._download_webpage( + 'https://radiko.jp/v2/api/auth2', None, 'Authenticating', + headers={ + 'x-radiko-device': 'pc', + 'x-radiko-user': 'dummy_user', + 'x-radiko-authtoken': auth_token, + 'x-radiko-partialkey': partial_key, + }).split(',')[0] + + if area_id == 'OUT': + self.raise_geo_restricted(countries=['JP']) + + auth_data = (auth_token, area_id) + self.cache.store('radiko', 'auth_data', auth_data) + return auth_data + + def _auth_client(self): + cachedata = self.cache.load('radiko', 'auth_data') + if cachedata is not None: + response = self._download_webpage( + 'https://radiko.jp/v2/api/auth_check', None, 'Checking cached token', expected_status=401, + 
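# Reviewer annotation, not part of the upstream patch: expected_status=401
+                # stops an expired-token response from raising here, so the
+                # 'OK' check below can fall back to _negotiate_token() instead.
+                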
headers={'X-Radiko-AuthToken': cachedata[0], 'X-Radiko-AreaId': cachedata[1]})
+            if response == 'OK':
+                return cachedata
+        return self._negotiate_token()
+
+    def _extract_full_key(self):
+        if self._FULL_KEY:
+            return self._FULL_KEY
+
+        jscode = self._download_webpage(
+            'https://radiko.jp/apps/js/playerCommon.js', None,
+            note='Downloading player js code')
+        full_key = self._search_regex(
+            (r"RadikoJSPlayer\([^,]*,\s*(['\"])pc_html5\1,\s*(['\"])(?P<fullkey>[0-9a-f]+)\2,\s*{"),
+            jscode, 'full key', fatal=False, group='fullkey')
+
+        if full_key:
+            full_key = full_key.encode()
+        else:  # fall back to the only full key ever known
+            full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa'
+
+        self._FULL_KEY = full_key
+        return full_key
+
+    def _find_program(self, video_id, station, cursor):
+        station_program = self._download_xml(
+            'https://radiko.jp/v3/program/station/weekly/%s.xml' % station, video_id,
+            note='Downloading radio program for %s station' % station)
+
+        prog = None
+        for p in station_program.findall('.//prog'):
+            ft_str, to_str = p.attrib['ft'], p.attrib['to']
+            ft = unified_timestamp(ft_str, False)
+            to = unified_timestamp(to_str, False)
+            if ft <= cursor < to:
+                prog = p
+                break
+        if not prog:
+            raise ExtractorError('Cannot identify radio program to download!')
+        assert ft and to
+        return prog, station_program, ft, ft_str, to_str
+
+    def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, area_id, query):
+        m3u8_playlist_data = self._download_xml(
+            f'https://radiko.jp/v3/station/stream/pc_html5/{station}.xml', video_id,
+            note='Downloading stream information')
+
+        formats = []
+        found = set()
+
+        timefree_int = 0 if is_onair else 1
+
+        for element in m3u8_playlist_data.findall(f'.//url[@timefree="{timefree_int}"]/playlist_create_url'):
+            pcu = element.text
+            if pcu in found:
+                continue
+            found.add(pcu)
+            playlist_url = update_url_query(pcu, {
+                'station_id': station,
+                **query,
+                'l': '15',
+                'lsid': ''.join(random.choices('0123456789abcdef', k=32)),
+                'type': 'b',
+            })
+
+            time_to_skip = None if is_onair else cursor - ft
+
+            domain = urllib.parse.urlparse(playlist_url).netloc
+            subformats = self._extract_m3u8_formats(
+                playlist_url, video_id, ext='m4a',
+                live=True, fatal=False, m3u8_id=domain,
+                note=f'Downloading m3u8 information from {domain}',
+                headers={
+                    'X-Radiko-AreaId': area_id,
+                    'X-Radiko-AuthToken': auth_token,
+                })
+            for sf in subformats:
+                if (is_onair ^ pcu.startswith(self._HOSTS_FOR_LIVE)) or (
+                        not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)):
+                    sf['preference'] = -100
+                    sf['format_note'] = 'not preferred'
+                if not is_onair and timefree_int == 1 and time_to_skip:
+                    sf['downloader_options'] = {'ffmpeg_args': ['-ss', str(time_to_skip)]}
+            formats.extend(subformats)
+
+        return formats
+
+    def _extract_performers(self, prog):
+        return traverse_obj(prog, (
+            'pfm/text()', ..., {lambda x: re.split(r'[//、 ,,]', x)}, ..., {str.strip})) or None
+
+
+class RadikoIE(RadikoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)'
+
+    _TESTS = [{
+        # QRR (文化放送) station provides <desc>
+        'url': 'https://radiko.jp/#!/ts/QRR/20210425101300',
+        'only_matching': True,
+    }, {
+        # FMT (TOKYO FM) station does not provide <desc>
+        'url': 'https://radiko.jp/#!/ts/FMT/20210810150000',
+        'only_matching': True,
+    }, {
+        'url': 'https://radiko.jp/#!/ts/JOAK-FM/20210509090000',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        station, video_id = 
self._match_valid_url(url).groups() + vid_int = unified_timestamp(video_id, False) + prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) + + auth_token, area_id = self._auth_client() + + return { + 'id': video_id, + 'title': try_call(lambda: prog.find('title').text), + 'cast': self._extract_performers(prog), + 'description': clean_html(try_call(lambda: prog.find('info').text)), + 'uploader': try_call(lambda: station_program.find('.//name').text), + 'uploader_id': station, + 'timestamp': vid_int, + 'duration': try_call(lambda: unified_timestamp(radio_end, False) - unified_timestamp(radio_begin, False)), + 'is_live': True, + 'formats': self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id + } + ), + } + + +class RadikoRadioIE(RadikoBaseIE): + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/live/(?P<id>[A-Z0-9-]+)' + + _TESTS = [{ + # QRR (文化放送) station provides <desc> + 'url': 'https://radiko.jp/#!/live/QRR', + 'only_matching': True, + }, { + # FMT (TOKYO FM) station does not provide <desc> + 'url': 'https://radiko.jp/#!/live/FMT', + 'only_matching': True, + }, { + 'url': 'https://radiko.jp/#!/live/JOAK-FM', + 'only_matching': True, + }] + + def _real_extract(self, url): + station = self._match_id(url) + self.report_warning('Downloader will not stop at the end of the program! Press Ctrl+C to stop') + + auth_token, area_id = self._auth_client() + # get current time in JST (GMT+9:00 w/o DST) + vid_now = time_seconds(hours=9) + + prog, station_program, ft, _, _ = self._find_program(station, station, vid_now) + + title = prog.find('title').text + description = clean_html(prog.find('info').text) + station_name = station_program.find('.//name').text + + formats = self._extract_formats( + video_id=station, station=station, is_onair=True, + ft=ft, cursor=vid_now, auth_token=auth_token, area_id=area_id, + query={}) + + return { + 'id': station, + 'title': title, + 'cast': self._extract_performers(prog), + 'description': description, + 'uploader': station_name, + 'uploader_id': station, + 'timestamp': ft, + 'formats': formats, + 'is_live': True, + } diff --git a/yt_dlp/extractor/radiocanada.py b/yt_dlp/extractor/radiocanada.py new file mode 100644 index 0000000..1a5a635 --- /dev/null +++ b/yt_dlp/extractor/radiocanada.py @@ -0,0 +1,165 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': 
'20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, + } + ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None + + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ + 'appCode': app_code, + 'idMedia': video_id, + }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 422): + data = self._parse_json(e.cause.response.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] + + def get_meta(name): + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text + + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/pull/18609). + if get_meta('protectionType'): + self.report_warning('This video is probably DRM protected.') + + query = { + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + } + if self._claims: + query['claims'] = self._claims + v_data = self._call_api('validation/v2/', video_id, app_code, query) + v_url = v_data.get('url') + if not v_url: + error = v_data['message'] + if error == "Le contenu sélectionné n'est pas disponible dans votre pays": + raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) + if error == 'Le contenu sélectionné est disponible seulement en premium': + self.raise_login_required(error) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') + + subtitles = {} + closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') + if closed_caption_url: + subtitles['fr'] = [{ + 'url': closed_caption_url, + 'ext': determine_ext(closed_caption_url, 'vtt'), + }] + + return { + 'id': video_id, + 'title': get_meta('Title') or get_meta('AV-nomEmission'), + 'description': get_meta('Description') or get_meta('ShortDescription'), + 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), + 'duration': int_or_none(get_meta('length')), + 'series': get_meta('Emission'), + 'season_number': int_or_none('SrcSaison'), + 'episode_number': int_or_none('SrcEpisode'), + 'upload_date': unified_strdate(get_meta('Date')), + 'subtitles': subtitles, + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_info(*self._match_valid_url(url).groups()) + + +class RadioCanadaAudioVideoIE(InfoExtractor): + IE_NAME = 'radiocanada:audiovideo' + _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', + 'info_dict': { + 'id': '7527184', + 'ext': 'mp4', + 'title': 'Barack Obama au Vietnam', + 'description': 'Les États-Unis lèvent 
l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+                'upload_date': '20160523',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        }, {
+            'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
+            'only_matching': True,
+        }]
+
+    def _real_extract(self, url):
+        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
diff --git a/yt_dlp/extractor/radiocomercial.py b/yt_dlp/extractor/radiocomercial.py
new file mode 100644
index 0000000..38f8cf7
--- /dev/null
+++ b/yt_dlp/extractor/radiocomercial.py
@@ -0,0 +1,154 @@
+import itertools
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    get_element_by_class,
+    get_element_html_by_class,
+    get_element_text_and_html_by_tag,
+    get_elements_html_by_class,
+    int_or_none,
+    join_nonempty,
+    try_call,
+    unified_strdate,
+    update_url,
+    urljoin
+)
+from ..utils.traversal import traverse_obj
+
+
+class RadioComercialIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
+        'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
+        'info_dict': {
+            'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
+            'ext': 'mp3',
+            'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
+            'release_date': '20231025',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 'Season 6',
+            'season_number': 6,
+        }
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
+        'md5': '47e96c273aef96a8eb160cd6cf46d782',
+        'info_dict': {
+            'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
+            'ext': 'mp3',
+            'title': 'Convença-me num minuto que os lobisomens existem',
+            'release_date': '20231026',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 'Season 3',
+            'season_number': 3,
+        }
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
+        'md5': '69be64255420fec23b7259955d771e54',
+        'info_dict': {
+            'id': 'o-desastre-de-aviao',
+            'ext': 'mp3',
+            'title': 'O desastre de avião',
+            'description': 'md5:8a82beeb372641614772baab7246245f',
+            'release_date': '20231101',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 'Season 2',
+            'season_number': 2,
+        },
+        'params': {
+            # inconsistent md5
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
+        'md5': '91d32d4d4b1407272068b102730fc9fa',
+        'info_dict': {
+            'id': 't-n-t-29-de-outubro',
+            'ext': 'mp3',
+            'title': 'T.N.T 29 de outubro',
+            'release_date': '20231029',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 'Season 2023',
+            'season_number': 2023,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, season = self._match_valid_url(url).group('id', 'season')
+        webpage = self._download_webpage(url, video_id)
+        return {
+            'id': video_id,
+            'title': self._html_extract_title(webpage),
+            'description': self._og_search_description(webpage, default=None),
+            'release_date': unified_strdate(get_element_by_class(
+                'date', get_element_html_by_class('descriptions', webpage) or '')),
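+            # Reviewer annotation, not part of the upstream patch: the episode
+            # date lives in a 'date' element inside the 'descriptions' block,
+            # so the lookup above chains get_element_html_by_class ->
+            # get_element_by_class and hands the text to unified_strdate
+            # (giving e.g. '20231025' in the first test).
+            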
'thumbnail': self._og_search_thumbnail(webpage), + 'season_number': int_or_none(season), + 'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'), + } + + +class RadioComercialPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3', + 'info_dict': { + 'id': 'convenca-me-num-minuto_t3', + 'title': 'Convença-me num Minuto - Temporada 3', + }, + 'playlist_mincount': 32 + }, { + 'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao', + 'info_dict': { + 'id': 'o-homem-que-mordeu-o-cao', + 'title': 'O Homem Que Mordeu o Cão', + }, + 'playlist_mincount': 19 + }, { + 'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas', + 'info_dict': { + 'id': 'as-minhas-coisas-favoritas', + 'title': 'As Minhas Coisas Favoritas', + }, + 'playlist_mincount': 131 + }, { + 'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023', + 'info_dict': { + 'id': 'tnt-todos-no-top_t2023', + 'title': 'TNT - Todos No Top - Temporada 2023', + }, + 'playlist_mincount': 39 + }] + + def _entries(self, url, playlist_id): + for page in itertools.count(1): + try: + webpage = self._download_webpage( + f'{url}/{page}', playlist_id, f'Downloading page {page}') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: + break + raise + + episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage) + if not episodes: + break + for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')): + episode_url = urljoin(url, url_path) + if RadioComercialIE.suitable(episode_url): + yield episode_url + + def _real_extract(self, url): + podcast, season = self._match_valid_url(url).group('id', 'season') + playlist_id = join_nonempty(podcast, season, delim='_t') + url = update_url(url, query=None, fragment=None) + webpage = self._download_webpage(url, playlist_id) + + name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0]) + title = name if name == season else join_nonempty(name, season, delim=' - Temporada ') + + return self.playlist_from_matches( + self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE) diff --git a/yt_dlp/extractor/radiode.py b/yt_dlp/extractor/radiode.py new file mode 100644 index 0000000..7262078 --- /dev/null +++ b/yt_dlp/extractor/radiode.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor + + +class RadioDeIE(InfoExtractor): + _WORKING = False + IE_NAME = 'radio.de' + _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' + _TEST = { + 'url': 'http://ndr2.radio.de/', + 'info_dict': { + 'id': 'ndr2', + 'ext': 'mp3', + 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:591c49c702db1a33751625ebfb67f273', + 'thumbnail': r're:^https?://.*\.png', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + radio_id = self._match_id(url) + webpage = self._download_webpage(url, radio_id) + jscode = self._search_regex( + r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n", + webpage, 'broadcast') + + broadcast = self._parse_json(jscode, radio_id) + title = broadcast['name'] + description = broadcast.get('description') or broadcast.get('shortDescription') + thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or 
broadcast.get('logo100x100') + + formats = [{ + 'url': stream['streamUrl'], + 'ext': stream['streamContentFormat'].lower(), + 'acodec': stream['streamContentFormat'], + 'abr': stream['bitRate'], + 'asr': stream['sampleRate'] + } for stream in broadcast['streamUrls']] + + return { + 'id': radio_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py new file mode 100644 index 0000000..6bd6fe9 --- /dev/null +++ b/yt_dlp/extractor/radiofrance.py @@ -0,0 +1,473 @@ +import itertools +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + join_nonempty, + js_to_json, + parse_duration, + strftime_or_none, + traverse_obj, + unified_strdate, + urljoin, +) + + +class RadioFranceIE(InfoExtractor): + _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + IE_NAME = 'radiofrance' + + _TEST = { + 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', + 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', + 'info_dict': { + 'id': 'one-one', + 'ext': 'ogg', + 'title': 'One to one', + 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + 'uploader': 'Thomas Hercouët', + }, + } + + def _real_extract(self, url): + m = self._match_valid_url(url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, 'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit">  © (.*?)</div>', + webpage, 'uploader', fatal=False) + + formats_str = self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, 'audio URLs') + formats = [ + { + 'format_id': fm[0], + 'url': fm[1], + 'vcodec': 'none', + 'quality': i, + } + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) + ] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } + + +class RadioFranceBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr' + + _STATIONS_RE = '|'.join(map(re.escape, ( + 'franceculture', + 'franceinfo', + 'franceinter', + 'francemusique', + 'fip', + 'mouv', + ))) + + def _extract_data_from_webpage(self, webpage, display_id, key): + return traverse_obj(self._search_json( + r'\bconst\s+data\s*=', webpage, key, display_id, + contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json), + (..., 'data', key, {dict}), get_all=False) or {} + + +class FranceCultureIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#]) + ''' + + _TESTS = [ + { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', + 'info_dict': { + 'id': '8440487', + 'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau', + 'ext': 'mp3', + 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau 
?', + 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20220514', + 'duration': 2750, + }, + }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675', + 'info_dict': { + 'id': '2107675', + 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023', + 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot', + 'description': 'md5:36ee74351ede77a314fdebb94026b916', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20230310', + 'duration': 8977, + 'ext': 'mp3', + }, + }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + + # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846 + video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}') + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_data['contentUrl'], + 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, + 'duration': parse_duration(video_data.get('duration')), + 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', + webpage, 'title', default=self._og_search_title(webpage)), + 'description': self._html_search_regex( + r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_regex( + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), + 'upload_date': unified_strdate(self._search_regex( + r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) + } + + +class RadioFranceLiveIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + https?://(?:www\.)?radiofrance\.fr + /(?P<id>{RadioFranceBaseIE._STATIONS_RE}) + /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/', + 'info_dict': { + 'id': 'franceinter', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/franceculture', + 'info_dict': { + 'id': 'franceculture', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family', + 'info_dict': { + 'id': 'mouv-radio-musique-kids-family', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul', + 'info_dict': { + 'id': 'mouv-radio-rnb-soul', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix', + 'info_dict': { + 'id': 'mouv-radio-musique-mix', + 
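# Reviewer annotation, not part of the upstream patch: webradio ids are
+            # join_nonempty(station_id, substation_id) with the default '-'
+            # delimiter, hence 'mouv' + 'radio-musique-mix' ->
+            # 'mouv-radio-musique-mix'.
+            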
'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/fip/radio-rock', + 'info_dict': { + 'id': 'fip-radio-rock', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id') + + if substation_id: + webpage = self._download_webpage(url, station_id) + api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData') + else: + api_response = self._download_json( + f'https://www.radiofrance.fr/{station_id}/api/live', station_id) + + formats, subtitles = [], {} + for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])): + if media_source.get('format') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_source['url'], + 'abr': media_source.get('bitrate'), + }) + + return { + 'id': join_nonempty(station_id, substation_id), + 'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty( + ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class RadioFrancePlaylistBaseIE(RadioFranceBaseIE): + """Subclasses must set _METADATA_KEY""" + + def _call_api(self, content_id, cursor, page_num): + raise NotImplementedError('This method must be implemented by subclasses') + + def _generate_playlist_entries(self, content_id, content_response): + for page_num in itertools.count(2): + for entry in content_response['items']: + yield self.url_result( + f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, { + 'title': 'title', + 'description': 'standFirst', + 'timestamp': ('publishedDate', {int_or_none}), + 'thumbnail': ('visual', 'src'), + })) + + next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False) + if not next_cursor: + break + + content_response = self._call_api(content_id, next_cursor, page_num) + + def _real_extract(self, url): + display_id = self._match_id(url) + + metadata = self._download_json( + 'https://www.radiofrance.fr/api/v2.1/path', display_id, + query={'value': urllib.parse.urlparse(url).path})['content'] + + content_id = metadata['id'] + + return self.playlist_result( + self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id, + display_id=display_id, **{**traverse_obj(metadata, { + 'title': 'title', + 'description': 'standFirst', + 'thumbnail': ('visual', 'src'), + }), **traverse_obj(metadata, { + 'title': 'name', + 'description': 'role', + })}) + + +class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert', + 'info_dict': { + 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17', + 'display_id': 'le-billet-vert', + 'title': 'Le billet sciences', + 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1', + 'thumbnail': 
r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale', + 'info_dict': { + 'id': '566fd524-3074-4fbc-ac69-8696f2152a54', + 'display_id': 'jean-marie-le-pen-l-obsession-nationale', + 'title': 'Jean-Marie Le Pen, l\'obsession nationale', + 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine', + 'info_dict': { + 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d', + 'display_id': 'serie-thomas-grjebine', + 'title': 'Thomas Grjebine', + }, + 'playlist_count': 1, + }, { + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip', + 'info_dict': { + 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e', + 'display_id': 'certains-l-aiment-fip', + 'title': 'Certains l’aiment Fip', + 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 321, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix', + 'only_matching': True, + }] + + _METADATA_KEY = 'expressions' + + def _call_api(self, podcast_id, cursor, page_num): + return self._download_json( + f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id, + note=f'Downloading page {page_num}', query={'pageCursor': cursor}) + + +class RadioFranceProfileIE(RadioFrancePlaylistBaseIE): + _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3', + 'info_dict': { + 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb', + 'display_id': 'thomas-pesquet', + 'title': 'Thomas Pesquet', + 'description': 'Astronaute à l\'agence spatiale européenne', + }, + 'playlist_mincount': 212, + }, { + 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie', + 'info_dict': { + 'id': '9593050b-0183-4972-a0b5-d8f699079e02', + 'display_id': 'eugenie-bastie', + 'title': 'Eugénie Bastié', + 'description': 'Journaliste et essayiste', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 39, + }, { + 'url': 'https://www.radiofrance.fr/personnes/lea-salame', + 'only_matching': True, + }] + + _METADATA_KEY = 'documents' + + def _call_api(self, profile_id, cursor, page_num): + resp = self._download_json( + f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id, + note=f'Downloading page {page_num}', query={ + 'relation': 'personality', + 'cursor': cursor, + }) + + resp['next'] = traverse_obj(resp, ('pagination', 'next')) + return resp + + +class RadioFranceProgramScheduleIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?P<station>{RadioFranceBaseIE._STATIONS_RE}) + /grille-programmes(?:\?date=(?P<date>[\d-]+))? 
+ ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023', + 'info_dict': { + 'id': 'franceinter-program-20230217', + 'upload_date': '20230217', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023', + 'info_dict': { + 'id': 'franceculture-program-20230201', + 'upload_date': '20230201', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023', + 'info_dict': { + 'id': 'mouv-program-20230319', + 'upload_date': '20230319', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023', + 'info_dict': { + 'id': 'francemusique-program-20230318', + 'upload_date': '20230318', + }, + 'playlist_count': 15, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes', + 'only_matching': True, + }] + + def _generate_playlist_entries(self, webpage_url, api_response): + for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])): + yield self.url_result( + urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE, + url_transparent=True, **traverse_obj(entry, { + 'title': ('expression', 'title'), + 'thumbnail': ('expression', 'visual', 'src'), + 'timestamp': ('startTime', {int_or_none}), + 'series_id': ('concept', 'id'), + 'series': ('concept', 'title'), + })) + + def _real_extract(self, url): + station, date = self._match_valid_url(url).group('station', 'date') + webpage = self._download_webpage(url, station) + grid_data = self._extract_data_from_webpage(webpage, station, 'grid') + upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d') + + return self.playlist_result( + self._generate_playlist_entries(url, grid_data), + join_nonempty(station, 'program', upload_date), upload_date=upload_date) diff --git a/yt_dlp/extractor/radiojavan.py b/yt_dlp/extractor/radiojavan.py new file mode 100644 index 0000000..b3befae --- /dev/null +++ b/yt_dlp/extractor/radiojavan.py @@ -0,0 +1,81 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + parse_resolution, + str_to_int, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class RadioJavanIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' 
+ _TEST = { + 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', + 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', + 'info_dict': { + 'id': 'chaartaar-ashoobam', + 'ext': 'mp4', + 'title': 'Chaartaar - Ashoobam', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'upload_date': '20150215', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + download_host = self._download_json( + 'https://www.radiojavan.com/videos/video_host', video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }).get('host', 'https://host1.rjmusicmedia.com') + + webpage = self._download_webpage(url, video_id) + + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 'format_id': format_id, + }) + formats.append(f) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + upload_date = unified_strdate(self._search_regex( + r'class="date_added">Date added: ([^<]+)<', + webpage, 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class="views">Plays: ([\d,]+)', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) likes', + webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) dislikes', + webpage, 'dislike count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } diff --git a/yt_dlp/extractor/radiokapital.py b/yt_dlp/extractor/radiokapital.py new file mode 100644 index 0000000..8f9737a --- /dev/null +++ b/yt_dlp/extractor/radiokapital.py @@ -0,0 +1,97 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + traverse_obj, + unescapeHTML, +) + +import itertools +from urllib.parse import urlencode + + +class RadioKapitalBaseIE(InfoExtractor): + def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}): + return self._download_json( + f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}', + video_id, note=note) + + def _parse_episode(self, data): + release = '%s%s%s' % (data['published'][6:11], data['published'][3:6], data['published'][:3]) + return { + '_type': 'url_transparent', + 'url': data['mixcloud_url'], + 'ie_key': 'Mixcloud', + 'title': unescapeHTML(data['title']), + 'description': clean_html(data.get('content')), + 'tags': traverse_obj(data, ('tags', ..., 'name')), + 'release_date': release, + 'series': traverse_obj(data, ('show', 'title')), + } + + +class RadioKapitalIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/[a-z\d-]+/(?P<id>[a-z\d-]+)' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/tutaj-sa-smoki/5-its-okay-to-be-immaterial', + 'info_dict': { + 'id': 'radiokapital_radio-kapitał-tutaj-są-smoki-5-its-okay-to-be-immaterial-2021-05-20', + 'ext': 'm4a', + 'title': '#5: It’s okay to\xa0be\xa0immaterial', + 'description': 'md5:2499da5fbfb0e88333b7d37ec8e9e4c4', + 'uploader': 'Radio Kapitał', + 'uploader_id': 'radiokapital', + 'timestamp': 1621640164, 
+ 'upload_date': '20210521', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + episode = self._call_api('episodes/%s' % video_id, video_id) + return self._parse_episode(episode) + + +class RadioKapitalShowIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital:show' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/(?P<id>[a-z\d-]+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/wesz', + 'info_dict': { + 'id': '100', + 'title': 'WĘSZ', + 'description': 'md5:3a557a1e0f31af612b0dcc85b1e0ca5c', + }, + 'playlist_mincount': 17, + }] + + def _get_episode_list(self, series_id, page_no): + return self._call_api( + 'episodes', series_id, + f'Downloading episode list page #{page_no}', qs={ + 'show': series_id, + 'page': page_no, + }) + + def _entries(self, series_id): + for page_no in itertools.count(1): + episode_list = self._get_episode_list(series_id, page_no) + yield from (self._parse_episode(ep) for ep in episode_list['items']) + if episode_list['next'] is None: + break + + def _real_extract(self, url): + series_id = self._match_id(url) + + show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata') + entries = self._entries(series_id) + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(show['id']), + 'title': show.get('title'), + 'description': clean_html(show.get('content')), + } diff --git a/yt_dlp/extractor/radiozet.py b/yt_dlp/extractor/radiozet.py new file mode 100644 index 0000000..6752017 --- /dev/null +++ b/yt_dlp/extractor/radiozet.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + strip_or_none, +) + + +class RadioZetPodcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.radiozet\.pl\/Podcasty/.*?/(?P<id>.+)' + _TEST = { + 'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'md5': 'e03665c316b4fbc5f6a8f232948bbba3', + 'info_dict': { + 'id': '42154', + 'display_id': 'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'title': 'O przedmiotach szkolnych, które przydają się w życiu', + 'description': 'md5:fa72bed49da334b09e5b2f79851f185c', + 'release_timestamp': 1592985480, + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 83, + 'series': 'Nie Ma Za Co', + 'creator': 'Katarzyna Pakosińska', + } + } + + def _call_api(self, podcast_id, display_id): + return self._download_json( + f'https://player.radiozet.pl/api/podcasts/getPodcast/(node)/{podcast_id}/(station)/radiozet', + display_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + podcast_id = self._html_search_regex(r'<div.*?\sid="player".*?\sdata-id=[\'"]([^\'"]+)[\'"]', + webpage, 'podcast id') + data = self._call_api(podcast_id, display_id)['data'][0] + + return { + 'id': podcast_id, + 'display_id': display_id, + 'title': strip_or_none(data.get('title')), + 'description': strip_or_none(traverse_obj(data, ('program', 'desc'))), + 'release_timestamp': data.get('published_date'), + 'url': traverse_obj(data, ('player', 'stream')), + 'thumbnail': traverse_obj(data, ('program', 'image', 'original')), + 'duration': traverse_obj(data, ('player', 'duration')), + 'series': strip_or_none(traverse_obj(data, ('program', 'title'))), + 'creator': strip_or_none(traverse_obj(data, ('presenter', 0, 'title'))), + } diff --git a/yt_dlp/extractor/radlive.py b/yt_dlp/extractor/radlive.py new file mode 100644 index 0000000..3c00183 --- /dev/null +++ 
b/yt_dlp/extractor/radlive.py @@ -0,0 +1,180 @@ +import json + +from ..utils import ( + ExtractorError, + format_field, + traverse_obj, + try_get, + unified_timestamp +) +from .common import InfoExtractor + + +class RadLiveIE(InfoExtractor): + IE_NAME = 'radlive' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/(?P<content_type>feature|episode)/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/feature/dc5acfbc-761b-4bec-9564-df999905116a', + 'md5': '6219d5d31d52de87d21c9cf5b7cb27ff', + 'info_dict': { + 'id': 'dc5acfbc-761b-4bec-9564-df999905116a', + 'ext': 'mp4', + 'title': 'Deathpact - Digital Mirage 2 [Full Set]', + 'language': 'en', + 'thumbnail': 'https://static.12core.net/cb65ae077a079c68380e38f387fbc438.png', + 'description': '', + 'release_timestamp': 1600185600.0, + 'channel': 'Proximity', + 'channel_id': '9ce6dd01-70a4-4d59-afb6-d01f807cd009', + 'channel_url': 'https://rad.live/content/channel/9ce6dd01-70a4-4d59-afb6-d01f807cd009', + } + }, { + 'url': 'https://rad.live/content/episode/bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': 'bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'ext': 'mp4', + 'title': 'E01: Bad Jokes 1', + 'language': 'en', + 'thumbnail': 'https://lsp.littlstar.com/channels/WHISTLE/BAD_JOKES/SEASON_1/BAD_JOKES_101/poster.jpg', + 'description': 'Bad Jokes - Champions, Adam Pally, Super Troopers, Team Edge and 2Hype', + 'episode': 'E01: Bad Jokes 1', + 'episode_number': 1, + 'episode_id': '336', + }, + }] + + def _real_extract(self, url): + content_type, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, video_id) + + content_info = json.loads(self._search_regex( + r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + video_info = content_info[content_type] + + if not video_info: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + formats = self._extract_m3u8_formats(video_info['assets']['videos'][0]['url'], video_id) + + data = video_info.get('structured_data', {}) + + release_date = unified_timestamp(traverse_obj(data, ('releasedEvent', 'startDate'))) + channel = next(iter(content_info.get('channels', [])), {}) + channel_id = channel.get('lrn', '').split(':')[-1] or None + + result = { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'language': traverse_obj(data, ('potentialAction', 'target', 'inLanguage')), + 'thumbnail': traverse_obj(data, ('image', 'contentUrl')), + 'description': data.get('description'), + 'release_timestamp': release_date, + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': format_field(channel_id, None, 'https://rad.live/content/channel/%s'), + + } + if content_type == 'episode': + result.update({ + # TODO: Get season number when downloading single episode + 'episode': video_info.get('title'), + 'episode_number': video_info.get('number'), + 'episode_id': video_info.get('id'), + }) + + return result + + +class RadLiveSeasonIE(RadLiveIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'radlive:season' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/season/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/season/08a290f7-c9ef-4e22-9105-c255995a2e75', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': '08a290f7-c9ef-4e22-9105-c255995a2e75', + 'title': 'Bad Jokes - Season 1', + }, + 
'playlist_mincount': 5, + }] + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveSeasonIE, cls).suitable(url) + + def _real_extract(self, url): + season_id = self._match_id(url) + webpage = self._download_webpage(url, season_id) + + content_info = json.loads(self._search_regex( + r'<script[^>]*type=([\'"])application/json\1[^>]*>(?P<json>{.+?})</script>', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + video_info = content_info['season'] + + entries = [{ + '_type': 'url_transparent', + 'id': episode['structured_data']['url'].split('/')[-1], + 'url': episode['structured_data']['url'], + 'series': try_get(content_info, lambda x: x['series']['title']), + 'season': video_info['title'], + 'season_number': video_info.get('number'), + 'season_id': video_info.get('id'), + 'ie_key': RadLiveIE.ie_key(), + } for episode in video_info['episodes']] + + return self.playlist_result(entries, season_id, video_info.get('title')) + + +class RadLiveChannelIE(RadLiveIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'radlive:channel' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/channel/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/channel/5c4d8df4-6fa0-413c-81e3-873479b49274', + 'md5': '625156a08b7f2b0b849f234e664457ac', + 'info_dict': { + 'id': '5c4d8df4-6fa0-413c-81e3-873479b49274', + 'title': 'Whistle Sports', + }, + 'playlist_mincount': 7, + }] + + _QUERY = ''' +query WebChannelListing ($lrn: ID!) { + channel (id:$lrn) { + name + features { + structured_data + } + } +}''' + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveChannelIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + graphql = self._download_json( + 'https://content.mhq.12core.net/graphql', channel_id, + headers={'Content-Type': 'application/json'}, + data=json.dumps({ + 'query': self._QUERY, + 'variables': {'lrn': f'lrn:12core:media:content:channel:{channel_id}'} + }).encode('utf-8')) + + data = traverse_obj(graphql, ('data', 'channel')) + if not data: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + entries = [{ + '_type': 'url_transparent', + 'url': feature['structured_data']['url'], + 'ie_key': RadLiveIE.ie_key(), + } for feature in data['features']] + + return self.playlist_result(entries, channel_id, data.get('name')) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py new file mode 100644 index 0000000..c1fc65c --- /dev/null +++ b/yt_dlp/extractor/rai.py @@ -0,0 +1,816 @@ +import re + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + filter_dict, + GeoRestrictedError, + int_or_none, + join_nonempty, + parse_duration, + remove_start, + strip_or_none, + traverse_obj, + try_get, + unified_strdate, + unified_timestamp, + update_url_query, + urljoin, + xpath_text, +) + + +class RaiBaseIE(InfoExtractor): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _fix_m3u8_formats(self, media_url, video_id): + fmts = self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + # Fix malformed m3u8 manifests by setting audio-only/video-only formats + for f in fmts: + if not f.get('acodec'): + f['acodec'] = 'mp4a' + if not f.get('vcodec'): + f['vcodec'] = 'avc1' + man_url = f['url'] + if 
re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only + f['vcodec'] = 'none' + elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only + f['acodec'] = 'none' + else: # video+audio + if f['acodec'] == 'none': + f['acodec'] = 'mp4a' + if f['vcodec'] == 'none': + f['vcodec'] = 'avc1' + + return fmts + + def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): + def fix_cdata(s): + # remove \r\n\t before and after <![CDATA[ ]]> to avoid + # polluted text with xpath_text + s = re.sub(r'(\]\]>)[\r\n\t]+(</)', '\\1\\2', s) + return re.sub(r'(>)[\r\n\t]+(<!\[CDATA\[)', '\\1\\2', s) + + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + + # set User-Agent to generic 'Rai' to avoid quality filtering from + # the media server and get the maximum qualities available + relinker = self._download_xml( + relinker_url, video_id, note='Downloading XML metadata', + transform_source=fix_cdata, query={'output': 64}, + headers={**self.geo_verification_headers(), 'User-Agent': 'Rai'}) + + if xpath_text(relinker, './license_url', default='{}') != '{}': + self.report_drm(video_id) + + is_live = xpath_text(relinker, './is_live', default='N') == 'Y' + duration = parse_duration(xpath_text(relinker, './duration', default=None)) + media_url = xpath_text(relinker, './url[@type="content"]', default=None) + + if not media_url: + self.raise_no_formats('The relinker returned no media url') + + # geo flag is a bit unreliable and not properly set all the time + geoprotection = xpath_text(relinker, './geoprotection', default='N') == 'Y' + + ext = determine_ext(media_url) + formats = [] + + if ext == 'mp3': + formats.append({ + 'url': media_url, + 'vcodec': 'none', + 'acodec': 'mp3', + 'format_id': 'https-mp3', + }) + elif ext == 'm3u8' or 'format=m3u8' in media_url: + formats.extend(self._fix_m3u8_formats(media_url, video_id)) + elif ext == 'f4m': + # very likely no longer needed. Cannot find any url that uses it. 
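+            # Rewrite the legacy Akamai live-HDS manifest path and add the query
+            # parameters the HDS player expects before extracting f4m formats.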
+ manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mp4': + bitrate = int_or_none(xpath_text(relinker, './bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': join_nonempty('https', bitrate, delim='-'), + }) + else: + raise ExtractorError('Unrecognized media file found') + + if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + + if not audio_only and not is_live: + formats.extend(self._create_http_urls(media_url, relinker_url, formats, video_id)) + + return filter_dict({ + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }) + + def _create_http_urls(self, manifest_url, relinker_url, fmts, video_id): + _MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8' + _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' + _QUALITY = { + # tbr: w, h + 250: [352, 198], + 400: [512, 288], + 600: [512, 288], + 700: [512, 288], + 800: [700, 394], + 1200: [736, 414], + 1500: [920, 518], + 1800: [1024, 576], + 2400: [1280, 720], + 3200: [1440, 810], + 3600: [1440, 810], + 5000: [1920, 1080], + 10000: [1920, 1080], + } + + def percentage(number, target, pc=20, roof=125): + '''check if the target is in the range of number +/- percent''' + if not number or number < 0: + return False + return abs(target - number) < min(float(number) * float(pc) / 100.0, roof) + + def get_format_info(tbr): + import math + br = int_or_none(tbr) + if len(fmts) == 1 and not br: + br = fmts[0].get('tbr') + if br and br > 300: + tbr = math.floor(br / 100) * 100 + else: + tbr = 250 + + # try extracting info from available m3u8 formats + format_copy = [None, None] + for f in fmts: + if f.get('tbr'): + if percentage(tbr, f['tbr']): + format_copy[0] = f.copy() + if [f.get('width'), f.get('height')] == _QUALITY.get(tbr): + format_copy[1] = f.copy() + format_copy[1]['tbr'] = tbr + + # prefer format with similar bitrate because there might be + # multiple video with the same resolution but different bitrate + format_copy = format_copy[0] or format_copy[1] or {} + return { + 'format_id': f'https-{tbr}', + 'width': format_copy.get('width'), + 'height': format_copy.get('height'), + 'tbr': format_copy.get('tbr') or tbr, + 'vcodec': format_copy.get('vcodec') or 'avc1', + 'acodec': format_copy.get('acodec') or 'mp4a', + 'fps': format_copy.get('fps') or 25, + } if format_copy else { + 'format_id': f'https-{tbr}', + 'width': _QUALITY[tbr][0], + 'height': _QUALITY[tbr][1], + 'tbr': tbr, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + 'fps': 25, + } + + # Check if MP4 download is available + try: + self._request_webpage( + HEADRequest(_MP4_TMPL % (relinker_url, '*')), video_id, 'Checking MP4 availability') + except ExtractorError as e: + self.to_screen(f'{video_id}: MP4 direct download is not available: {e.cause}') + return [] + + # filter out single-stream formats + fmts = [f for f in fmts + if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none'] + + mobj = re.search(_MANIFEST_REG, manifest_url) + if not mobj: + return [] + available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] + + formats = [] + for q in filter(None, available_qualities): + self.write_debug(f'Creating https 
format for quality {q}') + formats.append({ + 'url': _MP4_TMPL % (relinker_url, q), + 'protocol': 'https', + 'ext': 'mp4', + **get_format_info(q) + }) + return formats + + @staticmethod + def _get_thumbnails_list(thumbs, url): + return [{ + 'url': urljoin(url, thumb_url), + } for thumb_url in (thumbs or {}).values() if thumb_url] + + @staticmethod + def _extract_subtitles(url, video_data): + STL_EXT = 'stl' + SRT_EXT = 'srt' + subtitles = {} + subtitles_array = video_data.get('subtitlesArray') or video_data.get('subtitleList') or [] + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) + for subtitle in subtitles_array: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, str): + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) + subtitles.setdefault(sub_lang, []).append({ + 'ext': sub_ext, + 'url': sub_url, + }) + if STL_EXT == sub_ext: + subtitles[sub_lang].append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' + _TESTS = [{ + 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', + 'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 6160, + 'series': 'Report', + 'season': '2013/14', + 'subtitles': {'it': 'count:4'}, + 'release_year': 2024, + 'episode': 'Espresso nel caffè - 07/04/2014', + 'timestamp': 1396919880, + 'upload_date': '20140408', + 'formats': 'count:4', + }, + 'params': {'skip_download': True}, + }, { + # 1080p + 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', + 'md5': 'aeda7243115380b2dd5e881fd42d949a', + 'info_dict': { + 'id': 'b1255a4a-8e72-4a2f-b9f3-fc1308e00736', + 'ext': 'mp4', + 'title': 'Blanca - S1E1 - Senza occhi', + 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi', + 'description': 'md5:75f95d5c030ec8bac263b1212322e28c', + 'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg', + 'uploader': 'Rai Premium', + 'creator': 'Rai Fiction', + 'duration': 6493, + 'series': 'Blanca', + 'season': 'Season 1', + 'episode_number': 1, + 'release_year': 2021, + 'season_number': 1, + 'episode': 'Senza occhi', + 'timestamp': 1637318940, + 'upload_date': '20211119', + 'formats': 'count:7', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. 
Likely due to geo-restriction.'] + }, { + # 1500 quality + 'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html', + 'md5': 'a634d20e8ab2d43724c273563f6bf87a', + 'info_dict': { + 'id': '0cab3323-732e-45d6-8e86-7704acab6598', + 'ext': 'mp4', + 'title': 'Mia and Me - S1E11 - Tutto ciò che luccica', + 'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica', + 'description': 'md5:4969e594184b1920c4c1f2b704da9dea', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'series': 'Mia and Me', + 'season': 'Season 1', + 'episode_number': 11, + 'release_year': 2015, + 'season_number': 1, + 'episode': 'Tutto ciò che luccica', + 'timestamp': 1348495020, + 'upload_date': '20120924', + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.raiplay.it/video/2021/06/Lo-straordinario-mondo-di-Zoey-S2E1-Lo-straordinario-ritorno-di-Zoey-3ba992de-2332-41ad-9214-73e32ab209f4.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + base, video_id = self._match_valid_url(url).groups() + + media = self._download_json( + f'{base}.json', video_id, 'Downloading video JSON') + + if not self.get_param('allow_unplayable_formats'): + if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): + self.report_drm(video_id) + + video = media['video'] + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + date_published = join_nonempty( + media.get('date_published'), media.get('time_published'), delim=' ') + season = media.get('season') + alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') + + return { + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, + 'title': media.get('name'), + 'alt_title': strip_or_none(alt_title or None), + 'description': media.get('description'), + 'uploader': strip_or_none( + traverse_obj(media, ('program_info', 'channel')) + or media.get('channel') or None), + 'creator': strip_or_none( + traverse_obj(media, ('program_info', 'editor')) + or media.get('editor') or None), + 'duration': parse_duration(video.get('duration')), + 'timestamp': unified_timestamp(date_published), + 'thumbnails': self._get_thumbnails_list(media.get('images'), url), + 'series': traverse_obj(media, ('program_info', 'name')), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), + 'subtitles': self._extract_subtitles(url, video), + 'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), + **relinker_info + } + + +class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', + 'uploader': 'Rai News 24', + 
'creator': 'Rai News 24', + 'is_live': True, + 'live_status': 'is_live', + 'upload_date': '20090502', + 'timestamp': 1241276220, + 'formats': 'count:3', + }, + 'params': {'skip_download': True}, + }] + + +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' + _TESTS = [{ + # entire series episodes + extras... + 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', + }, + 'playlist_mincount': 30, + }, { + # single season + 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo - Stagione 2', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', + }, + 'playlist_count': 12, + }] + + def _real_extract(self, url): + base, playlist_id, extra_id = self._match_valid_url(url).groups() + + program = self._download_json( + f'{base}.json', playlist_id, 'Downloading program JSON') + + if extra_id: + extra_id = extra_id.upper().rstrip('/') + + playlist_title = program.get('name') + entries = [] + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + if extra_id: + if extra_id != join_nonempty( + b.get('name'), s.get('name'), delim='/').replace(' ', '-').upper(): + continue + playlist_title = join_nonempty(playlist_title, s.get('name'), delim=' - ') + + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + f'{base}/{s_id}.json', s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result( + entries, playlist_id, playlist_title, + try_get(program, lambda x: x['program_info']['description'])) + + +class RaiPlaySoundIE(RaiBaseIE): + _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' + _TESTS = [{ + 'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707', + 'ext': 'mp3', + 'title': 'Il Ruggito del Coniglio del 10/12/2021', + 'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', + 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', + 'thumbnail': r're:^https?://.+\.jpg$', + 'uploader': 'rai radio 2', + 'duration': 5685, + 'series': 'Il Ruggito del Coniglio', + 'episode': 'Il Ruggito del Coniglio del 10/12/2021', + 'creator': 'rai radio 2', + 'timestamp': 1638346620, + 'upload_date': '20211201', + }, + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + base, audio_id = self._match_valid_url(url).group('base', 'id') + media = self._download_json(f'{base}.json', audio_id, 'Downloading audio JSON') + uid = try_get(media, lambda x: remove_start(remove_start(x['uniquename'], 'ContentItem-'), 'Page-')) + + info = {} + formats = [] + relinkers = set(traverse_obj(media, (('downloadable_audio', 'audio', ('live', 'cards', 0, 'audio')), 'url'))) + for r in relinkers: + info = self._extract_relinker_info(r, audio_id, True) + formats.extend(info.get('formats')) + + 
date_published = try_get(media, (lambda x: f'{x["create_date"]} {x.get("create_time") or ""}', + lambda x: x['live']['create_date'])) + + podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {} + + return { + **info, + 'id': uid or audio_id, + 'display_id': audio_id, + 'title': traverse_obj(media, 'title', 'episode_title'), + 'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none), + 'description': media.get('description'), + 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), + 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), + 'timestamp': unified_timestamp(date_published), + 'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url), + 'series': podcast_info.get('title'), + 'season_number': int_or_none(media.get('season')), + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), + 'formats': formats, + } + + +class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete IE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)' + _TESTS = [{ + 'url': 'https://www.raiplaysound.it/radio2', + 'info_dict': { + 'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44', + 'display_id': 'radio2', + 'ext': 'mp4', + 'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', + 'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png', + 'uploader': 'rai radio 2', + 'series': 'Rai Radio 2', + 'creator': 'raiplaysound', + 'is_live': True, + 'live_status': 'is_live', + }, + 'params': {'skip_download': True}, + }] + + +class RaiPlaySoundPlaylistIE(InfoExtractor): + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' 
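+    # Matches show pages (/programmi/), curated playlists (/playlist/) and
+    # audiobooks (/audiolibri/), with an optional season/sub-page suffix.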
+ _TESTS = [{ + # entire show + 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio', + 'info_dict': { + 'id': 'ilruggitodelconiglio', + 'title': 'Il Ruggito del Coniglio', + 'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e', + }, + 'playlist_mincount': 65, + }, { + # single season + 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995', + 'info_dict': { + 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995', + 'title': 'Prima Stagione 1995', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id') + url = f'{base}.json' + program = self._download_json(url, playlist_id, 'Downloading program JSON') + + if extra_id: + extra_id = extra_id.rstrip('/') + playlist_id += '_' + extra_id.replace('/', '_') + path = next(c['path_id'] for c in program.get('filters') or [] if extra_id in c.get('weblink')) + program = self._download_json( + urljoin('https://www.raiplaysound.it', path), playlist_id, 'Downloading program secondary JSON') + + entries = [ + self.url_result(urljoin(base, c['path_id']), ie=RaiPlaySoundIE.ie_key()) + for c in traverse_obj(program, 'cards', ('block', 'cards')) or [] + if c.get('path_id')] + + return self.playlist_result(entries, playlist_id, program.get('title'), + traverse_obj(program, ('podcast_info', 'description'))) + + +class RaiIE(RaiBaseIE): + _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' + _TESTS = [{ + 'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1758, + 'upload_date': '20140612', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] + }, { + 'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103' + }, + 'params': {'skip_download': True}, + }, { + # Direct MMS: Media URL no longer works. 
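+        # (kept as an only_matching test so the URL pattern stays covered)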
+ 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + content_id = self._match_id(url) + media = self._download_json( + f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', + content_id, 'Downloading video JSON', fatal=False, expected_status=404) + + if media is None: + return None + + if 'Audio' in media['type']: + relinker_info = { + 'formats': [{ + 'format_id': join_nonempty('https', media.get('formatoAudio'), delim='-'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + 'vcodec': 'none', + 'acodec': media.get('formatoAudio'), + }] + } + elif 'Video' in media['type']: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + thumbnails = self._get_thumbnails_list( + {image_type: media.get(image_type) for image_type in ( + 'image', 'image_medium', 'image_300')}, url) + + return { + 'id': content_id, + 'title': strip_or_none(media.get('name') or media.get('title')), + 'description': strip_or_none(media.get('desc')) or None, + 'thumbnails': thumbnails, + 'uploader': strip_or_none(media.get('author')) or None, + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'subtitles': self._extract_subtitles(url, media), + **relinker_info + } + + +class RaiNewsIE(RaiBaseIE): + _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] + _TESTS = [{ + # new rainews player (#3911) + 'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html', + 'info_dict': { + 'id': '31e8017c-845c-43f5-9c48-245b43c3a079', + 'ext': 'mp4', + 'title': 'md5:1e81364b09de4a149042bac3c7d36f0b', + 'duration': 196, + 'upload_date': '20240225', + 'uploader': 'rainews', + 'formats': 'count:2', + }, + 'params': {'skip_download': True}, + }, { + # old content with fallback method to extract media urls + 'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + 'formats': 'count:8', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['unable to extract player_data'], + }, { + # iframe + drm + 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', + 'only_matching': True, + }] + _PLAYER_TAG = 'news' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player_data = self._search_json( + rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id, + transform_source=clean_html, default={}) + track_info = player_data.get('track_info') + relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') + + if 
not relinker_url: + # fallback on old implementation for some old content + try: + return RaiIE._real_extract(self, url) + except GeoRestrictedError: + raise + except ExtractorError as e: + raise ExtractorError('Relinker URL not found', cause=e) + + relinker_info = self._extract_relinker_info(urljoin(url, relinker_url), video_id) + + return { + 'id': video_id, + 'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage), + 'upload_date': unified_strdate(track_info.get('date')), + 'uploader': strip_or_none(track_info.get('editor') or None), + **relinker_info + } + + +class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE + _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] + _TESTS = [{ + 'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html', + 'info_dict': { + 'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb', + 'ext': 'mp4', + 'title': 'Alberto Asor Rosa: Letteratura e potere', + 'duration': 1756, + 'upload_date': '20181206', + 'uploader': 'raicultura', + 'formats': 'count:2', + }, + 'params': {'skip_download': True}, + }] + _PLAYER_TAG = 'cultura' + + +class RaiSudtirolIE(RaiBaseIE): + _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)' + _TESTS = [{ + # mp4 file + 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', + 'info_dict': { + 'id': 'Ptv1619729460', + 'ext': 'mp4', + 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', + 'series': 'Euro: trasmisciun d\'economia', + 'upload_date': '20210429', + 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg', + 'uploader': 'raisudtirol', + 'formats': 'count:1', + }, + 'params': {'skip_download': True}, + }, { + # m3u manifest + 'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil', + 'info_dict': { + 'id': 'GUGGUG_P1', + 'ext': 'mp4', + 'title': 'GUGGUG! 
La Prospettiva - Die Perspektive', + 'uploader': 'raisudtirol', + 'formats': 'count:6', + }, + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_date = self._html_search_regex( + r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None) + video_title = self._html_search_regex([ + r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\','], + webpage, 'video_title', default=None) + video_url = self._html_search_regex([ + r'sources:\s*\[\{file:\s*"(.+?)"\}\]', + r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"'], + webpage, 'video_url', default=None) + + ext = determine_ext(video_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(video_url, video_id) + elif ext == 'mp4': + formats = [{ + 'format_id': 'https-mp4', + 'url': self._proto_relative_url(video_url), + 'width': 1024, + 'height': 576, + 'fps': 25, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + }] + else: + formats = [] + self.raise_no_formats(f'Unrecognized media file: {video_url}') + + return { + 'id': video_id, + 'title': join_nonempty(video_title, video_date, delim=' - '), + 'series': video_title if video_date else None, + 'upload_date': unified_strdate(video_date), + 'thumbnail': urljoin('https://raisudtirol.rai.it/', self._html_search_regex( + r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)), + 'uploader': 'raisudtirol', + 'formats': formats, + } diff --git a/yt_dlp/extractor/raywenderlich.py b/yt_dlp/extractor/raywenderlich.py new file mode 100644 index 0000000..e0e3c3e --- /dev/null +++ b/yt_dlp/extractor/raywenderlich.py @@ -0,0 +1,177 @@ +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) 
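+        # Lessons are Vimeo embeds: the Vimeo ID is read from the page markup
+        # when present, otherwise resolved through the videos JSON API below.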
+ + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', + 'info_dict': { + 'title': 'Testing in iOS', + 'id': '3530-testing-in-ios', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 29, + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + entries = [] + lesson_urls = set() + for lesson_url in re.findall( + r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) + entries.append(self.url_result( + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) + + return self.playlist_result(entries, course_id, title) diff --git a/yt_dlp/extractor/rbgtum.py b/yt_dlp/extractor/rbgtum.py new file mode 100644 index 0000000..54f194c --- /dev/null +++ b/yt_dlp/extractor/rbgtum.py @@ -0,0 +1,142 @@ +import re + +from .common import InfoExtractor +from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError + + +class RbgTumIE(InfoExtractor): + _VALID_URL = r'https?://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)' + _TESTS = [{ + # Combined view + 'url': 'https://live.rbg.tum.de/w/cpp/22128', + 'md5': '53a5e7b3e07128e33bbf36687fe1c08f', + 'info_dict': { + 'id': 'cpp/22128', + 'ext': 'mp4', + 'title': 'Lecture: October 18. 
2022', + 'series': 'Concepts of C++ programming (IN2377)', + } + }, { + # Presentation only + 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES', + 'md5': '36c584272179f3e56b0db5d880639cba', + 'info_dict': { + 'id': 'I2DL/12349/PRES', + 'ext': 'mp4', + 'title': 'Lecture 3: Introduction to Neural Networks', + 'series': 'Introduction to Deep Learning (IN2346)', + } + }, { + # Camera only + 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM', + 'md5': 'e04189d92ff2f56aedf5cede65d37aad', + 'info_dict': { + 'id': 'fvv-info/16130/CAM', + 'ext': 'mp4', + 'title': 'Fachschaftsvollversammlung', + 'series': 'Fachschaftsvollversammlung Informatik', + } + }, { + 'url': 'https://tum.live/w/linalginfo/27102', + 'only_matching': True, + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8') + lecture_title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') + + formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': lecture_title, + 'series': lecture_series_title, + 'formats': formats, + } + + +class RbgTumCourseIE(InfoExtractor): + _VALID_URL = r'https?://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P<id>(?P<year>\d+)/(?P<term>\w+)/(?P<slug>[^/?#]+))' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/old/course/2022/W/set', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, { + 'url': 'https://tum.live/old/course/2023/S/linalginfo', + 'only_matching': True, + }, ] + + def _real_extract(self, url): + course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug') + meta = self._download_json( + f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False, + query={'year': year, 'term': term}) or {} + lecture_series_title = meta.get('Name') + lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE) + for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))] + + if not lectures: + webpage = self._download_webpage(url, course_id) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') + lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE) + for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)] + + return self.playlist_result(lectures, course_id, lecture_series_title) + + +class RbgTumNewCourseIE(InfoExtractor): + _VALID_URL = r'https?://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/\?' 
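+    # New-style course URLs carry year/term/slug as query parameters; they are
+    # resolved by redirecting to the equivalent /old/course/ URL handled above.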
+ _TESTS = [{ + 'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, { + 'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3', + 'only_matching': True, + }] + + def _real_extract(self, url): + query = parse_qs(url) + errors = [key for key in ('year', 'term', 'slug') if not query.get(key)] + if errors: + raise ExtractorError(f'Input URL is missing query parameters: {", ".join(errors)}') + year, term, slug = query['year'][0], query['term'][0], query['slug'][0] + hostname = self._match_valid_url(url).group('hostname') + + return self.url_result(f'https://{hostname}/old/course/{year}/{term}/{slug}', RbgTumCourseIE) diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py new file mode 100644 index 0000000..b865f63 --- /dev/null +++ b/yt_dlp/extractor/rcs.py @@ -0,0 +1,372 @@ +import re + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + ExtractorError, + base_url, + clean_html, + extract_attributes, + get_element_html_by_class, + get_element_html_by_id, + int_or_none, + js_to_json, + mimetype2ext, + sanitize_url, + traverse_obj, + try_call, + url_basename, + urljoin, +) + + +class RCSBaseIE(InfoExtractor): + # based on VideoPlayerLoader.prototype.getVideoSrc + # and VideoPlayerLoader.prototype.transformSrc from + # https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _RCS_ID_RE = r'[\w-]+-\d{10}' + _MIGRATION_MAP = { + 'videoamica-vh.akamaihd': 'amica', + 'media2-amica-it.akamaized': 'amica', + 'corrierevam-vh.akamaihd': 'corriere', + 'media2vam-corriere-it.akamaized': 'corriere', + 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno', + 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno', + 'corveneto-vh.akamaihd': 'corrieredelveneto', + 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto', + 'corbologna-vh.akamaihd': 'corrieredibologna', + 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna', + 'corfiorentino-vh.akamaihd': 'corrierefiorentino', + 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino', + 'corinnovazione-vh.akamaihd': 'corriereinnovazione', + 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet', + 'videogazzanet-vh.akamaihd': 'gazzanet', + 'videogazzaworld-vh.akamaihd': 'gazzaworld', + 'gazzettavam-vh.akamaihd': 'gazzetta', + 'media2vam-gazzetta-it.akamaized': 'gazzetta', + 'videoiodonna-vh.akamaihd': 'iodonna', + 'media2-leitv-it.akamaized': 'leitv', + 'videoleitv-vh.akamaihd': 'leitv', + 'videoliving-vh.akamaihd': 'living', + 'media2-living-corriere-it.akamaized': 'living', + 'media2-oggi-it.akamaized': 'oggi', + 'videooggi-vh.akamaihd': 'oggi', + 'media2-quimamme-it.akamaized': 'quimamme', + 'quimamme-vh.akamaihd': 'quimamme', + 'videorunning-vh.akamaihd': 'running', + 'media2-style-corriere-it.akamaized': 'style', + 'style-vh.akamaihd': 'style', + 'videostyle-vh.akamaihd': 'style', + 'media2-stylepiccoli-it.akamaized': 'stylepiccoli', + 'stylepiccoli-vh.akamaihd': 'stylepiccoli', + 'doveviaggi-vh.akamaihd': 'viaggi', + 'media2-doveviaggi-it.akamaized': 'viaggi', + 
'media2-vivimilano-corriere-it.akamaized': 'vivimilano', + 'vivimilano-vh.akamaihd': 'vivimilano', + 'media2-youreporter-it.akamaized': 'youreporter' + } + + def _get_video_src(self, video): + for source in traverse_obj(video, ( + 'mediaProfile', 'mediaFile', lambda _, v: v.get('mimeType'))): + url = source['value'] + for s, r in ( + ('media2vam.corriere.it.edgesuite.net', 'media2vam-corriere-it.akamaized.net'), + ('media.youreporter.it.edgesuite.net', 'media-youreporter-it.akamaized.net'), + ('corrierepmd.corriere.it.edgesuite.net', 'corrierepmd-corriere-it.akamaized.net'), + ('media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/', 'video.corriere.it/vr360/videos/'), + ('http://', 'https://'), + ): + url = url.replace(s, r) + + type_ = mimetype2ext(source['mimeType']) + if type_ == 'm3u8' and '-vh.akamaihd' in url: + # still needed for some old content: see _TESTS #3 + matches = re.search(r'(?:https?:)?//(?P<host>[\w\.\-]+)\.net/i(?P<path>.+)$', url) + if matches: + url = f'https://vod.rcsobjects.it/hls/{self._MIGRATION_MAP[matches.group("host")]}{matches.group("path")}' + if traverse_obj(video, ('mediaProfile', 'geoblocking')) or ( + type_ == 'm3u8' and 'fcs.quotidiani_!' in url): + url = url.replace('vod.rcsobjects', 'vod-it.rcsobjects') + if type_ == 'm3u8' and 'vod' in url: + url = url.replace('.csmil', '.urlset') + if type_ == 'mp3': + url = url.replace('media2vam-corriere-it.akamaized.net', 'vod.rcsobjects.it/corriere') + + yield { + 'type': type_, + 'url': url, + 'bitrate': source.get('bitrate') + } + + def _create_http_formats(self, m3u8_formats, video_id): + for f in m3u8_formats: + if f['vcodec'] == 'none': + continue + http_url = re.sub(r'(https?://[^/]+)/hls/([^?#]+?\.mp4).+', r'\g<1>/\g<2>', f['url']) + if http_url == f['url']: + continue + + http_f = f.copy() + del http_f['manifest_url'] + format_id = try_call(lambda: http_f['format_id'].replace('hls-', 'https-')) + urlh = self._request_webpage(HEADRequest(http_url), video_id, fatal=False, + note=f'Check filesize for {format_id}') + if not urlh: + continue + + http_f.update({ + 'format_id': format_id, + 'url': http_url, + 'protocol': 'https', + 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)), + }) + yield http_f + + def _create_formats(self, sources, video_id): + for source in sources: + if source['type'] == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + source['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) + yield from m3u8_formats + yield from self._create_http_formats(m3u8_formats, video_id) + elif source['type'] == 'mp3': + yield { + 'format_id': 'https-mp3', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + 'abr': source.get('bitrate'), + 'url': source['url'], + } + + def _real_extract(self, url): + cdn, video_id = self._match_valid_url(url).group('cdn', 'id') + display_id, video_data = None, None + + if re.match(self._UUID_RE, video_id) or re.match(self._RCS_ID_RE, video_id): + url = f'https://video.{cdn}/video-json/{video_id}' + else: + webpage = self._download_webpage(url, video_id) + data_config = get_element_html_by_id('divVideoPlayer', webpage) or get_element_html_by_class('divVideoPlayer', webpage) + + if data_config: + data_config = self._parse_json( + extract_attributes(data_config).get('data-config'), + video_id, fatal=False) or {} + if data_config.get('newspaper'): + cdn = f'{data_config["newspaper"]}.it' + display_id, video_id = video_id, data_config.get('uuid') or video_id + url = f'https://video.{cdn}/video-json/{video_id}' + else: + json_url = 
self._search_regex( + r'''(?x)url\s*=\s*(["']) + (?P<url> + (?:https?:)?//video\.rcs\.it + /fragment-includes/video-includes/[^"']+?\.json + )\1;''', + webpage, video_id, group='url', default=None) + if json_url: + video_data = self._download_json(sanitize_url(json_url, scheme='https'), video_id) + display_id, video_id = video_id, video_data.get('id') or video_id + + if not video_data: + webpage = self._download_webpage(url, video_id) + + video_data = self._search_json( + '##start-video##', webpage, 'video data', video_id, default=None, + end_pattern='##end-video##', transform_source=js_to_json) + + if not video_data: + # try search for iframes + emb = RCSEmbedsIE._extract_url(webpage) + if emb: + return { + '_type': 'url_transparent', + 'url': emb, + 'ie_key': RCSEmbedsIE.ie_key() + } + + if not video_data: + raise ExtractorError('Video data not found in the page') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': video_data.get('title'), + 'description': (clean_html(video_data.get('description')) + or clean_html(video_data.get('htmlDescription')) + or self._html_search_meta('description', webpage)), + 'uploader': video_data.get('provider') or cdn, + 'formats': list(self._create_formats(self._get_video_src(video_data), video_id)), + } + + +class RCSEmbedsIE(RCSBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<vid>video)\. + (?P<cdn> + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + )\.it) + /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' + _EMBED_REGEX = [r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["']) + (?P<url>(?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) + \1'''] + _TESTS = [{ + 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', + 'md5': '0faca97df525032bb9847f690bc3720c', + 'info_dict': { + 'id': 'iodonna-0001585037', + 'ext': 'mp4', + 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"', + 'description': 'md5:65b09633df9ffee57f48b39e34c9e067', + 'uploader': 'rcs.it', + } + }, { + 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'only_matching': True + }, { + 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', + 'only_matching': True + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/', + 'info_dict': { + 'id': 'iodonna-0002033648', + 'ext': 'mp4', + 'title': 'Monica Bellucci: «Più del lavoro, oggi per me sono importanti l\'amicizia e la famiglia»', + 'description': 'md5:daea6d9837351e56b1ab615c06bebac1', + 'uploader': 'rcs.it', + } + }] + + @staticmethod + def _sanitize_url(url): + url = sanitize_url(url, scheme='https') + return urljoin(base_url(url), url_basename(url)) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + return map(cls._sanitize_url, super()._extract_embed_urls(url, webpage)) + + +class RCSIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\. + (?P<cdn> + (?: + corrieredelmezzogiorno\. + |corrieredelveneto\. + |corrieredibologna\. + |corrierefiorentino\. 
+ )?corriere\.it + |(?:gazzanet\.)?gazzetta\.it) + /(?!video-embed/)[^?#]+?/(?P<id>[^/\?]+)(?=\?|/$|$)''' + _TESTS = [{ + # json iframe directly from id + 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', + 'md5': '14946840dec46ecfddf66ba4eea7d2b2', + 'info_dict': { + 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb', + 'ext': 'mp4', + 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante', + 'description': 'md5:3915ce5ebb3d2571deb69a5eb85ac9b5', + 'uploader': 'Corriere Tv', + } + }, { + # search for video id inside the page + 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', + 'md5': 'f22a92d9e666e80f2fffbf2825359c81', + 'info_dict': { + 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2', + 'display_id': 'norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen', + 'ext': 'mp4', + 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen', + 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8', + 'uploader': 'DOVE Viaggi', + } + }, { + # only audio format https://github.com/yt-dlp/yt-dlp/issues/5683 + 'url': 'https://video.corriere.it/cronaca/audio-telefonata-il-papa-becciu-santita-lettera-che-mi-ha-inviato-condanna/b94c0d20-70c2-11ed-9572-e4b947a0ebd2', + 'md5': 'aaffb08d02f2ce4292a4654694c78150', + 'info_dict': { + 'id': 'b94c0d20-70c2-11ed-9572-e4b947a0ebd2', + 'ext': 'mp3', + 'title': 'L\'audio della telefonata tra il Papa e Becciu: «Santità, la lettera che mi ha inviato è una condanna»', + 'description': 'md5:c0ddb61bd94a8d4e0d4bb9cda50a689b', + 'uploader': 'Corriere Tv', + 'formats': [{'format_id': 'https-mp3', 'ext': 'mp3'}], + } + }, { + # old content still needs cdn migration + 'url': 'https://viaggi.corriere.it/video/milano-varallo-sesia-sul-treno-a-vapore/', + 'md5': '2dfdce7af249654ad27eeba03fe1e08d', + 'info_dict': { + 'id': 'd8f6c8d0-f7d7-11e8-bfca-f74cf4634191', + 'display_id': 'milano-varallo-sesia-sul-treno-a-vapore', + 'ext': 'mp4', + 'title': 'Milano-Varallo Sesia sul treno a vapore', + 'description': 'md5:6348f47aac230397fe341a74f7678d53', + 'uploader': 'DOVE Viaggi', + } + }, { + 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', + 'only_matching': True + }] + + +class RCSVariousIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://www\. 
+ (?P<cdn> + leitv\.it| + youreporter\.it| + amica\.it + )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)''' + _TESTS = [{ + 'url': 'https://www.leitv.it/benessere/mal-di-testa/', + 'md5': '3b7a683d105a7313ec7513b014443631', + 'info_dict': { + 'id': 'leitv-0000125151', + 'display_id': 'mal-di-testa', + 'ext': 'mp4', + 'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto', + 'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5', + 'uploader': 'leitv.it', + } + }, { + 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/', + 'md5': '3989b6d603482611a2abd2f32b79f739', + 'info_dict': { + 'id': 'youreporter-0000332574', + 'display_id': 'fiume-sesia-3-ottobre-2020', + 'ext': 'mp4', + 'title': 'Fiume Sesia 3 ottobre 2020', + 'description': 'md5:0070eef1cc884d13c970a4125063de55', + 'uploader': 'youreporter.it', + } + }, { + 'url': 'https://www.amica.it/video-post/saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi/', + 'md5': '187cce524dfd0343c95646c047375fc4', + 'info_dict': { + 'id': 'amica-0001225365', + 'display_id': 'saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi', + 'ext': 'mp4', + 'title': '"Saint Omer": al cinema il film Leone d\'argento che ribalta gli stereotipi', + 'description': 'md5:b1c8869c2dcfd6073a2a311ba0008aa8', + 'uploader': 'rcs.it', + } + }] diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py new file mode 100644 index 0000000..6a7c7f3 --- /dev/null +++ b/yt_dlp/extractor/rcti.py @@ -0,0 +1,373 @@ +import json +import random +import time + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + dict_get, + ExtractorError, + strip_or_none, + traverse_obj, + try_get +) + + +class RCTIPlusBaseIE(InfoExtractor): + def _real_initialize(self): + self._AUTH_KEY = self._download_json( + 'https://api.rctiplus.com/api/v1/visitor?platform=web', # platform can be web, mweb, android, ios + None, 'Fetching authorization key')['data']['access_token'] + + def _call_api(self, url, video_id, note=None): + json = self._download_json( + url, video_id, note=note, headers={'Authorization': self._AUTH_KEY}) + if json.get('status', {}).get('code', 0) != 0: + raise ExtractorError(f'{self.IE_NAME} said: {json["status"]["message_client"]}', cause=json) + return json.get('data'), json.get('meta') + + +class RCTIPlusIE(RCTIPlusBaseIE): + _VALID_URL = r'https?://www\.rctiplus\.com/(?:programs/\d+?/.*?/)?(?P<type>episode|clip|extra|live-event|missed-event)/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.rctiplus.com/programs/1259/kiko-untuk-lola/episode/22124/untuk-lola', + 'md5': '56ed45affad45fa18d5592a1bc199997', + 'info_dict': { + 'id': 'v_e22124', + 'title': 'Untuk Lola', + 'display_id': 'untuk-lola', + 'description': 'md5:2b809075c0b1e071e228ad6d13e41deb', + 'ext': 'mp4', + 'duration': 1400, + 'timestamp': 1615978800, + 'upload_date': '20210317', + 'series': 'Kiko : Untuk Lola', + 'season_number': 1, + 'episode_number': 1, + 'channel': 'RCTI', + }, + 'params': { + 'fixup': 'never', + }, + }, { # Clip; Series title doesn't appear on metadata JSON + 'url': 'https://www.rctiplus.com/programs/316/cahaya-terindah/clip/3921/make-a-wish', + 'md5': 'd179b2ff356f0e91a53bcc6a4d8504f0', + 'info_dict': { + 'id': 'v_c3921', + 'title': 'Make A Wish', + 'display_id': 'make-a-wish', + 'description': 'Make A Wish', + 'ext': 'mp4', + 'duration': 288, + 'timestamp': 1571652600, + 'upload_date': '20191021', + 'series': 'Cahaya Terindah', + 'channel': 'RCTI', + }, + 
'params': { + 'fixup': 'never', + }, + }, { # Extra + 'url': 'https://www.rctiplus.com/programs/616/inews-malam/extra/9438/diungkapkan-melalui-surat-terbuka-ceo-ruangguru-belva-devara-mundur-dari-staf-khusus-presiden', + 'md5': 'c48106afdbce609749f5e0c007d9278a', + 'info_dict': { + 'id': 'v_ex9438', + 'title': 'md5:2ede828c0f8bde249e0912be150314ca', + 'display_id': 'md5:62b8d4e9ff096db527a1ad797e8a9933', + 'description': 'md5:2ede828c0f8bde249e0912be150314ca', + 'ext': 'mp4', + 'duration': 93, + 'timestamp': 1587561540, + 'upload_date': '20200422', + 'series': 'iNews Malam', + 'channel': 'INews', + }, + }, { # Missed event/replay + 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib', + 'md5': '649c5f27250faed1452ca8b91e06922d', + 'info_dict': { + 'id': 'v_pe2507', + 'title': 'MOU Signing Ceremony | 27 Juli 2021 | 14.00 WIB', + 'display_id': 'mou-signing-ceremony-27-juli-2021-1400-wib', + 'ext': 'mp4', + 'timestamp': 1627142400, + 'upload_date': '20210724', + 'was_live': True, + 'release_timestamp': 1627369200, + }, + 'params': { + 'fixup': 'never', + }, + }, { # Live event; Cloudfront CDN + 'url': 'https://www.rctiplus.com/live-event/2530/dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib', + 'info_dict': { + 'id': 'v_le2530', + 'title': 'Dai Muda : Charging Imun dengan Iman | 4 Agustus 2021 | 16.00 WIB', + 'display_id': 'dai-muda-charging-imun-dengan-iman-4-agustus-2021-1600-wib', + 'ext': 'mp4', + 'timestamp': 1627898400, + 'upload_date': '20210802', + 'release_timestamp': 1628067600, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This live event has ended.', + }, { # TV; live_at is null + 'url': 'https://www.rctiplus.com/live-event/1/rcti', + 'info_dict': { + 'id': 'v_lt1', + 'title': 'RCTI', + 'display_id': 'rcti', + 'ext': 'mp4', + 'timestamp': 1546344000, + 'upload_date': '20190101', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + }] + _CONVIVA_JSON_TEMPLATE = { + 't': 'CwsSessionHb', + 'cid': 'ff84ae928c3b33064b76dec08f12500465e59a6f', + 'clid': '0', + 'sid': 0, + 'seq': 0, + 'caps': 0, + 'sf': 7, + 'sdk': True, + } + + def _real_extract(self, url): + match = self._match_valid_url(url).groupdict() + video_type, video_id, display_id = match['type'], match['id'], match['display_id'] + + url_api_version = 'v2' if video_type == 'missed-event' else 'v1' + appier_id = '23984824_' + str(random.randint(0, 10000000000)) # Based on the webpage's uuidRandom generator + video_json = self._call_api( + f'https://api.rctiplus.com/api/{url_api_version}/{video_type}/{video_id}/url?appierid={appier_id}', display_id, 'Downloading video URL JSON')[0] + video_url = video_json['url'] + + is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['live_at']) + if is_upcoming is None: + is_upcoming = try_get(video_json, lambda x: x['current_date'] < x['start_date']) + if is_upcoming: + self.raise_no_formats( + 'This event will start at %s.' 
% video_json['live_label'] if video_json.get('live_label') else 'This event has not started yet.', expected=True) + if 'akamaized' in video_url: + # For some videos hosted on Akamai's CDN (possibly AES-encrypted ones?), a session needs to at least be made via Conviva's API + conviva_json_data = { + **self._CONVIVA_JSON_TEMPLATE, + 'url': video_url, + 'sst': int(time.time()) + } + conviva_json_res = self._download_json( + 'https://ff84ae928c3b33064b76dec08f12500465e59a6f.cws.conviva.com/0/wsg', display_id, + 'Creating Conviva session', 'Failed to create Conviva session', + fatal=False, data=json.dumps(conviva_json_data).encode('utf-8')) + if conviva_json_res and conviva_json_res.get('err') != 'ok': + self.report_warning('Conviva said: %s' % str(conviva_json_res.get('err'))) + + video_meta, meta_paths = self._call_api( + 'https://api.rctiplus.com/api/v1/%s/%s' % (video_type, video_id), display_id, 'Downloading video metadata') + + thumbnails, image_path = [], meta_paths.get('image_path', 'https://rstatic.akamaized.net/media/') + if video_meta.get('portrait_image'): + thumbnails.append({ + 'id': 'portrait_image', + 'url': '%s%d%s' % (image_path, 2000, video_meta['portrait_image']) # 2000px seems to be the highest resolution that can be given + }) + if video_meta.get('landscape_image'): + thumbnails.append({ + 'id': 'landscape_image', + 'url': '%s%d%s' % (image_path, 2000, video_meta['landscape_image']) + }) + try: + formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.raise_geo_restricted(countries=['ID'], metadata_available=True) + else: + raise e + for f in formats: + if 'akamaized' in f['url'] or 'cloudfront' in f['url']: + f.setdefault('http_headers', {})['Referer'] = 'https://www.rctiplus.com/' # Referer header is required for akamai/cloudfront CDNs + + return { + 'id': video_meta.get('product_id') or video_json.get('product_id'), + 'title': dict_get(video_meta, ('title', 'name')) or dict_get(video_json, ('content_name', 'assets_name')), + 'display_id': display_id, + 'description': video_meta.get('summary'), + 'timestamp': video_meta.get('release_date') or video_json.get('start_date'), + 'duration': video_meta.get('duration'), + 'categories': [video_meta['genre']] if video_meta.get('genre') else None, + 'average_rating': video_meta.get('star_rating'), + 'series': video_meta.get('program_title') or video_json.get('program_title'), + 'season_number': video_meta.get('season'), + 'episode_number': video_meta.get('episode'), + 'channel': video_json.get('tv_name'), + 'channel_id': video_json.get('tv_id'), + 'formats': formats, + 'thumbnails': thumbnails, + 'is_live': video_type == 'live-event' and not is_upcoming, + 'was_live': video_type == 'missed-event', + 'live_status': 'is_upcoming' if is_upcoming else None, + 'release_timestamp': video_json.get('live_at'), + } + + +class RCTIPlusSeriesIE(RCTIPlusBaseIE): + _VALID_URL = r'https?://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)(?:/(?P<type>episodes|extras|clips))?' 
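+    # Program URLs may carry a trailing /episodes, /extras or /clips segment;
+    # when present, only that content type is extracted, and a warning in
+    # _real_extract tells the user how to get the whole series instead.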
+ _TESTS = [{ + 'url': 'https://www.rctiplus.com/programs/829/putri-untuk-pangeran', + 'playlist_mincount': 1019, + 'info_dict': { + 'id': '829', + 'title': 'Putri Untuk Pangeran', + 'description': 'md5:aca7b54d05bd95a67d4f4613cc1d622d', + 'age_limit': 2, + 'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'], + 'display_id': 'putri-untuk-pangeran', + 'tags': 'count:18', + }, + }, { # No episodes + 'url': 'https://www.rctiplus.com/programs/615/inews-pagi', + 'playlist_mincount': 388, + 'info_dict': { + 'id': '615', + 'title': 'iNews Pagi', + 'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04', + 'age_limit': 2, + 'tags': 'count:11', + 'display_id': 'inews-pagi', + } + }] + _AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings + 'S-SU': 2, + 'SU': 2, + 'P': 2, + 'A': 7, + 'R': 13, + 'R-R/1': 17, # Labelled as 17+ despite being R + 'D': 18, + } + + @classmethod + def suitable(cls, url): + return False if RCTIPlusIE.suitable(url) else super(RCTIPlusSeriesIE, cls).suitable(url) + + def _entries(self, url, display_id=None, note='Downloading entries JSON', metadata={}): + total_pages = 0 + try: + total_pages = self._call_api( + '%s&length=20&page=0' % url, + display_id, note)[1]['pagination']['total_page'] + except ExtractorError as e: + if 'not found' in str(e): + return [] + raise e + if total_pages <= 0: + return [] + + for page_num in range(1, total_pages + 1): + episode_list = self._call_api( + '%s&length=20&page=%s' % (url, page_num), + display_id, '%s page %s' % (note, page_num))[0] or [] + + for video_json in episode_list: + yield { + '_type': 'url', + 'url': video_json['share_link'], + 'ie_key': RCTIPlusIE.ie_key(), + 'id': video_json.get('product_id'), + 'title': video_json.get('title'), + 'display_id': video_json.get('title_code').replace('_', '-'), + 'description': video_json.get('summary'), + 'timestamp': video_json.get('release_date'), + 'duration': video_json.get('duration'), + 'season_number': video_json.get('season'), + 'episode_number': video_json.get('episode'), + **metadata + } + + def _series_entries(self, series_id, display_id=None, video_type=None, metadata={}): + if not video_type or video_type in 'episodes': + try: + seasons_list = self._call_api( + f'https://api.rctiplus.com/api/v1/program/{series_id}/season', + display_id, 'Downloading seasons list JSON')[0] + except ExtractorError as e: + if 'not found' not in str(e): + raise + seasons_list = [] + for season in seasons_list: + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/episode?season={season["season"]}', + display_id, f'Downloading season {season["season"]} episode entries', metadata) + if not video_type or video_type in 'extras': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/extra?content_id=0', + display_id, 'Downloading extra entries', metadata) + if not video_type or video_type in 'clips': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/clip?content_id=0', + display_id, 'Downloading clip entries', metadata) + + def _real_extract(self, url): + series_id, display_id, video_type = self._match_valid_url(url).group('id', 'display_id', 'type') + if video_type: + self.report_warning( + f'Only {video_type} will be downloaded. 
' + f'To download everything from the series, remove "/{video_type}" from the URL') + + series_meta, meta_paths = self._call_api( + f'https://api.rctiplus.com/api/v1/program/{series_id}/detail', display_id, 'Downloading series metadata') + metadata = { + 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]), + 'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), + 'tags': traverse_obj(series_meta, ('tag', ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), + } + return self.playlist_result( + self._series_entries(series_id, display_id, video_type, metadata), series_id, + series_meta.get('title'), series_meta.get('summary'), display_id=display_id, **metadata) + + +class RCTIPlusTVIE(RCTIPlusBaseIE): + _VALID_URL = r'https?://www\.rctiplus\.com/((tv/(?P<tvname>\w+))|(?P<eventname>live-event|missed-event))' + _TESTS = [{ + 'url': 'https://www.rctiplus.com/tv/rcti', + 'info_dict': { + 'id': 'v_lt1', + 'title': 'RCTI', + 'ext': 'mp4', + 'timestamp': 1546344000, + 'upload_date': '20190101', + }, + 'params': { + 'skip_download': True, + } + }, { + # Returned video will always change + 'url': 'https://www.rctiplus.com/live-event', + 'only_matching': True, + }, { + # Returned video will also always change + 'url': 'https://www.rctiplus.com/missed-event', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if RCTIPlusIE.suitable(url) else super(RCTIPlusTVIE, cls).suitable(url) + + def _real_extract(self, url): + match = self._match_valid_url(url).groupdict() + tv_id = match.get('tvname') or match.get('eventname') + webpage = self._download_webpage(url, tv_id) + video_type, video_id = self._search_regex( + r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', + webpage, 'video link', group=('type', 'id')) + return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus') diff --git a/yt_dlp/extractor/rds.py b/yt_dlp/extractor/rds.py new file mode 100644 index 0000000..1a1c663 --- /dev/null +++ b/yt_dlp/extractor/rds.py @@ -0,0 +1,68 @@ +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + js_to_json, +) +from ..compat import compat_str + + +class RDSIE(InfoExtractor): + _WORKING = False + IE_DESC = 'RDS.ca' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' + + _TESTS = [{ + # has two 9c9media ContentPackages, the web player selects the first ContentPackage + 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', + 'info_dict': { + 'id': '2083309', + 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', + 'ext': 'flv', + 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', + 'description': 'md5:83fa38ecc4a79b19e433433254077f25', + 'timestamp': 1606129030, + 'upload_date': '20201123', + 'duration': 773.039, + } + }, { + 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or 
self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( + [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', + r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], + webpage, 'thumbnail', fatal=False) + timestamp = parse_iso8601(self._search_regex( + r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"', + webpage, 'upload date', fatal=False)) + duration = parse_duration(self._search_regex( + r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"', + webpage, 'duration', fatal=False)) + age_limit = self._family_friendly_search(webpage) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + 'url': '9c9media:rds_web:%s' % video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', + } diff --git a/yt_dlp/extractor/redbee.py b/yt_dlp/extractor/redbee.py new file mode 100644 index 0000000..4d71133 --- /dev/null +++ b/yt_dlp/extractor/redbee.py @@ -0,0 +1,380 @@ +import json +import re +import time +import urllib.parse +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, + traverse_obj, + try_call, + unified_timestamp, +) + + +class RedBeeBaseIE(InfoExtractor): + _DEVICE_ID = str(uuid.uuid4()) + + @property + def _API_URL(self): + """ + Ref: https://apidocs.emp.ebsd.ericsson.net + Subclasses must set _REDBEE_CUSTOMER, _REDBEE_BUSINESS_UNIT + """ + return f'https://exposure.api.redbee.live/v2/customer/{self._REDBEE_CUSTOMER}/businessunit/{self._REDBEE_BUSINESS_UNIT}' + + def _get_bearer_token(self, asset_id, jwt=None): + request = { + 'deviceId': self._DEVICE_ID, + 'device': { + 'deviceId': self._DEVICE_ID, + 'name': 'Mozilla Firefox 102', + 'type': 'WEB', + }, + } + if jwt: + request['jwt'] = jwt + + return self._download_json( + f'{self._API_URL}/auth/{"gigyaLogin" if jwt else "anonymous"}', + asset_id, data=json.dumps(request).encode('utf-8'), headers={ + 'Content-Type': 'application/json;charset=utf-8' + })['sessionToken'] + + def _get_formats_and_subtitles(self, asset_id, **kwargs): + bearer_token = self._get_bearer_token(asset_id, **kwargs) + api_response = self._download_json( + f'{self._API_URL}/entitlement/{asset_id}/play', + asset_id, headers={ + 'Authorization': f'Bearer {bearer_token}', + 'Accept': 'application/json, text/plain, */*' + }) + + formats, subtitles = [], {} + for format in api_response['formats']: + if not format.get('mediaLocator'): + continue + + fmts, subs = [], {} + if format.get('format') == 'DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'SMOOTHSTREAMING': + fmts, subs = self._extract_ism_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'HLS': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + + if format.get('drm'): + for f in fmts: + f['has_drm'] = True + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return formats, subtitles + + +class ParliamentLiveUKIE(RedBeeBaseIE): + IE_NAME = 'parliamentlive.tv' + IE_DESC = 'UK parliament videos' + 
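+    # The UUID in the event URL appears to double as the Red Bee asset id:
+    # formats come straight from the Exposure API via RedBeeBaseIE, and
+    # parliamentlive.tv itself is only queried (non-fatally) for the title,
+    # thumbnail and publish time.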
_VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _REDBEE_CUSTOMER = 'UKParliament' + _REDBEE_BUSINESS_UNIT = 'ParliamentLive' + + _TESTS = [{ + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'info_dict': { + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'timestamp': 1395153872, + 'upload_date': '20140318', + 'thumbnail': r're:https?://[^?#]+c1e9d44d-fd6c-4263-b50f-97ed26cc998b[^/]*/thumbnail', + }, + }, { + 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', + 'only_matching': True, + }, { + 'url': 'https://parliamentlive.tv/Event/Index/27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'info_dict': { + 'id': '27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'ext': 'mp4', + 'title': 'House of Commons', + 'timestamp': 1658392447, + 'upload_date': '20220721', + 'thumbnail': r're:https?://[^?#]+27cf25e4-e77b-42a3-93c5-c815cd6d7377[^/]*/thumbnail', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats, subtitles = self._get_formats_and_subtitles(video_id) + + video_info = self._download_json( + f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id, fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(video_info, ('event', 'title')), + 'thumbnail': traverse_obj(video_info, 'thumbnailUrl'), + 'timestamp': traverse_obj( + video_info, ('event', 'publishedStartTime'), expected_type=unified_timestamp), + '_format_sort_fields': ('res', 'proto'), + } + + +class RTBFIE(RedBeeBaseIE): + _WORKING = False + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _NETRC_MACHINE = 'rtbf' + + _REDBEE_CUSTOMER = 'RTBF' + _REDBEE_BUSINESS_UNIT = 'Auvio' + + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + }, + 'skip': 'No longer available', + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_investigation?id=2921926', + 'md5': 'd5d11bb62169fef38d7ce7ac531e034f', + 'info_dict': { + 'id': '2921926', + 'ext': 'mp4', + 'title': 'Le handicap un confinement perpétuel - Maladie de Lyme', + 'description': 'md5:dcbd5dcf6015488c9069b057c15ccc52', + 'duration': 5258.8, + 'upload_date': '20220727', + 'timestamp': 1658934000, + 'series': '#Investigation', + 'thumbnail': 
r're:^https?://[^?&]+\.jpg$', + }, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_la-belgique-criminelle?id=2920492', + 'md5': '054f9f143bc79c89647c35e5a7d35fa8', + 'info_dict': { + 'id': '2920492', + 'ext': 'mp4', + 'title': '04 - Le crime de la rue Royale', + 'description': 'md5:0c3da1efab286df83f2ab3f8f96bd7a6', + 'duration': 1574.6, + 'upload_date': '20220723', + 'timestamp': 1658596887, + 'series': 'La Belgique criminelle - TV', + 'thumbnail': r're:^https?://[^?&]+\.jpg$', + }, + }] + + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + _LOGIN_URL = 'https://login.rtbf.be/accounts.login' + _GIGYA_API_KEY = '3_kWKuPgcdAybqnqxq_MvHVk0-6PN8Zk8pIIkJM_yXOu-qLPDDsGOtIDFfpGivtbeO' + _LOGIN_COOKIE_ID = f'glt_{_GIGYA_API_KEY}' + + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get(self._LOGIN_COOKIE_ID): + return + + self._set_cookie('.rtbf.be', 'gmid', 'gmid.ver4', secure=True, expire_time=time.time() + 3600) + + login_response = self._download_json( + self._LOGIN_URL, None, data=urllib.parse.urlencode({ + 'loginID': username, + 'password': password, + 'APIKey': self._GIGYA_API_KEY, + 'targetEnv': 'jssdk', + 'sessionExpiration': '-2', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + if login_response['statusCode'] != 200: + raise ExtractorError('Login failed. Server message: %s' % login_response['errorMessage'], expected=True) + + self._set_cookie('.rtbf.be', self._LOGIN_COOKIE_ID, login_response['sessionInfo']['login_token'], + secure=True, expire_time=time.time() + 3600) + + def _get_formats_and_subtitles(self, url, media_id): + login_token = self._get_cookies(url).get(self._LOGIN_COOKIE_ID) + if not login_token: + self.raise_login_required() + + session_jwt = try_call(lambda: self._get_cookies(url)['rtbf_jwt'].value) or self._download_json( + 'https://login.rtbf.be/accounts.getJWT', media_id, query={ + 'login_token': login_token.value, + 'APIKey': self._GIGYA_API_KEY, + 'sdk': 'js_latest', + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': '13273', + 'format': 'json', + })['id_token'] + + return super()._get_formats_and_subtitles(media_id, jwt=session_jwt) + + def _real_extract(self, url): + live, media_id = self._match_valid_url(url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + + media_data = self._html_search_regex(r'data-media="([^"]+)"', embed_page, 'media data', fatal=False) + if not media_data: + if re.search(r'<div[^>]+id="js-error-expired"[^>]+class="(?![^"]*hidden)', embed_page): + raise ExtractorError('Livestream has ended.', expected=True) + if re.search(r'<div[^>]+id="js-sso-connect"[^>]+class="(?![^"]*hidden)', embed_page): + self.raise_login_required() + + raise ExtractorError('Could not find media data') + + data = self._parse_json(media_data, media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = traverse_obj(data, 'subtitle', 'title') + is_live = data.get('isLive') + height_re = r'-(\d+)p\.' 
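+        # height_re picks the vertical resolution out of progressive MP4 URLs
+        # (e.g. a name ending in '-720p.mp4'); it is reused below both to
+        # detect such URLs and to rewrite them once per HLS variant height.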
+ formats, subtitles = [], {} + + # The old api still returns m3u8 and mpd manifest for livestreams, but these are 'fake' + # since all they contain is a 20s video that is completely unrelated. + # https://github.com/yt-dlp/yt-dlp/issues/4656#issuecomment-1214461092 + m3u8_url = None if data.get('isLive') else traverse_obj(data, 'urlHlsAes128', 'urlHls') + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = None if data.get('isLive') else data.get('urlDash') + if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): + fmts, subs = self._extract_mpd_formats_and_subtitles( + mpd_url, media_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + if not formats: + fmts, subs = self._get_formats_and_subtitles(url, f'live_{media_id}' if is_live else media_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + '_format_sort_fields': ('res', 'proto'), + } diff --git a/yt_dlp/extractor/redbulltv.py b/yt_dlp/extractor/redbulltv.py new file mode 100644 index 0000000..d1de249 --- /dev/null +++ b/yt_dlp/extractor/redbulltv.py @@ -0,0 +1,224 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live|(?:film|episode)s)/(?P<id>AP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', + 'info_dict': { + 'id': 'AP-1Q6XCDTAN1W11', + 'ext': 'mp4', + 'title': 'ABC of... WRC - ABC of... 
S1E6', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', + 'info_dict': { + 'id': 'AP-1PMHKJFCW1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + 'duration': 904, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/films/AP-1ZSMAW8FH2111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/episodes/AP-1TQWK7XE11W11', + 'only_matching': True, + }] + + def extract_info(self, video_id): + session = self._download_json( + 'https://api.redbull.tv/v3/session', video_id, + note='Downloading access token', query={ + 'category': 'personal_computer', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + token = session['token'] + + try: + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, + video_id, note='Downloading video information', + headers={'Authorization': token} + ) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: + error_message = self._parse_json( + e.cause.response.read().decode(), video_id)['error'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + title = video['title'].strip() + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) + + subheading = video.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': video.get('long_description') or video.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.extract_info(video_id) + + +class RedBullEmbedIE(RedBullTVIE): # XXX: Do not subclass from concrete IE + _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' + _TESTS = [{ + # HLS manifest accessible only using assetId + 'url': 'https://www.redbull.com/embed/rrn:content:episode-videos:f3021f4f-3ed4-51ac-915a-11987126e405:en-INT', + 'only_matching': True, + }] + _VIDEO_ESSENSE_TMPL = '''... 
on %s { + videoEssence { + attributes + } + }''' + + def _real_extract(self, url): + rrn_id = self._match_id(url) + asset_id = self._download_json( + 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', + rrn_id, headers={ + 'Accept': 'application/json', + 'API-KEY': 'e90a1ff11335423998b100c929ecc866', + }, query={ + 'query': '''{ + resource(id: "%s", enforceGeoBlocking: false) { + %s + %s + } +}''' % (rrn_id, self._VIDEO_ESSENSE_TMPL % 'LiveVideo', self._VIDEO_ESSENSE_TMPL % 'VideoResource'), + })['data']['resource']['videoEssence']['attributes']['assetId'] + return self.extract_info(asset_id) + + +class RedBullTVRrnContentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/tv/(?:video|live|film)/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me', + 'only_matching': True, + }] + + def _real_extract(self, url): + region, lang, rrn_id = self._match_valid_url(url).groups() + rrn_id += ':%s-%s' % (lang, region.upper()) + return self.url_result( + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) + + +class RedBullIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/(?P<type>(?:episode|film|(?:(?:recap|trailer)-)?video)s|live)/(?!AP-|rrn:content:)(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/episodes/grime-hashtags-s02-e04', + 'md5': 'db8271a7200d40053a1809ed0dd574ff', + 'info_dict': { + 'id': 'AA-1MT8DQWA91W14', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + }, + }, { + 'url': 'https://www.redbull.com/int-en/films/kilimanjaro-mountain-of-greatness', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/recap-videos/uci-mountain-bike-world-cup-2017-mens-xco-finals-from-vallnord', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/trailer-videos/kings-of-content', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/videos/tnts-style-red-bull-dance-your-style-s1-e12', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/live/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + # only available on the int-en website so a fallback is need for the API + # https://www.redbull.com/v3/api/graphql/v1/v3/query/en-GB>en-INT?filter[uriSlug]=fia-wrc-saturday-recap-estonia&rb3Schema=v1:hero + 'url': 'https://www.redbull.com/gb-en/live/fia-wrc-saturday-recap-estonia', + 'only_matching': True, + }] + _INT_FALLBACK_LIST = ['de', 'en', 'es', 'fr'] + _LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe'] + + def _real_extract(self, url): + region, lang, filter_type, display_id = self._match_valid_url(url).groups() + if filter_type == 'episodes': + filter_type = 'episode-videos' + elif filter_type == 'live': + filter_type = 'live-videos' + + regions = [region.upper()] + if region != 'int': + if region in 
self._LAT_FALLBACK_MAP: + regions.append('LAT') + if lang in self._INT_FALLBACK_LIST: + regions.append('INT') + locale = '>'.join(['%s-%s' % (lang, reg) for reg in regions]) + + rrn_id = self._download_json( + 'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale, + display_id, query={ + 'filter[type]': filter_type, + 'filter[uriSlug]': display_id, + 'rb3Schema': 'v1:hero', + })['data']['id'] + + return self.url_result( + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py new file mode 100644 index 0000000..62f669f --- /dev/null +++ b/yt_dlp/extractor/reddit.py @@ -0,0 +1,353 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + traverse_obj, + try_get, + unescapeHTML, + urlencode_postdata, + url_or_none, +) + + +class RedditIE(InfoExtractor): + _NETRC_MACHINE = 'reddit' + _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'display_id': '6rrwyj', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'duration': 12, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'videos', + }, + 'params': { + 'skip_download': True, + }, + }, { + # 1080p fallback format + 'url': 'https://www.reddit.com/r/aww/comments/90bu6w/heat_index_was_110_degrees_so_we_offered_him_a/', + 'md5': '8b5902cfda3006bf90faea7adf765a49', + 'info_dict': { + 'id': 'gyh95hiqc0b11', + 'ext': 'mp4', + 'display_id': '90bu6w', + 'title': 'Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1532051078, + 'upload_date': '20180720', + 'uploader': 'FootLoosePickleJuice', + 'duration': 14, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'aww', + }, + }, { + # User post + 'url': 'https://www.reddit.com/user/creepyt0es/comments/nip71r/i_plan_to_make_more_stickers_and_prints_check/', + 'info_dict': { + 'id': 'zasobba6wp071', + 'ext': 'mp4', + 'display_id': 'nip71r', + 'title': 'I plan to make more stickers and prints! Check them out on my Etsy! Or get them through my Patreon. 
Links below.', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:5', + 'timestamp': 1621709093, + 'upload_date': '20210522', + 'uploader': 'creepyt0es', + 'duration': 6, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'u_creepyt0es', + }, + 'params': { + 'skip_download': True, + }, + }, { + # videos embedded in reddit text post + 'url': 'https://www.reddit.com/r/KamenRider/comments/wzqkxp/finale_kamen_rider_revice_episode_50_family_to/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'wzqkxp', + 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', + }, + }, { + # crossposted reddit-hosted media + 'url': 'https://www.reddit.com/r/dumbfuckers_club/comments/zjjw82/cringe/', + 'md5': '746180895c7b75a9d6b05341f507699a', + 'info_dict': { + 'id': 'a1oneun6pa5a1', + 'ext': 'mp4', + 'display_id': 'zjjw82', + 'title': 'Cringe', + 'uploader': 'Otaku-senpai69420', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20221212', + 'timestamp': 1670812309, + 'duration': 16, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'dumbfuckers_club', + }, + }, { + # post link without subreddit + 'url': 'https://www.reddit.com/comments/124pp33', + 'md5': '15eec9d828adcef4468b741a7e45a395', + 'info_dict': { + 'id': 'antsenjc2jqa1', + 'ext': 'mp4', + 'display_id': '124pp33', + 'title': 'Harmless prank of some old friends', + 'uploader': 'Dudezila', + 'channel_id': 'ContagiousLaughter', + 'duration': 17, + 'upload_date': '20230328', + 'timestamp': 1680012043, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + }, { + # quarantined subreddit post + 'url': 'https://old.reddit.com/r/GenZedong/comments/12fujy3/based_hasan/', + 'md5': '3156ea69e3c1f1b6259683c5abd36e71', + 'info_dict': { + 'id': '8bwtclfggpsa1', + 'ext': 'mp4', + 'display_id': '12fujy3', + 'title': 'Based Hasan?', + 'uploader': 'KingNigelXLII', + 'channel_id': 'GenZedong', + 'duration': 16, + 'upload_date': '20230408', + 'timestamp': 1680979138, + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + 'skip': 'Requires account that has opted-in to the GenZedong subreddit', + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, + }, { + 'url': 'https://www.redditmedia.com/r/serbia/comments/pu9wbx/ako_vu%C4%8Di%C4%87_izgubi_izbore_ja_%C4%87u_da_crknem/', + 'only_matching': True, + }] + + def _perform_login(self, username, password): + captcha = self._download_json( + 'https://www.reddit.com/api/requires_captcha/login.json', None, + 'Checking login requirement')['required'] + if captcha: + raise ExtractorError('Reddit is requiring captcha before 
login', expected=True) + login = self._download_json( + f'https://www.reddit.com/api/login/{username}', None, data=urlencode_postdata({ + 'op': 'login-main', + 'user': username, + 'passwd': password, + 'api_type': 'json', + }), note='Logging in', errnote='Login request failed') + errors = '; '.join(traverse_obj(login, ('json', 'errors', ..., 1))) + if errors: + raise ExtractorError(f'Unable to login, Reddit API says {errors}', expected=True) + elif not traverse_obj(login, ('json', 'data', 'cookie', {str})): + raise ExtractorError('Unable to login, no cookie was returned') + + def _real_extract(self, url): + host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') + + data = self._download_json( + f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) + if not data: + fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' + self.to_screen(f'{host} request failed, retrying with {fallback_host}') + data = self._download_json( + f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + + if traverse_obj(data, 'error') == 403: + reason = data.get('reason') + if reason == 'quarantined': + self.raise_login_required('Quarantined subreddit; an account that has opted in is required') + elif reason == 'private': + self.raise_login_required('Private subreddit; an account that has been approved is required') + else: + raise ExtractorError(f'HTTP Error 403 Forbidden; reason given: {reason}') + + data = data[0]['data']['children'][0]['data'] + video_url = data['url'] + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + thumbnails = [] + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'http_headers': {'Accept': '*/*'}, + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) + + info = { + 'title': data.get('title'), + 'thumbnails': thumbnails, + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'channel_id': data.get('subreddit'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } + + parsed_url = urllib.parse.urlparse(video_url) + + # Check for embeds in text posts, or else raise to avoid recursing into the same reddit URL + if 'reddit.com' in parsed_url.netloc and f'/{video_id}/' in parsed_url.path: + entries = [] + for media in traverse_obj(data, ('media_metadata', ...), expected_type=dict): + if not media.get('id') or media.get('e') != 'RedditVideo': + continue + formats = [] + if media.get('hlsUrl'): + formats.extend(self._extract_m3u8_formats( + unescapeHTML(media['hlsUrl']), video_id, 'mp4', m3u8_id='hls', fatal=False)) + if media.get('dashUrl'): + formats.extend(self._extract_mpd_formats( + unescapeHTML(media['dashUrl']), video_id, mpd_id='dash', fatal=False)) + if formats: + entries.append({ + 'id': media['id'], + 'display_id': video_id, + 'formats': formats, + **info, + }) 
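+            # A text post either yields the reddit-hosted clips collected
+            # above as a playlist, or nothing at all; raising here avoids
+            # re-queueing the same reddit URL through the url_transparent
+            # fallback at the bottom of this method.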
+ if entries: + return self.playlist_result(entries, video_id, info.get('title')) + raise ExtractorError('No media found', expected=True) + + # Check if media is hosted on reddit: + reddit_video = traverse_obj(data, ( + (None, ('crosspost_parent_list', ...)), ('secure_media', 'media'), 'reddit_video'), get_all=False) + if reddit_video: + playlist_urls = [ + try_get(reddit_video, lambda x: unescapeHTML(x[y])) + for y in ('dash_url', 'hls_url') + ] + + # Update video_id + display_id = video_id + video_id = self._search_regex( + r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'], + 'video_id', default=display_id) + + dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' + hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' + + formats = [{ + 'url': unescapeHTML(reddit_video['fallback_url']), + 'height': int_or_none(reddit_video.get('height')), + 'width': int_or_none(reddit_video.get('width')), + 'tbr': int_or_none(reddit_video.get('bitrate_kbps')), + 'acodec': 'none', + 'vcodec': 'h264', + 'ext': 'mp4', + 'format_id': 'fallback', + 'format_note': 'DASH video, mp4_dash', + }] + hls_fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + dash_playlist_url, display_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=subtitles) + + return { + **info, + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(reddit_video.get('duration')), + } + + if parsed_url.netloc == 'v.redd.it': + self.raise_no_formats('This video is processing', expected=True, video_id=video_id) + return { + **info, + 'id': parsed_url.path.split('/')[1], + 'display_id': video_id, + } + + # Not hosted on reddit, must continue extraction + return { + **info, + 'display_id': video_id, + '_type': 'url_transparent', + 'url': video_url, + } diff --git a/yt_dlp/extractor/redge.py b/yt_dlp/extractor/redge.py new file mode 100644 index 0000000..875d6f8 --- /dev/null +++ b/yt_dlp/extractor/redge.py @@ -0,0 +1,135 @@ +import functools + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + float_or_none, + int_or_none, + join_nonempty, + parse_qs, + update_url_query, +) +from ..utils.traversal import traverse_obj + + +class RedCDNLivxIE(InfoExtractor): + _VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P<tenant>[^/?#]+)/(?P<id>[^?#]+)\.livx' + IE_NAME = 'redcdnlivx' + + _TESTS = [{ + 'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000', + 'info_dict': { + 'id': 'ENC02-638272860000-638292544000', + 'ext': 'mp4', + 'title': 'ENC02', + 'duration': 19683.982, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000', + 'info_dict': { + 'id': 'ENC18-722333096000-722335562000', + 'ext': 'mp4', + 'title': 'ENC18', + 'duration': 2463.995, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000', + 'info_dict': { + 'id': 'triathlon2018-warsaw-550305000000-550327620000', + 'ext': 
'mp4', + 'title': 'triathlon2018/warsaw', + 'duration': 22619.98, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000', + 'only_matching': True, + }, { + 'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000', + 'only_matching': True, + }] + + """ + Known methods (first in url path): + - `livedash` - DASH MPD + - `livehls` - HTTP Live Streaming + - `livess` - IIS Smooth Streaming + - `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac + - `sc` - shoutcast/icecast (audio streams, like radio) + """ + + def _real_extract(self, url): + tenant, path = self._match_valid_url(url).group('tenant', 'id') + qs = parse_qs(url) + start_time = traverse_obj(qs, ('startTime', 0, {int_or_none})) + stop_time = traverse_obj(qs, ('stopTime', 0, {int_or_none})) + + def livx_mode(mode): + suffix = '' + if mode == 'livess': + suffix = '/manifest' + elif mode == 'livehls': + suffix = '/playlist.m3u8' + file_qs = {} + if start_time: + file_qs['startTime'] = start_time + if stop_time: + file_qs['stopTime'] = stop_time + if mode == 'nvr': + file_qs['nolimit'] = 1 + elif mode != 'sc': + file_qs['indexMode'] = 'true' + return update_url_query(f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}', file_qs) + + # no id or title for a transmission. making ones up. + title = path \ + .replace('/live', '').replace('live/', '') \ + .replace('/channel', '').replace('channel/', '') \ + .strip('/') + video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time) + + formats = [] + # downloading the manifest separately here instead of _extract_ism_formats to also get some stream metadata + ism_res = self._download_xml_handle( + livx_mode('livess'), video_id, + note='Downloading ISM manifest', + errnote='Failed to download ISM manifest', + fatal=False) + ism_doc = None + if ism_res is not False: + ism_doc, ism_urlh = ism_res + formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss') + + nvr_urlh = self._request_webpage( + HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False, + expected_status=lambda _: True) + if nvr_urlh and nvr_urlh.status == 200: + formats.append({ + 'url': nvr_urlh.url, + 'ext': 'flv', + 'format_id': 'direct-0', + 'preference': -1, # might be slow + }) + formats.extend(self._extract_mpd_formats(livx_mode('livedash'), video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False)) + + time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000 + duration = traverse_obj( + ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None + + live_status = None + if traverse_obj(ism_doc, '@IsLive') == 'TRUE': + live_status = 'is_live' + elif duration: + live_status = 'was_live' + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration, + 'live_status': live_status, + } diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py new file mode 100644 index 0000000..f945320 --- /dev/null +++ b/yt_dlp/extractor/redgifs.py @@ -0,0 +1,260 @@ +import functools + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + try_get, + OnDemandPagedList, +) + + +class 
RedGifsBaseInfoExtractor(InfoExtractor):
+    _FORMATS = {
+        'gif': 250,
+        'sd': 480,
+        'hd': None,
+    }
+
+    _API_HEADERS = {
+        'referer': 'https://www.redgifs.com/',
+        'origin': 'https://www.redgifs.com',
+        'content-type': 'application/json',
+    }
+
+    def _parse_gif_data(self, gif_data):
+        video_id = gif_data.get('id')
+        quality = qualities(tuple(self._FORMATS.keys()))
+
+        orig_height = int_or_none(gif_data.get('height'))
+        # width/height ratio of the source media, so each format's width
+        # is height * aspect_ratio
+        aspect_ratio = try_get(gif_data, lambda x: x['width'] / orig_height if orig_height else None)
+
+        formats = []
+        for format_id, height in self._FORMATS.items():
+            video_url = gif_data['urls'].get(format_id)
+            if not video_url:
+                continue
+            height = min(orig_height, height or orig_height)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': height * aspect_ratio if aspect_ratio else None,
+                'height': height,
+                'quality': quality(format_id),
+            })
+
+        return {
+            'id': video_id,
+            'webpage_url': f'https://redgifs.com/watch/{video_id}',
+            'extractor_key': RedGifsIE.ie_key(),
+            'extractor': 'RedGifs',
+            'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs',
+            'timestamp': int_or_none(gif_data.get('createDate')),
+            'uploader': gif_data.get('userName'),
+            'duration': int_or_none(gif_data.get('duration')),
+            'view_count': int_or_none(gif_data.get('views')),
+            'like_count': int_or_none(gif_data.get('likes')),
+            'categories': gif_data.get('tags') or [],
+            'tags': gif_data.get('tags'),
+            'age_limit': 18,
+            'formats': formats,
+        }
+
+    def _fetch_oauth_token(self, video_id):
+        # https://github.com/Redgifs/api/wiki/Temporary-tokens
+        auth = self._download_json('https://api.redgifs.com/v2/auth/temporary',
+                                   video_id, note='Fetching temporary token')
+        if not auth.get('token'):
+            raise ExtractorError('Unable to get temporary token')
+        self._API_HEADERS['authorization'] = f'Bearer {auth["token"]}'
+
+    def _call_api(self, ep, video_id, *args, **kwargs):
+        for first_attempt in True, False:
+            if 'authorization' not in self._API_HEADERS:
+                self._fetch_oauth_token(video_id)
+            try:
+                headers = dict(self._API_HEADERS)
+                headers['x-customheader'] = f'https://www.redgifs.com/watch/{video_id}'
+                data = self._download_json(
+                    f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs)
+                break
+            except ExtractorError as e:
+                if first_attempt and isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                    del self._API_HEADERS['authorization']  # refresh the token
+                    continue
+                raise
+
+        if 'error' in data:
+            raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id)
+        return data
+
+    def _fetch_page(self, ep, video_id, query, page):
+        query['page'] = page + 1
+        data = self._call_api(
+            ep, video_id, query=query, note=f'Downloading JSON metadata page {page + 1}')
+
+        for entry in data['gifs']:
+            yield self._parse_gif_data(entry)
+
+    def _prepare_api_query(self, query, fields):
+        api_query = [
+            (field_name, query.get(field_name, (default,))[0])
+            for field_name, default in fields.items()]
+
+        return {key: val for key, val in api_query if val is not None}
+
+    def _paged_entries(self, ep, item_id, query, fields):
+        page = int_or_none(query.get('page', (None,))[0])
+        page_fetcher = functools.partial(
+            self._fetch_page, ep, item_id, self._prepare_api_query(query, fields))
+        return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
+
+
+class RedGifsIE(RedGifsBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/watch/|thumbs2\.redgifs\.com/)(?P<id>[^-/?#\.]+)'
+    _TESTS = [{
+        'url': 
'https://www.redgifs.com/watch/squeakyhelplesswisent', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + 'tags': list, + } + }, { + 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + 'tags': list, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url).lower() + video_info = self._call_api( + f'gifs/{video_id}?views=yes', video_id, note='Downloading video info') + return self._parse_gif_data(video_info['gif']) + + +class RedGifsSearchIE(RedGifsBaseInfoExtractor): + IE_DESC = 'Redgifs search' + _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P<query>[^#]+)' + _PAGE_SIZE = 80 + _TESTS = [ + { + 'url': 'https://www.redgifs.com/browse?tags=Lesbian', + 'info_dict': { + 'id': 'tags=Lesbian', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by trending' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian', + 'info_dict': { + 'id': 'type=g&order=latest&tags=Lesbian', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by latest' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian&page=2', + 'info_dict': { + 'id': 'type=g&order=latest&tags=Lesbian&page=2', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by latest' + }, + 'playlist_count': 80, + } + ] + + def _real_extract(self, url): + query_str = self._match_valid_url(url).group('query') + query = compat_parse_qs(query_str) + if not query.get('tags'): + raise ExtractorError('Invalid query tags', expected=True) + + tags = query.get('tags')[0] + order = query.get('order', ('trending',))[0] + + query['search_text'] = [tags] + entries = self._paged_entries('gifs/search', query_str, query, { + 'search_text': None, + 'order': 'trending', + 'type': None, + }) + + return self.playlist_result( + entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}') + + +class RedGifsUserIE(RedGifsBaseInfoExtractor): + IE_DESC = 'Redgifs user' + _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?' 
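+    # A minimal sketch of how the shared pager in RedGifsBaseInfoExtractor is
+    # driven from here (illustrative only, not executed): with an explicit
+    # `page` in the query string only that single page is yielded; otherwise
+    # all pages are fetched lazily via OnDemandPagedList in steps of _PAGE_SIZE:
+    #   query = compat_parse_qs('order=best&page=3')  # {'order': ['best'], 'page': ['3']}
+    #   self._paged_entries(f'users/{username}/search', playlist_id, query,
+    #                       {'order': 'recent', 'type': None})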
+ _PAGE_SIZE = 30 + _TESTS = [ + { + 'url': 'https://www.redgifs.com/users/lamsinka89', + 'info_dict': { + 'id': 'lamsinka89', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by recent' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/users/lamsinka89?page=3', + 'info_dict': { + 'id': 'lamsinka89?page=3', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by recent' + }, + 'playlist_count': 30, + }, + { + 'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g', + 'info_dict': { + 'id': 'lamsinka89?order=best&type=g', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by best' + }, + 'playlist_mincount': 100, + } + ] + + def _real_extract(self, url): + username, query_str = self._match_valid_url(url).group('username', 'query') + playlist_id = f'{username}?{query_str}' if query_str else username + + query = compat_parse_qs(query_str) + order = query.get('order', ('recent',))[0] + + entries = self._paged_entries(f'users/{username}/search', playlist_id, query, { + 'order': 'recent', + 'type': None, + }) + + return self.playlist_result( + entries, playlist_id, username, f'RedGifs user {username}, ordered by {order}') diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py new file mode 100644 index 0000000..965abbe --- /dev/null +++ b/yt_dlp/extractor/redtube.py @@ -0,0 +1,144 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + str_to_int, + unified_strdate, + url_or_none, + urljoin, +) + + +class RedTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com(?:\.br)?/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)'] + _TESTS = [{ + 'url': 'https://www.redtube.com/38864951', + 'md5': '4fba70cbca3aefd25767ab4b523c9878', + 'info_dict': { + 'id': '38864951', + 'ext': 'mp4', + 'title': 'Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu', + 'description': 'Watch video Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu on Redtube, home of free Blowjob porn videos and Blonde sex movies online. 
Video length: (10:46) - Uploaded by leolulu - Verified User - Starring Pornstar: Leolulu', + 'upload_date': '20210111', + 'timestamp': 1610343109, + 'duration': 646, + 'view_count': int, + 'age_limit': 18, + 'thumbnail': r're:https://\wi-ph\.rdtcdn\.com/videos/.+/.+\.jpg', + }, + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }, { + 'url': 'http://it.redtube.com/66418', + 'only_matching': True, + }, { + 'url': 'https://www.redtube.com.br/103224331', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + f'https://www.redtube.com/{video_id}', video_id) + + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), + (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + medias = self._parse_json( + self._search_regex( + r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + for media in medias if isinstance(medias, list) else []: + format_url = urljoin('https://www.redtube.com', media.get('videoUrl')) + if not format_url: + continue + format_id = media.get('format') + quality = media.get('quality') + if format_id == 'hls' or (format_id == 'mp4' and not quality): + more_media = self._download_json(format_url, video_id, fatal=False) + else: + more_media = [media] + for media in more_media if isinstance(more_media, list) else []: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + format_id = media.get('format') + if format_id == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id or 'hls', + fatal=False)) + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url, 'ext': 'mp4'}) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', + webpage, 'upload date', default=None)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + view_count = 
str_to_int(self._search_regex( + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', + r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), + webpage, 'view count', default=None)) + + # No self-labeling, but they describe themselves as + # "Home of Videos Porno" + age_limit = 18 + + return merge_dicts(info, { + 'id': video_id, + 'ext': 'mp4', + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'age_limit': age_limit, + 'formats': formats, + }) diff --git a/yt_dlp/extractor/rentv.py b/yt_dlp/extractor/rentv.py new file mode 100644 index 0000000..abb537c --- /dev/null +++ b/yt_dlp/extractor/rentv.py @@ -0,0 +1,104 @@ +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + url_or_none, +) + + +class RENTVIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://ren.tv/video/epizod/118577', + 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': '118577', + 'ext': 'mp4', + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', + } + }, { + 'url': 'http://ren.tv/player/118577', + 'only_matching': True, + }, { + 'url': 'rentv:118577', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] + formats = [] + for video in config['src']: + src = url_or_none(video.get('src')) + if not src: + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), + 'formats': formats, + } + + +class RENTVArticleIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', + 'md5': 'ebd63c4680b167693745ab91343df1d6', + 'info_dict': { + 'id': '136472', + 'ext': 'mp4', + 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', + 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', + } + }, { + # TODO: invalid m3u8 + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. 
ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + entries = [] + for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): + media_id = config_profile.get('mediaid') + if not media_id: + continue + media_id = compat_str(media_id) + entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) + return self.playlist_result(entries, display_id) diff --git a/yt_dlp/extractor/restudy.py b/yt_dlp/extractor/restudy.py new file mode 100644 index 0000000..f49262a --- /dev/null +++ b/yt_dlp/extractor/restudy.py @@ -0,0 +1,41 @@ +from .common import InfoExtractor + + +class RestudyIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.restudy.dk/video/play/id/1637', + 'info_dict': { + 'id': '1637', + 'ext': 'flv', + 'title': 'Leiden-frosteffekt', + 'description': 'Denne video er et eksperiment med flydende kvælstof.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + + formats = self._extract_smil_formats( + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, + video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } diff --git a/yt_dlp/extractor/reuters.py b/yt_dlp/extractor/reuters.py new file mode 100644 index 0000000..0a8f13b --- /dev/null +++ b/yt_dlp/extractor/reuters.py @@ -0,0 +1,66 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + int_or_none, + unescapeHTML, +) + + +class ReutersIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', + 'md5': '8015113643a0b12838f160b0b81cc2ee', + 'info_dict': { + 'id': '368575562', + 'ext': 'mp4', + 'title': 'San Francisco police chief resigns', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) + video_data = js_to_json(self._search_regex( + r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', + webpage, 'video data')) + + def get_json_value(key, fatal=False): + return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + + title = unescapeHTML(get_json_value('title', fatal=True)) + mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + + mas_data = self._download_json( + 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), + video_id, transform_source=js_to_json) + formats = [] + for f in mas_data: + f_url = f.get('url') + if not f_url: + continue + method = f.get('method') + if method == 
'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + container = f.get('container') + ext = '3gp' if method == 'mobile' else container + formats.append({ + 'format_id': ext, + 'url': f_url, + 'ext': ext, + 'container': container if method != 'mobile' else None, + }) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': get_json_value('thumb'), + 'duration': int_or_none(get_json_value('seconds')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py new file mode 100644 index 0000000..06b6c3c --- /dev/null +++ b/yt_dlp/extractor/reverbnation.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import ( + qualities, + str_or_none, +) + + +class ReverbNationIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _TESTS = [{ + 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', + 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', + 'info_dict': { + 'id': '16965047', + 'ext': 'mp3', + 'title': 'MONA LISA', + 'uploader': 'ALKILADOS', + 'uploader_id': '216429', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + + api_res = self._download_json( + 'https://api.reverbnation.com/song/%s' % song_id, + song_id, + note='Downloading information of song %s' % song_id + ) + + THUMBNAILS = ('thumbnail', 'image') + quality = qualities(THUMBNAILS) + thumbnails = [] + for thumb_key in THUMBNAILS: + if api_res.get(thumb_key): + thumbnails.append({ + 'url': api_res[thumb_key], + 'preference': quality(thumb_key) + }) + + return { + 'id': song_id, + 'title': api_res['name'], + 'url': api_res['url'], + 'uploader': api_res.get('artist', {}).get('name'), + 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), + 'thumbnails': thumbnails, + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/yt_dlp/extractor/rheinmaintv.py b/yt_dlp/extractor/rheinmaintv.py new file mode 100644 index 0000000..c3b352d --- /dev/null +++ b/yt_dlp/extractor/rheinmaintv.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, merge_dicts, remove_end + + +class RheinMainTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)' + _TESTS = [{ + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/', + 'info_dict': { + 'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022', + 'ext': 'ismv', # ismv+isma will be merged into mp4 + 'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'upload_date': '20221108', + 'view_count': int, + 'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft', + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9', + 'timestamp': 1667933057, + 'duration': 243.0, + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'timestamp': 1668526214, + 'display_id': 
'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften', + 'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'view_count': int, + 'thumbnail': r're:^https://.+\.jpg', + 'duration': 345.0, + 'description': 'md5:9370ba29526984006c2cba1372e5c5a0', + 'upload_date': '20221115', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'view_count': int, + 'timestamp': 1668527402, + 'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'upload_date': '20221115', + 'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften', + 'duration': 348.0, + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('display_id') + video_id = mobj.group('video_id').replace('/', '-') + webpage = self._download_webpage(url, video_id) + + source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)', + webpage, 'video', group=('source', 'img')) + source = extract_attributes(source) + img = extract_attributes(img) + + raw_json_ld = list(self._yield_json_ld(webpage, video_id)) + json_ld = self._json_ld(raw_json_ld, video_id) + json_ld.pop('url', None) + + ism_manifest_url = ( + source.get('src') + or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject') + ) + formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'title': + self._html_search_regex(r'<h1><span class="title">([^<]*)</span>', + webpage, 'headline', default=None) + or img.get('title') or json_ld.get('title') or self._og_search_title(webpage) + or remove_end(self._html_extract_title(webpage), ' -'), + 'alt_title': img.get('alt'), + 'description': json_ld.get('description') or self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'), + }, json_ld) diff --git a/yt_dlp/extractor/ridehome.py b/yt_dlp/extractor/ridehome.py new file mode 100644 index 0000000..78f838a --- /dev/null +++ b/yt_dlp/extractor/ridehome.py @@ -0,0 +1,96 @@ +from .art19 import Art19IE +from .common import InfoExtractor +from ..utils import extract_attributes, get_elements_html_by_class +from ..utils.traversal import traverse_obj + + +class RideHomeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ridehome\.info/show/[\w-]+/(?P<id>[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.ridehome.info/show/techmeme-ride-home/thu-1228-will-2024-be-the-year-apple-gets-serious-about-gaming-on-macs/', + 'info_dict': { + 'id': 'thu-1228-will-2024-be-the-year-apple-gets-serious-about-gaming-on-macs', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'c84ea3cc96950a9ab86fe540f3edc588', + 'info_dict': { + 'id': '540e5493-9fe6-4c14-a488-dc508d8794b2', + 'ext': 'mp3', + 'title': 'Thu. 
12/28 – Will 2024 Be The Year Apple Gets Serious About Gaming On Macs?', + 'description': 'md5:9dba86ae9b5047a8150eceddeeb629c2', + 'series': 'Techmeme Ride Home', + 'series_id': '3c30e8f4-ab48-415b-9421-1ae06cd4058b', + 'upload_date': '20231228', + 'timestamp': 1703780995, + 'modified_date': '20231230', + 'episode_id': '540e5493-9fe6-4c14-a488-dc508d8794b2', + 'modified_timestamp': 1703912404, + 'release_date': '20231228', + 'release_timestamp': 1703782800, + 'duration': 1000.1502, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$', + }, + }], + }, { + 'url': 'https://www.ridehome.info/show/techmeme-ride-home/portfolio-profile-sensel-with-ilyarosenberg/', + 'info_dict': { + 'id': 'portfolio-profile-sensel-with-ilyarosenberg', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'bf9d6efad221008ce71aea09d5533cf6', + 'info_dict': { + 'id': '6beed803-b1ef-4536-9fef-c23cf6b4dcac', + 'ext': 'mp3', + 'title': '(Portfolio Profile) Sensel - With @IlyaRosenberg', + 'description': 'md5:e1e4a970bce04290e0ba6f030b0125db', + 'series': 'Techmeme Ride Home', + 'series_id': '3c30e8f4-ab48-415b-9421-1ae06cd4058b', + 'upload_date': '20220108', + 'timestamp': 1641656064, + 'modified_date': '20230418', + 'episode_id': '6beed803-b1ef-4536-9fef-c23cf6b4dcac', + 'modified_timestamp': 1681843318, + 'release_date': '20220108', + 'release_timestamp': 1641672000, + 'duration': 2789.38122, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$' + }, + }], + }, { + 'url': 'https://www.ridehome.info/show/spacecasts/big-tech-news-apples-macbook-pro-event/', + 'info_dict': { + 'id': 'big-tech-news-apples-macbook-pro-event', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'b1428530c6e03904a8271e978007fc05', + 'info_dict': { + 'id': 'f4780044-6c4b-4ce0-8215-8a86cc66bff7', + 'ext': 'mp3', + 'title': 'md5:e6c05d44d59b6577a4145ac339de5040', + 'description': 'md5:14152f7228c8a301a77e3d6bc891b145', + 'series': 'SpaceCasts', + 'series_id': '8e3e837d-7fe0-4a23-8e11-894917e07e17', + 'upload_date': '20211026', + 'timestamp': 1635271450, + 'modified_date': '20230502', + 'episode_id': 'f4780044-6c4b-4ce0-8215-8a86cc66bff7', + 'modified_timestamp': 1683057500, + 'release_date': '20211026', + 'release_timestamp': 1635272124, + 'duration': 2266.30531, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com/images/.*\.jpeg$' + }, + }], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + urls = traverse_obj( + get_elements_html_by_class('iframeContainer', webpage), + (..., {extract_attributes}, lambda k, v: k == 'data-src' and Art19IE.suitable(v))) + return self.playlist_from_matches(urls, article_id, ie=Art19IE) diff --git a/yt_dlp/extractor/rinsefm.py b/yt_dlp/extractor/rinsefm.py new file mode 100644 index 0000000..f87b895 --- /dev/null +++ b/yt_dlp/extractor/rinsefm.py @@ -0,0 +1,89 @@ +from .common import InfoExtractor +from ..utils import ( + MEDIA_EXTENSIONS, + determine_ext, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class RinseFMBaseIE(InfoExtractor): + @staticmethod + def _parse_entry(entry): + return { + **traverse_obj(entry, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'url': ('fileUrl', {url_or_none}), + 'release_timestamp': ('episodeDate', {parse_iso8601}), + 'thumbnail': ('featuredImage', 0, 'filename', {str}, + {lambda x: x and f'https://rinse.imgix.net/media/{x}'}), + 'webpage_url': ('slug', {str}, + {lambda x: x and 
f'https://rinse.fm/episodes/{x}'}), + }), + 'vcodec': 'none', + 'extractor_key': RinseFMIE.ie_key(), + 'extractor': RinseFMIE.IE_NAME, + } + + +class RinseFMIE(RinseFMBaseIE): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/', + 'md5': '76ee0b719315617df42e15e710f46c7b', + 'info_dict': { + 'id': '1536535', + 'ext': 'mp3', + 'title': 'Club Glow - 15/12/2023 - 20:00', + 'thumbnail': r're:^https://.+\.(?:jpg|JPG)$', + 'release_timestamp': 1702598400, + 'release_date': '20231215' + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry'] + + return self._parse_entry(entry) + + +class RinseFMArtistPlaylistIE(RinseFMBaseIE): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/shows/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/shows/resources/', + 'info_dict': { + 'id': 'resources', + 'title': '[re]sources', + 'description': '[re]sources est un label parisien piloté par le DJ et producteur Tommy Kid.' + }, + 'playlist_mincount': 40 + }, { + 'url': 'https://rinse.fm/shows/ivy/', + 'info_dict': { + 'id': 'ivy', + 'title': '[IVY]', + 'description': 'A dedicated space for DNB/Turbo House and 4x4.' + }, + 'playlist_mincount': 7 + }] + + def _entries(self, data): + for episode in traverse_obj(data, ( + 'props', 'pageProps', 'episodes', lambda _, v: determine_ext(v['fileUrl']) in MEDIA_EXTENSIONS.audio) + ): + yield self._parse_entry(episode) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._og_search_title(webpage) or self._html_search_meta('title', webpage) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage) + data = self._search_nextjs_data(webpage, playlist_id) + + return self.playlist_result( + self._entries(data), playlist_id, title, description=description) diff --git a/yt_dlp/extractor/rmcdecouverte.py b/yt_dlp/extractor/rmcdecouverte.py new file mode 100644 index 0000000..8d29b30 --- /dev/null +++ b/yt_dlp/extractor/rmcdecouverte.py @@ -0,0 +1,71 @@ +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import smuggle_url + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:[^?#]*_(?P<id>\d+)|mediaplayer-direct)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/vestiges-de-guerre_22240/les-bunkers-secrets-domaha-beach_25303/', + 'info_dict': { + 'id': '6250879771001', + 'ext': 'mp4', + 'title': 'LES BUNKERS SECRETS D´OMAHA BEACH', + 'uploader_id': '1969646226001', + 'description': 'md5:aed573ca24abde62a148e0eba909657d', + 'timestamp': 1619622984, + 'upload_date': '20210428', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', + 'info_dict': { + 'id': '5983675500001', + 'ext': 'mp4', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', + 'uploader_id': '1969646226001', + 'upload_date': '20181226', + 'timestamp': 1545861635, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'only available for a week', + }, { + 'url': 'https://rmcdecouverte.bfmtv.com/avions-furtifs-la-technologie-de-lextreme_10598', 
+ 'only_matching': True, + }, { + # The website accepts any URL as long as it has _\d+ at the end + 'url': 'https://rmcdecouverte.bfmtv.com/any/thing/can/go/here/_10598', + 'only_matching': True, + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') or 'direct' + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff --git a/yt_dlp/extractor/rockstargames.py b/yt_dlp/extractor/rockstargames.py new file mode 100644 index 0000000..1662243 --- /dev/null +++ b/yt_dlp/extractor/rockstargames.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class RockstarGamesIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.rockstargames.com/videos/video/11544/', + 'md5': '03b5caa6e357a4bd50e3143fc03e5733', + 'info_dict': { + 'id': '11544', + 'ext': 'mp4', + 'title': 'Further Adventures in Finance and Felony Trailer', + 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1464876000, + 'upload_date': '20160602', + } + }, { + 'url': 'http://www.rockstargames.com/videos#/?video=48', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.rockstargames.com/videoplayer/videos/get-video.json', + video_id, query={ + 'id': video_id, + 'locale': 'en_us', + })['video'] + + title = video['title'] + + formats = [] + for v in video['files_processed']['video/mp4']: + if not v.get('src'): + continue + resolution = v.get('resolution') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', resolution or '', 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(v['src']), + 'format_id': resolution, + 'height': height, + }) + + if not formats: + youtube_id = video.get('youtube_id') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('screencap')), + 'timestamp': parse_iso8601(video.get('created')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py new file mode 100644 index 0000000..5099f3a --- /dev/null +++ b/yt_dlp/extractor/rokfin.py @@ -0,0 +1,455 @@ +import itertools +import json +import re +import urllib.parse +from datetime import datetime + +from .common import InfoExtractor, SearchInfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + format_field, + int_or_none, + str_or_none, + traverse_obj, + try_get, + unescapeHTML, + unified_timestamp, + url_or_none, + 
urlencode_postdata, +) + +_API_BASE_URL = 'https://prod-api-v2.production.rokfin.com/api/v2/public/' + + +class RokfinIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?P<id>(?P<type>post|stream)/\d+)' + _NETRC_MACHINE = 'rokfin' + _AUTH_BASE = 'https://secure.rokfin.com/auth/realms/rokfin-web/protocol/openid-connect' + _access_mgmt_tokens = {} # OAuth 2.0: RFC 6749, Sec. 1.4-5 + _TESTS = [{ + 'url': 'https://www.rokfin.com/post/57548/Mitt-Romneys-Crazy-Solution-To-Climate-Change', + 'info_dict': { + 'id': 'post/57548', + 'ext': 'mp4', + 'title': 'Mitt Romney\'s Crazy Solution To Climate Change', + 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', + 'upload_date': '20211023', + 'timestamp': 1634998029, + 'channel': 'Jimmy Dore', + 'channel_id': '65429', + 'channel_url': 'https://rokfin.com/TheJimmyDoreShow', + 'availability': 'public', + 'live_status': 'not_live', + 'dislike_count': int, + 'like_count': int, + 'duration': 213, + } + }, { + 'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time', + 'info_dict': { + 'id': 'post/223', + 'ext': 'mp4', + 'title': 'Julian Assange Arrested: Streaming In Real Time', + 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', + 'upload_date': '20190412', + 'timestamp': 1555052644, + 'channel': 'Ron Placone', + 'channel_id': '10', + 'channel_url': 'https://rokfin.com/RonPlacone', + 'availability': 'public', + 'live_status': 'not_live', + 'dislike_count': int, + 'like_count': int, + 'tags': ['FreeThinkingMedia^', 'RealProgressives^'], + } + }, { + 'url': 'https://www.rokfin.com/stream/10543/Its-A-Crazy-Mess-Regional-Director-Blows-Whistle-On-Pfizers-Vaccine-Trial-Data', + 'info_dict': { + 'id': 'stream/10543', + 'ext': 'mp4', + 'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data', + 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', + 'description': 'md5:324ce2d3e3b62e659506409e458b9d8e', + 'channel': 'TLAVagabond', + 'channel_id': '53856', + 'channel_url': 'https://rokfin.com/TLAVagabond', + 'availability': 'public', + 'is_live': False, + 'was_live': True, + 'live_status': 'was_live', + 'timestamp': 1635874720, + 'release_timestamp': 1635874720, + 'release_date': '20211102', + 'upload_date': '20211102', + 'dislike_count': int, + 'like_count': int, + 'tags': ['FreeThinkingMedia^'], + } + }, { + 'url': 'https://rokfin.com/post/126703/Brave-New-World--Aldous-Huxley-DEEPDIVE--Chpts-13--Quite-Frankly--Jay-Dyer', + 'info_dict': { + 'id': 'post/126703', + 'ext': 'mp4', + 'title': 'Brave New World - Aldous Huxley DEEPDIVE! 
(Chpts 1-3) - Quite Frankly & Jay Dyer', + 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', + 'channel': 'Jay Dyer', + 'channel_id': '186881', + 'channel_url': 'https://rokfin.com/jaydyer', + 'availability': 'premium_only', + 'live_status': 'not_live', + 'dislike_count': int, + 'like_count': int, + 'timestamp': 1678213357, + 'upload_date': '20230307', + 'tags': ['FreeThinkingMedia^', 'OpenMind^'], + 'description': 'md5:cb04e32e68326c9b2b251b297bacff35', + 'duration': 3100, + } + }, { + 'url': 'https://rokfin.com/stream/31332/The-Grayzone-live-on-Nordstream-blame-game', + 'info_dict': { + 'id': 'stream/31332', + 'ext': 'mp4', + 'title': 'The Grayzone live on Nordstream blame game', + 'thumbnail': r're:https://image\.v\.rokfin\.com/.+', + 'channel': 'Max Blumenthal', + 'channel_id': '248902', + 'channel_url': 'https://rokfin.com/MaxBlumenthal', + 'availability': 'premium_only', + 'live_status': 'was_live', + 'dislike_count': int, + 'like_count': int, + 'timestamp': 1678475166, + 'release_timestamp': 1678475166.0, + 'release_date': '20230310', + 'upload_date': '20230310', + 'tags': ['FreeThinkingMedia^'], + } + }] + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + metadata = self._download_json_using_access_token(f'{_API_BASE_URL}{video_id}', video_id) + + scheduled = unified_timestamp(metadata.get('scheduledAt')) + live_status = ('was_live' if metadata.get('stoppedAt') + else 'is_upcoming' if scheduled + else 'is_live' if video_type == 'stream' + else 'not_live') + + video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none) + if video_url in (None, 'fake.m3u8'): + video_url = format_field(self._search_regex( + r'https?://[^/]+/([^/]+)/storyboard.vtt', + traverse_obj(metadata, 'timelineUrl', ('content', 'timelineUrl'), expected_type=url_or_none), + video_id, default=None), None, 'https://stream.v.rokfin.com/%s.m3u8') + + formats, subtitles = [{'url': video_url}] if video_url else [], {} + if determine_ext(video_url) == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, fatal=False, live=live_status == 'is_live') + + if not formats: + if traverse_obj(metadata, 'premiumPlan', 'premium'): + self.raise_login_required('This video is only available to premium users', True, method='cookies') + elif scheduled: + self.raise_no_formats( + f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}', + video_id=video_id, expected=True) + + uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username')) + timestamp = (scheduled or float_or_none(metadata.get('postedAtMilli'), 1000) + or unified_timestamp(metadata.get('creationDateTime'))) + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': str_or_none(traverse_obj(metadata, 'title', ('content', 'contentTitle'))), + 'duration': float_or_none(traverse_obj(metadata, ('content', 'duration'))), + 'thumbnail': url_or_none(traverse_obj(metadata, 'thumbnail', ('content', 'thumbnailUrl1'))), + 'description': str_or_none(traverse_obj(metadata, 'description', ('content', 'contentDescription'))), + 'like_count': int_or_none(metadata.get('likeCount')), + 'dislike_count': int_or_none(metadata.get('dislikeCount')), + 'channel': str_or_none(traverse_obj(metadata, ('createdBy', 'name'), ('creator', 'name'))), + 'channel_id': str_or_none(traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id'))), + 'channel_url': 
url_or_none(f'https://rokfin.com/{uploader}') if uploader else None, + 'timestamp': timestamp, + 'release_timestamp': timestamp if live_status != 'not_live' else None, + 'tags': traverse_obj(metadata, ('tags', ..., 'title'), expected_type=str_or_none), + 'live_status': live_status, + 'availability': self._availability( + needs_premium=bool(traverse_obj(metadata, 'premiumPlan', 'premium')), + is_private=False, needs_subscription=False, needs_auth=False, is_unlisted=False), + # 'comment_count': metadata.get('numComments'), # Data provided by website is wrong + '__post_extractor': self.extract_comments(video_id) if video_type == 'post' else None, + } + + def _get_comments(self, video_id): + pages_total = None + for page_n in itertools.count(): + raw_comments = self._download_json( + f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50', + video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, None, " of %s")}', + fatal=False) or {} + + for comment in raw_comments.get('content') or []: + yield { + 'text': str_or_none(comment.get('comment')), + 'author': str_or_none(comment.get('name')), + 'id': comment.get('commentId'), + 'author_id': comment.get('userId'), + 'parent': 'root', + 'like_count': int_or_none(comment.get('numLikes')), + 'dislike_count': int_or_none(comment.get('numDislikes')), + 'timestamp': unified_timestamp(comment.get('postedAt')) + } + + pages_total = int_or_none(raw_comments.get('totalPages')) or None + is_last = raw_comments.get('last') + if not raw_comments.get('content') or is_last or (page_n > pages_total if pages_total else is_last is not False): + return + + def _perform_login(self, username, password): + # https://openid.net/specs/openid-connect-core-1_0.html#CodeFlowAuth (Sec. 3.1) + login_page = self._download_webpage( + f'{self._AUTH_BASE}/auth?client_id=web&redirect_uri=https%3A%2F%2Frokfin.com%2Ffeed&response_mode=fragment&response_type=code&scope=openid', + None, note='loading login page', errnote='error loading login page') + authentication_point_url = unescapeHTML(self._search_regex( + r'<form\s+[^>]+action\s*=\s*"(https://secure\.rokfin\.com/auth/realms/rokfin-web/login-actions/authenticate\?[^"]+)"', + login_page, name='Authentication URL')) + + resp_body = self._download_webpage( + authentication_point_url, None, note='logging in', fatal=False, expected_status=404, + data=urlencode_postdata({'username': username, 'password': password, 'rememberMe': 'off', 'credentialId': ''})) + if not self._authentication_active(): + if re.search(r'(?i)(invalid\s+username\s+or\s+password)', resp_body or ''): + raise ExtractorError('invalid username/password', expected=True) + raise ExtractorError('Login failed') + + urlh = self._request_webpage( + f'{self._AUTH_BASE}/auth', None, + note='granting user authorization', errnote='user authorization rejected by Rokfin', + query={ + 'client_id': 'web', + 'prompt': 'none', + 'redirect_uri': 'https://rokfin.com/silent-check-sso.html', + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid', + }) + self._access_mgmt_tokens = self._download_json( + f'{self._AUTH_BASE}/token', None, + note='getting access credentials', errnote='error getting access credentials', + data=urlencode_postdata({ + 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.url).fragment).get('code')[0], + 'client_id': 'web', + 'grant_type': 'authorization_code', + 'redirect_uri': 'https://rokfin.com/silent-check-sso.html' + })) + + def _authentication_active(self): + return not ( + 
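+            # The session is treated as active only when every expected
+            # Keycloak cookie is present, i.e. the set difference below is
+            # empty. The cookie names are the ones observed from Rokfin's
+            # Keycloak deployment, not a documented contract.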
{'KEYCLOAK_IDENTITY', 'KEYCLOAK_IDENTITY_LEGACY', 'KEYCLOAK_SESSION', 'KEYCLOAK_SESSION_LEGACY'} + - set(self._get_cookies(self._AUTH_BASE))) + + def _get_auth_token(self): + return try_get(self._access_mgmt_tokens, lambda x: ' '.join([x['token_type'], x['access_token']])) + + def _download_json_using_access_token(self, url_or_request, video_id, headers={}, query={}): + assert 'authorization' not in headers + headers = headers.copy() + auth_token = self._get_auth_token() + refresh_token = self._access_mgmt_tokens.get('refresh_token') + if auth_token: + headers['authorization'] = auth_token + + json_string, urlh = self._download_webpage_handle( + url_or_request, video_id, headers=headers, query=query, expected_status=401) + if not auth_token or urlh.status != 401 or refresh_token is None: + return self._parse_json(json_string, video_id) + + self._access_mgmt_tokens = self._download_json( + f'{self._AUTH_BASE}/token', video_id, + note='User authorization expired or canceled by Rokfin. Re-authorizing ...', errnote='Failed to re-authorize', + data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': refresh_token, + 'client_id': 'web' + })) + headers['authorization'] = self._get_auth_token() + if headers['authorization'] is None: + raise ExtractorError('User authorization lost', expected=True) + + return self._download_json(url_or_request, video_id, headers=headers, query=query) + + +class RokfinPlaylistBaseIE(InfoExtractor): + _TYPES = { + 'video': 'post', + 'audio': 'post', + 'stream': 'stream', + 'dead_stream': 'stream', + 'stack': 'stack', + } + + def _get_video_data(self, metadata): + for content in metadata.get('content') or []: + media_type = self._TYPES.get(content.get('mediaType')) + video_id = content.get('id') if media_type == 'post' else content.get('mediaId') + if not media_type or not video_id: + continue + + yield self.url_result(f'https://rokfin.com/{media_type}/{video_id}', video_id=f'{media_type}/{video_id}', + video_title=str_or_none(traverse_obj(content, ('content', 'contentTitle')))) + + +class RokfinStackIE(RokfinPlaylistBaseIE): + IE_NAME = 'rokfin:stack' + IE_DESC = 'Rokfin Stacks' + _VALID_URL = r'https?://(?:www\.)?rokfin\.com/stack/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.rokfin.com/stack/271/Tulsi-Gabbard-Portsmouth-Townhall-FULL--Feb-9-2020', + 'playlist_count': 8, + 'info_dict': { + 'id': '271', + }, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._get_video_data( + self._download_json(f'{_API_BASE_URL}stack/{list_id}', list_id)), list_id) + + +class RokfinChannelIE(RokfinPlaylistBaseIE): + IE_NAME = 'rokfin:channel' + IE_DESC = 'Rokfin Channels' + _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?!((feed/?)|(discover/?)|(channels/?))$)(?P<id>[^/]+)/?$' + _TESTS = [{ + 'url': 'https://rokfin.com/TheConvoCouch', + 'playlist_mincount': 100, + 'info_dict': { + 'id': '12071-new', + 'title': 'TheConvoCouch - New', + 'description': 'md5:bb622b1bca100209b91cd685f7847f06', + }, + }] + + _TABS = { + 'new': 'posts', + 'top': 'top', + 'videos': 'video', + 'podcasts': 'audio', + 'streams': 'stream', + 'stacks': 'stack', + } + + def _real_initialize(self): + self._validate_extractor_args() + + def _validate_extractor_args(self): + requested_tabs = self._configuration_arg('tab', None) + if requested_tabs is not None and (len(requested_tabs) > 1 or requested_tabs[0] not in self._TABS): + raise ExtractorError(f'Invalid extractor-arg "tab". 
Must be one of {", ".join(self._TABS)}', expected=True) + + def _entries(self, channel_id, channel_name, tab): + pages_total = None + for page_n in itertools.count(0): + if tab in ('posts', 'top'): + data_url = f'{_API_BASE_URL}user/{channel_name}/{tab}?page={page_n}&size=50' + else: + data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}' + metadata = self._download_json( + data_url, channel_name, + note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, None, " of %s")}') + + yield from self._get_video_data(metadata) + pages_total = int_or_none(metadata.get('totalPages')) or None + is_last = metadata.get('last') + if is_last or (page_n > pages_total if pages_total else is_last is not False): + return + + def _real_extract(self, url): + channel_name = self._match_id(url) + channel_info = self._download_json(f'{_API_BASE_URL}user/{channel_name}', channel_name) + channel_id = channel_info['id'] + tab = self._configuration_arg('tab', default=['new'])[0] + + return self.playlist_result( + self._entries(channel_id, channel_name, self._TABS[tab]), + f'{channel_id}-{tab}', f'{channel_name} - {tab.title()}', str_or_none(channel_info.get('description'))) + + +class RokfinSearchIE(SearchInfoExtractor): + IE_NAME = 'rokfin:search' + IE_DESC = 'Rokfin Search' + _SEARCH_KEY = 'rkfnsearch' + _TYPES = { + 'video': (('id', 'raw'), 'post'), + 'audio': (('id', 'raw'), 'post'), + 'stream': (('content_id', 'raw'), 'stream'), + 'dead_stream': (('content_id', 'raw'), 'stream'), + 'stack': (('content_id', 'raw'), 'stack'), + } + _TESTS = [{ + 'url': 'rkfnsearch5:"zelenko"', + 'playlist_count': 5, + 'info_dict': { + 'id': '"zelenko"', + 'title': '"zelenko"', + } + }] + _db_url = None + _db_access_key = None + + def _real_initialize(self): + self._db_url, self._db_access_key = self.cache.load(self.ie_key(), 'auth', default=(None, None)) + if not self._db_url: + self._get_db_access_credentials() + + def _search_results(self, query): + total_pages = None + for page_number in itertools.count(1): + search_results = self._run_search_query( + query, data={'query': query, 'page': {'size': 100, 'current': page_number}}, + note=f'Downloading page {page_number}{format_field(total_pages, None, " of ~%s")}') + total_pages = traverse_obj(search_results, ('meta', 'page', 'total_pages'), expected_type=int_or_none) + + for result in search_results.get('results') or []: + video_id_key, video_type = self._TYPES.get(traverse_obj(result, ('content_type', 'raw')), (None, None)) + video_id = traverse_obj(result, video_id_key, expected_type=int_or_none) + if video_id and video_type: + yield self.url_result(url=f'https://rokfin.com/{video_type}/{video_id}') + if not search_results.get('results'): + return + + def _run_search_query(self, video_id, data, **kwargs): + data = json.dumps(data).encode() + for attempt in range(2): + search_results = self._download_json( + self._db_url, video_id, data=data, fatal=(attempt == 1), + headers={'authorization': self._db_access_key}, **kwargs) + if search_results: + return search_results + self.write_debug('Updating access credentials') + self._get_db_access_credentials(video_id) + + def _get_db_access_credentials(self, video_id=None): + auth_data = {'SEARCH_KEY': None, 'ENDPOINT_BASE': None} + notfound_err_page = self._download_webpage( + 'https://rokfin.com/discover', video_id, expected_status=404, note='Downloading home page') + for js_file_path in re.findall(r'<script\b[^>]*\ssrc\s*=\s*"(/static/js/[^">]+)"', notfound_err_page): + 
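+            # Each static JS bundle is scanned for the embedded search
+            # credentials; the REACT_APP_* regex below matches definitions of
+            # roughly this shape (values here are hypothetical examples, not
+            # real credentials):
+            #   REACT_APP_ENDPOINT_BASE:"https://example.ent.example-cloud.com"
+            #   REACT_APP_SEARCH_KEY:"search-xxxxxxxxxxxx"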
js_content = self._download_webpage( + f'https://rokfin.com{js_file_path}', video_id, note='Downloading JavaScript file', fatal=False) + auth_data.update(re.findall( + rf'REACT_APP_({"|".join(auth_data.keys())})\s*:\s*"([^"]+)"', js_content or '')) + if not all(auth_data.values()): + continue + + self._db_url = url_or_none(f'{auth_data["ENDPOINT_BASE"]}/api/as/v1/engines/rokfin-search/search.json') + self._db_access_key = f'Bearer {auth_data["SEARCH_KEY"]}' + self.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key)) + return + raise ExtractorError('Unable to extract access credentials') diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py new file mode 100644 index 0000000..5c62239 --- /dev/null +++ b/yt_dlp/extractor/roosterteeth.py @@ -0,0 +1,352 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + LazyList, + int_or_none, + join_nonempty, + parse_iso8601, + parse_qs, + smuggle_url, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class RoosterTeethBaseIE(InfoExtractor): + _NETRC_MACHINE = 'roosterteeth' + _API_BASE = 'https://svod-be.roosterteeth.com' + _API_BASE_URL = f'{_API_BASE}/api/v1' + + def _perform_login(self, username, password): + if self._get_cookies(self._API_BASE_URL).get('rt_access_token'): + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + resp = self._parse_json(e.cause.response.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _extract_video_info(self, data): + thumbnails = [] + for image in traverse_obj(data, ('included', 'images')): + if image.get('type') not in ('episode_image', 'bonus_feature_image'): + continue + thumbnails.extend([{ + 'id': name, + 'url': url, + } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)]) + + attributes = data.get('attributes') or {} + title = traverse_obj(attributes, 'title', 'display_title') + sub_only = attributes.get('is_sponsors_only') + + episode_id = str_or_none(data.get('uuid')) + video_id = str_or_none(data.get('id')) + if video_id and 'parent_content_id' in attributes: # parent_content_id is a bonus-only key + video_id += '-bonus' # there are collisions with bonus ids and regular ids + elif not video_id: + video_id = episode_id + + return { + 'id': video_id, + 'display_id': attributes.get('slug'), + 'title': title, + 'description': traverse_obj(attributes, 'description', 'caption'), + 'series': traverse_obj(attributes, 'show_title', 'parent_content_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': str_or_none(attributes.get('season_id')), + 'episode': title, + 'episode_number': int_or_none(attributes.get('number')), + 'episode_id': episode_id, + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + 'release_timestamp': parse_iso8601(attributes.get('original_air_date')), + 'thumbnails': thumbnails, + 'availability': 
self._availability( + needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only, + is_private=False, is_unlisted=False), + 'tags': attributes.get('genres') + } + + +class RoosterTeethIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:bonus-feature|episode|watch)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'info_dict': { + 'id': '9156', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + 'tags': ['Game Show', 'Sketch'], + 'season_number': 2, + 'availability': 'public', + 'episode_number': 10, + 'episode_id': '00374575-464e-11e7-a302-065410f210c4', + 'season': 'Season 2', + 'season_id': 'ffa27d48-464d-11e7-a302-065410f210c4', + 'channel_id': '92b6bb21-91d2-4b1b-bf95-3268fa0d9939', + 'duration': 145, + 'release_timestamp': 1462982400, + 'release_date': '20160511', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', + 'info_dict': { + 'id': '40432', + 'display_id': 'rwby-bonus-25', + 'title': 'Grimm', + 'description': 'md5:f30ff570741213418a8d2c19868b93ab', + 'episode': 'Grimm', + 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'ext': 'mp4', + 'availability': 'public', + 'episode_id': 'f8117b13-f068-499e-803e-eec9ea2dec8c', + 'episode_number': 3, + 'tags': ['Animation'], + 'season_id': '4b8f0a9e-12c4-41ed-8caa-fed15a85bab8', + 'season': 'Season 1', + 'series': 'RWBY: World of Remnant', + 'season_number': 1, + 'duration': 216, + 'release_timestamp': 1413489600, + 'release_date': '20141016', + }, + 'params': {'skip_download': True}, + }, { + # bonus feature with /watch/ url + 'url': 'https://roosterteeth.com/watch/rwby-bonus-21', + 'info_dict': { + 'id': '33-bonus', + 'display_id': 'rwby-bonus-21', + 'title': 'Volume 5 Yang Character Short', + 'description': 'md5:8c2440bc763ea90c52cfe0a68093e1f7', + 'episode': 'Volume 5 Yang Character Short', + 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'ext': 'mp4', + 'availability': 'public', + 'episode_id': 'f2a9f132-1fe2-44ad-8956-63d7c0267720', + 'episode_number': 55, + 'series': 'RWBY', + 'duration': 255, + 'release_timestamp': 1507993200, + 'release_date': '20171014', + }, + 'params': {'skip_download': True}, + }, { + # only works with video_data['attributes']['url'] m3u8 url + 'url': 'https://www.roosterteeth.com/watch/achievement-hunter-achievement-hunter-fatality-walkthrough-deathstroke-lex-luthor-captain-marvel-green-lantern-and-wonder-woman', + 'info_dict': { + 'id': '25394', + 'ext': 'mp4', + 'title': 'Fatality Walkthrough: Deathstroke, Lex Luthor, Captain Marvel, Green Lantern, and Wonder Woman', + 'description': 'md5:91bb934698344fb9647b1c7351f16964', + 'availability': 'public', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'episode': 'Fatality Walkthrough: Deathstroke, Lex Luthor, Captain Marvel, Green Lantern, and Wonder Woman', + 'episode_number': 71, + 'episode_id': 'ffaec998-464d-11e7-a302-065410f210c4', + 'season': 'Season 2008', + 'tags': ['Gaming'], + 'series': 'Achievement Hunter', + 'display_id': 
'md5:4465ce4f001735f9d7a2ae529a543d31', + 'season_id': 'ffa13340-464d-11e7-a302-065410f210c4', + 'season_number': 2008, + 'channel_id': '2cb2a70c-be50-46f5-93d7-84a1baabb4f7', + 'duration': 189, + 'release_timestamp': 1228317300, + 'release_date': '20081203', + }, + 'params': {'skip_download': True}, + }, { + # brightcove fallback extraction needed + 'url': 'https://roosterteeth.com/watch/lets-play-2013-126', + 'info_dict': { + 'id': '17845', + 'ext': 'mp4', + 'title': 'WWE \'13', + 'availability': 'public', + 'series': 'Let\'s Play', + 'episode_number': 10, + 'season_id': 'ffa23d9c-464d-11e7-a302-065410f210c4', + 'channel_id': '75ba87e8-06fd-4482-bad9-52a4da2c6181', + 'episode': 'WWE \'13', + 'episode_id': 'ffdbe55e-464d-11e7-a302-065410f210c4', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'tags': ['Gaming', 'Our Favorites'], + 'description': 'md5:b4a5226d2bbcf0dafbde11a2ba27262d', + 'display_id': 'lets-play-2013-126', + 'season_number': 3, + 'season': 'Season 3', + 'release_timestamp': 1359999840, + 'release_date': '20130204', + }, + 'expected_warnings': ['Direct m3u8 URL returned HTTP Error 403'], + 'params': {'skip_download': True}, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/bonus-feature/camp-camp-soundtrack-another-rap-song-about-foreign-cars-richie-branson', + 'only_matching': True, + }] + + _BRIGHTCOVE_ACCOUNT_ID = '6203312018001' + + def _extract_brightcove_formats_and_subtitles(self, bc_id, url, m3u8_url): + account_id = self._search_regex( + r'/accounts/(\d+)/videos/', m3u8_url, 'account id', default=self._BRIGHTCOVE_ACCOUNT_ID) + info = self._downloader.get_info_extractor('BrightcoveNew').extract(smuggle_url( + f'https://players.brightcove.net/{account_id}/default_default/index.html?videoId={bc_id}', + {'referrer': url})) + return info['formats'], info['subtitles'] + + def _real_extract(self, url): + display_id = self._match_id(url) + api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}' + + try: + video_data = self._download_json( + api_episode_url + '/videos', display_id, 'Downloading video JSON metadata', + headers={'Client-Type': 'web'})['data'][0] # web client-type yields ad-free streams + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + if self._parse_json(e.cause.response.read().decode(), display_id).get('access') is False: + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + raise + + # XXX: additional ad-free URL at video_data['links']['download'] but often gives 403 errors + m3u8_url = video_data['attributes']['url'] + is_brightcove = traverse_obj(video_data, 
('attributes', 'encoding_pipeline')) == 'brightcove' + bc_id = traverse_obj(video_data, ('attributes', 'uid', {str})) + + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') + except ExtractorError as e: + if is_brightcove and bc_id and isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.report_warning( + 'Direct m3u8 URL returned HTTP Error 403; retrying with Brightcove extraction') + formats, subtitles = self._extract_brightcove_formats_and_subtitles(bc_id, url, m3u8_url) + else: + raise + + episode = self._download_json( + api_episode_url, display_id, + 'Downloading episode JSON metadata')['data'][0] + + return { + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + **self._extract_video_info(episode) + } + + +class RoosterTeethSeriesIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://roosterteeth.com/series/rwby?season=7', + 'playlist_count': 13, + 'info_dict': { + 'id': 'rwby-7', + 'title': 'RWBY - Season 7', + }, + }, { + 'url': 'https://roosterteeth.com/series/the-weird-place', + 'playlist_count': 7, + 'info_dict': { + 'id': 'the-weird-place', + 'title': 'The Weird Place', + }, + }, { + 'url': 'https://roosterteeth.com/series/role-initiative', + 'playlist_mincount': 16, + 'info_dict': { + 'id': 'role-initiative', + 'title': 'Role Initiative', + }, + }, { + 'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'let-s-play-minecraft-9', + 'title': 'Let\'s Play Minecraft - Season 9', + }, + }] + + def _entries(self, series_id, season_number): + display_id = join_nonempty(series_id, season_number) + + def yield_episodes(data): + for episode in traverse_obj(data, ('data', lambda _, v: v['canonical_links']['self'])): + yield self.url_result( + urljoin('https://www.roosterteeth.com', episode['canonical_links']['self']), + RoosterTeethIE, **self._extract_video_info(episode)) + + series_data = self._download_json( + f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id) + for season_data in traverse_obj(series_data, ('data', lambda _, v: v['links']['episodes'])): + idx = traverse_obj(season_data, ('attributes', 'number')) + if season_number is not None and idx != season_number: + continue + yield from yield_episodes(self._download_json( + urljoin(self._API_BASE, season_data['links']['episodes']), display_id, + f'Downloading season {idx} JSON metadata', query={'per_page': 1000})) + + if season_number is None: # extract series-level bonus features + yield from yield_episodes(self._download_json( + f'{self._API_BASE_URL}/shows/{series_id}/bonus_features?order=asc&order_by&per_page=1000', + display_id, 'Downloading bonus features JSON metadata', fatal=False)) + + def _real_extract(self, url): + series_id = self._match_id(url) + season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none) + + entries = LazyList(self._entries(series_id, season_number)) + return self.playlist_result( + entries, + join_nonempty(series_id, season_number), + join_nonempty(entries[0].get('series'), season_number, delim=' - Season ')) diff --git a/yt_dlp/extractor/rottentomatoes.py b/yt_dlp/extractor/rottentomatoes.py new file mode 100644 index 0000000..e357175 --- /dev/null +++ b/yt_dlp/extractor/rottentomatoes.py @@ -0,0 +1,80 @@ +from .common import InfoExtractor +from ..utils import ( + 
ExtractorError, + clean_html, + float_or_none, + get_element_by_class, + join_nonempty, + traverse_obj, + url_or_none, +) + + +class RottenTomatoesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/(?P<playlist>[^/]+)(?:/(?P<tr>trailers)(?:/(?P<id>\w+))?)?' + + _TESTS = [{ + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + 'info_dict': { + 'id': '11028566', + 'ext': 'mp4', + 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.' + }, + 'skip': 'No longer available', + }, { + 'url': 'https://www.rottentomatoes.com/m/toy_story_3/trailers/VycaVoBKhGuk', + 'info_dict': { + 'id': 'VycaVoBKhGuk', + 'ext': 'mp4', + 'title': 'Toy Story 3: Trailer 2', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149.941 + }, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3', + 'info_dict': { + 'id': 'toy_story_3', + 'title': 'Toy Story 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers', + 'info_dict': { + 'id': 'toy_story_3-trailers', + }, + 'playlist_mincount': 5, + }] + + def _extract_videos(self, data, display_id): + for video in traverse_obj(data, (lambda _, v: v['publicId'] and v['file'] and v['type'] == 'hls')): + yield { + 'formats': self._extract_m3u8_formats( + video['file'], display_id, 'mp4', m3u8_id='hls', fatal=False), + **traverse_obj(video, { + 'id': 'publicId', + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {float_or_none}), + 'thumbnail': ('image', {url_or_none}), + }), + } + + def _real_extract(self, url): + playlist_id, trailers, video_id = self._match_valid_url(url).group('playlist', 'tr', 'id') + playlist_id = join_nonempty(playlist_id, trailers) + webpage = self._download_webpage(url, playlist_id) + data = self._search_json( + r'<script[^>]+\bid=["\'](?:heroV|v)ideos["\'][^>]*>', webpage, + 'data', playlist_id, contains_pattern=r'\[{(?s:.+)}\]') + + if video_id: + video_data = traverse_obj(data, lambda _, v: v['publicId'] == video_id) + if not video_data: + raise ExtractorError('Unable to extract video from webpage') + return next(self._extract_videos(video_data, video_id)) + + return self.playlist_result( + self._extract_videos(data, playlist_id), playlist_id, + clean_html(get_element_by_class('scoreboard__title', webpage))) diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py new file mode 100644 index 0000000..411a625 --- /dev/null +++ b/yt_dlp/extractor/rozhlas.py @@ -0,0 +1,363 @@ +import itertools + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + remove_start, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + 
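+ # The player page is scraped only for metadata (title, description,
+ # duration); the audio itself is always the direct MP3 derived from the id,
+ # e.g. id 3421320 from the test above maps to
+ # http://media.rozhlas.cz/_audio/3421320.mp3 (see the URL template below).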
audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } + + +class RozhlasBaseIE(InfoExtractor): + def _extract_formats(self, entry, audio_id): + formats = [] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, audio_id) + try: + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 429: + retry.error = e.cause + else: + self.report_warning(e.msg) + + return formats + + +class RozhlasVltavaIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', + 'md5': 'ba2fdbc1242fc16771c7695d271ec355', + 'info_dict': { + 'id': '8891337', + 'title': 'md5:21f99739d04ab49d8c189ec711eef4ec', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'ba2fdbc1242fc16771c7695d271ec355', + 'info_dict': { + 'id': '10520988', + 'ext': 'mp3', + 'title': 'Papej masíčko! Porcujeme a bilancujeme filmy a seriály, které to letos zabily', + 'description': 'md5:1c6d29fb9564e1f17fc1bb83ae7da0bc', + 'duration': 1574, + 'artist': 'Aleš Stuchlý', + 'channel_id': 'radio-wave', + }, + }] + }, { + 'url': 'https://wave.rozhlas.cz/poslechnete-si-neklid-podcastovy-thriller-o-vine-strachu-a-vztahu-ktery-zasel-8554744', + 'info_dict': { + 'id': '8554744', + 'title': 'Poslechněte si Neklid. Podcastový thriller o vině, strachu a vztahu, který zašel příliš daleko', + }, + 'playlist_count': 5, + 'playlist': [{ + 'md5': '93d4109cf8f40523699ae9c1d4600bdd', + 'info_dict': { + 'id': '9890713', + 'ext': 'mp3', + 'title': 'Neklid #1', + 'description': '1. díl: Neklid: 1. díl', + 'duration': 1025, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #1', + 'chapter_number': 1, + }, + }, { + 'md5': 'e9763235be4a6dcf94bc8a5bac1ca126', + 'info_dict': { + 'id': '9890716', + 'ext': 'mp3', + 'title': 'Neklid #2', + 'description': '2. díl: Neklid: 2. díl', + 'duration': 768, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #2', + 'chapter_number': 2, + }, + }, { + 'md5': '00b642ea94b78cc949ac84da09f87895', + 'info_dict': { + 'id': '9890722', + 'ext': 'mp3', + 'title': 'Neklid #3', + 'description': '3. díl: Neklid: 3. 
díl', + 'duration': 607, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #3', + 'chapter_number': 3, + }, + }, { + 'md5': 'faef97b1b49da7df874740f118c19dea', + 'info_dict': { + 'id': '9890728', + 'ext': 'mp3', + 'title': 'Neklid #4', + 'description': '4. díl: Neklid: 4. díl', + 'duration': 621, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #4', + 'chapter_number': 4, + }, + }, { + 'md5': '6e729fa39b647325b868d419c76f3efa', + 'info_dict': { + 'id': '9890734', + 'ext': 'mp3', + 'title': 'Neklid #5', + 'description': '5. díl: Neklid: 5. díl', + 'duration': 908, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #5', + 'chapter_number': 5, + }, + }] + }, { + 'url': 'https://dvojka.rozhlas.cz/karel-siktanc-cerny-jezdec-bily-kun-napinava-pohadka-o-tajemnem-prizraku-8946969', + 'info_dict': { + 'id': '8946969', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '10631121', + 'ext': 'm4a', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku', + 'description': 'Karel Šiktanc: Černý jezdec, bílý kůň', + 'duration': 2656, + 'artist': 'Tvůrčí skupina Drama a literatura', + 'channel_id': 'dvojka', + }, + }], + 'params': {'skip_download': 'dash'}, + }] + + def _extract_video(self, entry): + audio_id = entry['meta']['ga']['contentId'] + chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) + + return { + 'id': audio_id, + 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None, + 'chapter_number': chapter_number, + 'formats': self._extract_formats(entry, audio_id), + **traverse_obj(entry, { + 'title': ('meta', 'ga', 'contentName'), + 'description': 'title', + 'duration': ('duration', {int_or_none}), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + }) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # FIXME: Use get_element_text_and_html_by_tag when it accepts less strict html + data = self._parse_json(extract_attributes(self._search_regex( + r'(<div class="mujRozhlasPlayer" data-player=\'[^\']+\'>)', + webpage, 'player'))['data-player'], video_id)['data'] + + return { + '_type': 'playlist', + 'id': str_or_none(data.get('embedId')) or video_id, + 'title': traverse_obj(data, ('series', 'title')), + 'entries': map(self._extract_video, data['playlist']), + } + + +class MujRozhlasIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # single episode extraction + 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', + 'md5': '6f8fd68663e64936623e67c152a669e0', + 'info_dict': { + 'id': '10787730', + 'ext': 'mp3', + 'title': 'Ach jo, zase to telecí! 
Řízek je mnohem míň český, než jsme si mysleli', + 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', + 'timestamp': 1684915200, + 'modified_timestamp': 1687550432, + 'series': 'Vykopávky', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', + 'channel_id': 'radio-wave', + 'upload_date': '20230524', + 'modified_date': '20230623', + }, + }, { + # serial extraction + 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b', + 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. O pramenech a genezi Babičky', + 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be', + }, + }, { + # show extraction + 'url': 'https://www.mujrozhlas.cz/nespavci', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '09db9b37-d0f4-368c-986a-d3439f741f08', + 'title': 'Nespavci', + 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', + }, + }, { + # serialPart + 'url': 'https://www.mujrozhlas.cz/povidka/gustavo-adolfo-becquer-hora-duchu', + 'info_dict': { + 'id': '8889035', + 'ext': 'm4a', + 'title': 'Gustavo Adolfo Bécquer: Hora duchů', + 'description': 'md5:343a15257b376c276e210b78e900ffea', + 'chapter': 'Hora duchů a Polibek – dva tajemné příběhy Gustava Adolfa Bécquera', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/2adfe1387fb140634be725c1ccf26214.jpg', + 'timestamp': 1708173000, + 'episode': 'Episode 1', + 'episode_number': 1, + 'series': 'Povídka', + 'modified_date': '20240217', + 'upload_date': '20240217', + 'modified_timestamp': 1708173198, + 'channel_id': 'vltava', + }, + 'params': {'skip_download': 'dash'}, + }] + + def _call_api(self, path, item_id, msg='API JSON'): + return self._download_json( + f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id, + note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data'] + + def _extract_audio_entry(self, entry): + audio_id = entry['meta']['ga']['contentId'] + + return { + 'id': audio_id, + 'formats': self._extract_formats(entry['attributes'], audio_id), + **traverse_obj(entry, { + 'title': ('attributes', 'title'), + 'description': ('attributes', 'description'), + 'episode_number': ('attributes', 'part'), + 'series': ('attributes', 'mirroredShow', 'title'), + 'chapter': ('attributes', 'mirroredSerial', 'title'), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + 'timestamp': ('attributes', 'since', {unified_timestamp}), + 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}), + 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}), + }) + } + + def _entries(self, api_url, playlist_id): + for page in itertools.count(1): + episodes = self._download_json( + api_url, playlist_id, note=f'Downloading episodes page {page}', + errnote=f'Failed to download episodes page {page}', fatal=False) + for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])): + yield self._extract_audio_entry(episode) + api_url = traverse_obj(episodes, ('links', 'next', {url_or_none})) + if not api_url: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id) + + entity = info['siteEntityBundle'] + + if entity in ('episode', 'serialPart'): + return self._extract_audio_entry(self._call_api( + 'episodes', 
info['contentId'], 'episode info API JSON')) + + elif entity in ('show', 'serial'): + playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId'] + data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON') + api_url = data['relationships']['episodes']['links']['related'] + return self.playlist_result( + self._entries(api_url, playlist_id), playlist_id, + **traverse_obj(data, ('attributes', { + 'title': 'title', + 'description': 'description', + }))) + + else: + # `entity == 'person'` not implemented yet by API, ref: + # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation + raise ExtractorError(f'Unsupported entity type "{entity}"') diff --git a/yt_dlp/extractor/rte.py b/yt_dlp/extractor/rte.py new file mode 100644 index 0000000..7ba80d4 --- /dev/null +++ b/yt_dlp/extractor/rte.py @@ -0,0 +1,162 @@ +import re + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + float_or_none, + parse_iso8601, + str_or_none, + try_get, + unescapeHTML, + url_or_none, + ExtractorError, +) + + +class RteBaseIE(InfoExtractor): + def _real_extract(self, url): + item_id = self._match_id(url) + + info_dict = {} + formats = [] + + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) + + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, HTTPError) and ee.cause.status == 404: + error_info = self._parse_json(ee.cause.response.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise + + # NB the string values in the JSON are stored using XML escaping(!) 
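+ # Illustrative only (hypothetical value): a stored title like
+ # 'RT&#201; News &amp; Sport' becomes 'RTÉ News & Sport' after
+ # unescapeHTML(), which is why title and description are unescaped below
+ # rather than used verbatim.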
+ show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) + + info_dict['formats'] = formats + return info_dict + + +class RteIE(RteBaseIE): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 'iWitness', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, + }, + } + + +class RteRadioIE(RteBaseIE): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have two distinct specifier formats, + # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>: + # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_ + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An <id> uniquely defines an individual recording, and is the only part we require. 
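+ # For example (IDs taken from the tests below):
+ # #!rii=16:10507902:2414:27-12-2015: -> <id> 10507902 (old format)
+ # #!rii=b16_3250678_8861_06-04-2012_ -> <id> 3250678 (new format)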
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)' + + _TESTS = [{ + # Old-style player URL; HLS and RTMPE formats + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + }, { + # New-style player URL; RTMPE formats only + 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', + 'info_dict': { + 'id': '3250678', + 'ext': 'flv', + 'title': 'The Lyric Concert with Paul Herriott', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': '', + 'timestamp': 1333742400, + 'upload_date': '20120406', + 'duration': 7199.016, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] diff --git a/yt_dlp/extractor/rtl2.py b/yt_dlp/extractor/rtl2.py new file mode 100644 index 0000000..07e1aa3 --- /dev/null +++ b/yt_dlp/extractor/rtl2.py @@ -0,0 +1,95 @@ +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class RTL2IE(InfoExtractor): + IE_NAME = 'rtl2' + _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', + 'info_dict': { + 'id': 'folge-203-0', + 'ext': 'f4v', + 'title': 'GRIP sucht den Sommerkönig', + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', + 'info_dict': { + 'id': 'anna-erwischt-alex', + 'ext': 'mp4', + 'title': 'Anna erwischt Alex!', + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 
+ }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }] + + def _real_extract(self, url): + vico_id, vivi_id, display_id = self._match_valid_url(url).groups() + if not vico_id: + webpage = self._download_webpage(url, display_id) + + mobj = re.search( + r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + + info = self._download_json( + 'https://service.rtl2.de/api-player-vipo/video.php', + display_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) + video_info = info['video'] + title = video_info['titel'] + + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'quality': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + } diff --git a/yt_dlp/extractor/rtlnl.py b/yt_dlp/extractor/rtlnl.py new file mode 100644 index 0000000..724cb64 --- /dev/null +++ b/yt_dlp/extractor/rtlnl.py @@ -0,0 +1,294 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class RtlNlIE(InfoExtractor): + IE_NAME = 'rtl.nl' + IE_DESC = 'rtl.nl and rtlxl.nl' + _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)'] + _VALID_URL = r'''(?x) + https?://(?:(?:www|static)\.)? 
+ (?: + rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/| + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)| + embed\.rtl\.nl/\#uuid= + ) + (?P<id>[0-9a-f-]+)''' + + _TESTS = [{ + # new URL schema + 'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f', + 'md5': '490428f1187b60d714f34e1f2e3af0b6', + 'info_dict': { + 'id': '0bd1384d-d970-3086-98bb-5c104e10c26f', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1593293400, + 'upload_date': '20200627', + 'duration': 661.08, + }, + }, { + # old URL schema + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', + 'info_dict': { + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, + }, + 'skip': '404', + }, { + # best format available a3t + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'md5': 'dea7474214af1271d91ef332fb8be7ea', + 'info_dict': { + 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', + 'ext': 'mp4', + 'timestamp': 1424039400, + 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'upload_date': '20150215', + 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', + } + }, { + # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275) + # best format available nettv + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, + }, { + # encrypted m3u8 streams, georestricted + 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', + 'only_matching': True, + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', + 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, + }, { + 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', + 'only_matching': True, + }, { + 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', + 'only_matching': True, + }, { + # new embed URL schema + 'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'only_matching': True, + }] + + def _real_extract(self, url): + uuid = self._match_id(url) + info = self._download_json( + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % 
uuid, + uuid) + + material = info['material'][0] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - %s' % subtitle + description = material.get('synopsis') + + meta = info.get('meta', {}) + + videopath = material['videopath'] + m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath + + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) + + thumbnails = [] + + for p in ('poster_base_url', '"thumb_base_url"'): + if not meta.get(p): + continue + + thumbnails.append({ + 'url': self._proto_relative_url(meta[p] + uuid), + 'width': int_or_none(self._search_regex( + r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), + 'height': int_or_none(self._search_regex( + r'/sz=[0-9]+x([0-9]+)', + meta[p], 'thumbnail height', fatal=False)) + }) + + return { + 'id': uuid, + 'title': title, + 'formats': formats, + 'timestamp': material['original_date'], + 'description': description, + 'duration': parse_duration(material.get('duration')), + 'thumbnails': thumbnails, + } + + +class RTLLuBaseIE(InfoExtractor): + _MEDIA_REGEX = { + 'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)', + 'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)', + 'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)', + } + + def get_media_url(self, webpage, video_id, media_type): + return self._search_regex(self._MEDIA_REGEX[media_type], webpage, f'{media_type} url', default=None) + + def get_formats_and_subtitles(self, webpage, video_id): + video_url, audio_url = self.get_media_url(webpage, video_id, 'video'), self.get_media_url(webpage, video_id, 'audio') + + formats, subtitles = [], {} + if video_url is not None: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id) + if audio_url is not None: + formats.append({'url': audio_url, 'ext': 'mp3', 'vcodec': 'none'}) + + return formats, subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + is_live = video_id in ('live', 'live-2', 'lauschteren') + + # TODO: extract comment from https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id> + # we can context from <rtl-comments context=<context> in webpage + webpage = self._download_webpage(url, video_id) + + formats, subtitles = self.get_formats_and_subtitles(webpage, video_id) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self.get_media_url(webpage, video_id, 'thumbnail') or self._og_search_thumbnail(webpage, default=None), + 'is_live': is_live, + } + + +class RTLLuTeleVODIE(RTLLuBaseIE): + IE_NAME = 'rtl.lu:tele-vod' + _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?' 
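+ # Both URL shapes resolve to the same numeric id, e.g. (from the tests below):
+ # https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html -> 3266757
+ # https://www.rtl.lu/video/3295215 -> 3295215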
+ _TESTS = [{ + 'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html', + 'info_dict': { + 'id': '3266757', + 'title': 'Informatiounsversammlung Héichwaasser', + 'ext': 'mp4', + 'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg', + 'description': 'md5:b1db974408cc858c9fd241812e4a2a14', + } + }, { + 'url': 'https://www.rtl.lu/video/3295215', + 'info_dict': { + 'id': '3295215', + 'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht', + 'ext': 'mp4', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg', + 'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b', + } + }] + + +class RTLLuArticleIE(RTLLuBaseIE): + IE_NAME = 'rtl.lu:article' + _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html' + _TESTS = [{ + # Audio-only + 'url': 'https://www.rtl.lu/sport/news/a/1934360.html', + 'info_dict': { + 'id': '1934360', + 'ext': 'mp3', + 'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg', + 'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7', + 'title': 'md5:40aa85f135578fbd549d3c9370321f99', + } + }, { + # 5minutes + 'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html', + 'info_dict': { + 'id': '1853173', + 'ext': 'mp4', + 'description': 'md5:ac031da0740e997a5cf4633173634fee', + 'title': 'md5:87e17722ed21af0f24be3243f4ec0c46', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg', + } + }, { + # today.lu + 'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html', + 'info_dict': { + 'id': '1936203', + 'ext': 'mp4', + 'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower', + 'description': 'The witchy theme continues in the latest episode of Once Upon A Time...', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg', + } + }] + + +class RTLLuLiveIE(RTLLuBaseIE): + _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)' + _TESTS = [{ + # Tele:live + 'url': 'https://www.rtl.lu/tele/live', + 'info_dict': { + 'id': 'live', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg', + } + }, { + # Tele:live-2 + 'url': 'https://www.rtl.lu/tele/live-2', + 'info_dict': { + 'id': 'live-2', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg', + } + }, { + # Radio:lauschteren + 'url': 'https://www.rtl.lu/radio/lauschteren', + 'info_dict': { + 'id': 'lauschteren', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg', + } + }] + + +class RTLLuRadioIE(RTLLuBaseIE): + _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?' 
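+ # e.g. https://www.rtl.lu/radio/5-vir-12/s/4033058.html -> id 4033058 (see the
+ # test below); extraction itself is handled by the shared RTLLuBaseIE logic.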
+ _TESTS = [{ + 'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html', + 'info_dict': { + 'id': '4033058', + 'ext': 'mp3', + 'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9', + 'title': '5 vir 12 - Stau um Stau', + 'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg', + } + }] diff --git a/yt_dlp/extractor/rtnews.py b/yt_dlp/extractor/rtnews.py new file mode 100644 index 0000000..6be9945 --- /dev/null +++ b/yt_dlp/extractor/rtnews.py @@ -0,0 +1,196 @@ +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class RTNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rt\.com/[^/]+/(?:[^/]+/)?(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.rt.com/sport/546301-djokovic-arrives-belgrade-crowds/', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '546301', + 'title': 'Crowds gather to greet deported Djokovic as he returns to Serbia (VIDEO)', + 'description': 'md5:1d5bfe1a988d81fd74227cfdf93d314d', + 'thumbnail': 'https://cdni.rt.com/files/2022.01/article/61e587a085f540102c3386c1.png' + }, + }, { + 'url': 'https://www.rt.com/shows/in-question/535980-plot-to-assassinate-julian-assange/', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '535980', + 'title': 'The plot to assassinate Julian Assange', + 'description': 'md5:55279ce5e4441dc1d16e2e4a730152cd', + 'thumbnail': 'https://cdni.rt.com/files/2021.09/article/615226f42030274e8879b53d.png' + }, + 'playlist': [{ + 'info_dict': { + 'id': '6152271d85f5400464496162', + 'ext': 'mp4', + 'title': '6152271d85f5400464496162', + }, + }] + }] + + def _entries(self, webpage): + video_urls = set(re.findall(r'https://cdnv\.rt\.com/.*[a-f0-9]+\.mp4', webpage)) + for v_url in video_urls: + v_id = re.search(r'([a-f0-9]+)\.mp4', v_url).group(1) + if v_id: + yield { + 'id': v_id, + 'title': v_id, + 'url': v_url, + } + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + + return { + '_type': 'playlist', + 'id': id, + 'entries': self._entries(webpage), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + + +class RTDocumentryIE(InfoExtractor): + _VALID_URL = r'https?://rtd\.rt\.com/(?:(?:series|shows)/[^/]+|films)/(?P<id>[^/?$&#]+)' + + _TESTS = [{ + 'url': 'https://rtd.rt.com/films/escobars-hitman/', + 'info_dict': { + 'id': 'escobars-hitman', + 'ext': 'mp4', + 'title': "Escobar's Hitman. Former drug-gang killer, now loved and loathed in Colombia", + 'description': 'md5:647c76984b7cb9a8b52a567e87448d88', + 'thumbnail': 'https://cdni.rt.com/rtd-files/films/escobars-hitman/escobars-hitman_11.jpg', + 'average_rating': 8.53, + 'duration': 3134.0 + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/iskander-tactical-system-natos-headache/', + 'info_dict': { + 'id': 'iskander-tactical-system-natos-headache', + 'ext': 'mp4', + 'title': "Iskander tactical system. NATO's headache | The Kalashnikova Show. 
Episode 10", + 'description': 'md5:da7c24a0aa67bc2bb88c86658508ca87', + 'thumbnail': 'md5:89de8ce38c710b7c501ff02d47e2aa89', + 'average_rating': 9.27, + 'duration': 274.0, + 'timestamp': 1605726000, + 'view_count': int, + 'upload_date': '20201118' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/introduction-to-safe-digital-life-ep2/', + 'info_dict': { + 'id': 'introduction-to-safe-digital-life-ep2', + 'ext': 'mp4', + 'title': 'How to Keep your Money away from Hackers | I am Hacked. Episode 2', + 'description': 'md5:c46fa9a5af86c0008c45a3940a8cce87', + 'thumbnail': 'md5:a5e81b9bf5aed8f5e23d9c053601b825', + 'average_rating': 10.0, + 'duration': 1524.0, + 'timestamp': 1636977600, + 'view_count': int, + 'upload_date': '20211115' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + ld_json = self._search_json_ld(webpage, None, fatal=False) + if not ld_json: + self.raise_no_formats('No video/audio found at the provided url.', expected=True) + media_json = self._parse_json( + self._search_regex(r'(?s)\'Med\'\s*:\s*\[\s*({.+})\s*\]\s*};', webpage, 'media info'), + id, transform_source=js_to_json) + if 'title' not in ld_json and 'title' in media_json: + ld_json['title'] = media_json['title'] + formats = [{'url': src['file']} for src in media_json.get('sources') or [] if src.get('file')] + + return { + 'id': id, + 'thumbnail': media_json.get('image'), + 'formats': formats, + **ld_json + } + + +class RTDocumentryPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://rtd\.rt\.com/(?:series|shows)/(?P<id>[^/]+)/$' + + _TESTS = [{ + 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/', + 'playlist_mincount': 6, + 'info_dict': { + 'id': 'i-am-hacked-trailer', + }, + }, { + 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/', + 'playlist_mincount': 34, + 'info_dict': { + 'id': 'the-kalashnikova-show-military-secrets-anna-knishenko', + }, + }] + + def _entries(self, webpage, id): + video_urls = set(re.findall(r'list-2__link\s*"\s*href="([^"]+)"', webpage)) + for v_url in video_urls: + if id not in v_url: + continue + yield self.url_result( + 'https://rtd.rt.com%s' % v_url, + ie=RTDocumentryIE.ie_key()) + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + + return { + '_type': 'playlist', + 'id': id, + 'entries': self._entries(webpage, id), + } + + +class RuptlyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruptly\.tv/[a-z]{2}/videos/(?P<id>\d+-\d+)' + + _TESTS = [{ + 'url': 'https://www.ruptly.tv/en/videos/20220112-020-Japan-Double-trouble-Tokyo-zoo-presents-adorable-panda-twins', + 'info_dict': { + 'id': '20220112-020', + 'ext': 'mp4', + 'title': 'Japan: Double trouble! 
Tokyo zoo presents adorable panda twins | Video Ruptly', + 'description': 'md5:85a8da5fdb31486f0562daf4360ce75a', + 'thumbnail': 'https://storage.ruptly.tv/thumbnails/20220112-020/i6JQKnTNpYuqaXsR/i6JQKnTNpYuqaXsR.jpg' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + m3u8_url = self._search_regex(r'preview_url"\s?:\s?"(https?://storage\.ruptly\.tv/video_projects/.+\.m3u8)"', webpage, 'm3u8 url', fatal=False) + if not m3u8_url: + self.raise_no_formats('No video/audio found at the provided url.', expected=True) + formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id, ext='mp4') + return { + 'id': id, + 'formats': formats, + 'subtitles': subs, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py new file mode 100644 index 0000000..5928a20 --- /dev/null +++ b/yt_dlp/extractor/rtp.py @@ -0,0 +1,97 @@ +from .common import InfoExtractor +from ..utils import js_to_json +import re +import json +import urllib.parse +import base64 + + +class RTPIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' + _TESTS = [{ + 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', + 'md5': 'e736ce0c665e459ddb818546220b4ef8', + 'info_dict': { + 'id': 'e174042', + 'ext': 'mp3', + 'title': 'Paixões Cruzadas', + 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', + 'only_matching': True, + }] + + _RX_OBFUSCATION = re.compile(r'''(?xs) + atob\s*\(\s*decodeURIComponent\s*\(\s* + (\[[0-9A-Za-z%,'"]*\]) + \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\) + ''') + + def __unobfuscate(self, data, *, video_id): + if data.startswith('{'): + data = self._RX_OBFUSCATION.sub( + lambda m: json.dumps( + base64.b64decode(urllib.parse.unquote( + ''.join(self._parse_json(m.group(1), video_id)) + )).decode('iso-8859-1')), + data) + return js_to_json(data) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + 'twitter:title', webpage, display_name='title', fatal=True) + + f, config = self._search_regex( + r'''(?sx) + var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s* + var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) + ''', webpage, + 'player config', group=('f', 'config')) + + f = self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) + config = self._parse_json( + config, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) + + formats = [] + if isinstance(f, dict): + f_hls = f.get('hls') + if f_hls is not None: + formats.extend(self._extract_m3u8_formats( + f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) + + f_dash = f.get('dash') + if f_dash is not None: + formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash')) + else: + formats.append({ + 'format_id': 'f', + 'url': f, + 'vcodec': 'none' if config.get('mediaType') == 'audio' else None, + }) + + subtitles = {} + + vtt = config.get('vtt') + if vtt is not None: + for lcode, lname, url in vtt: + subtitles.setdefault(lcode, []).append({ + 'name': lname, + 'url': url, + }) + + return { + 'id': video_id, + 'title': title, 
+ 'formats': formats, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/rtrfm.py b/yt_dlp/extractor/rtrfm.py new file mode 100644 index 0000000..7381d82 --- /dev/null +++ b/yt_dlp/extractor/rtrfm.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor + + +class RTRFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P<id>[^/?\#&]+)' + _TESTS = [ + { + 'url': 'https://rtrfm.com.au/shows/breakfast/', + 'md5': '46168394d3a5ce237cf47e85d0745413', + 'info_dict': { + 'id': 'breakfast-2021-11-16', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + 'skip': 'ID and md5 changes daily', + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/', + 'md5': '396bedf1e40f96c62b30d4999202a790', + 'info_dict': { + 'id': 'breakfast-2021-11-11', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2021-11-11', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/', + 'md5': '594027f513ec36a24b15d65007a24dff', + 'info_dict': { + 'id': 'breakfast-2020-06-01', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2020-06-01', + 'description': r're:^Breakfast with Taylah ', + }, + 'skip': 'This audio has expired', + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show, date, title = self._search_regex( + r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''', + webpage, 'details', group=('show', 'date', 'title')) + url = self._download_json( + 'https://restreams.rtrfm.com.au/rzz', + show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u'] + # This is the only indicator of an error until trying to download the URL and + # downloads of mp4 URLs always fail (403 for current episodes, 404 for missing). 
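+ # Unverified illustration of the rzz response shape: an available episode
+ # returns something like {"u": "https://restreams.rtrfm.com.au/.../show.mp3"},
+ # while a missing or expired date yields a '.mp4' URL, which the check below
+ # treats as "no playable audio".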
+ if '.mp4' in url: + url = None + self.raise_no_formats('Expired or no episode on this date', expected=True) + return { + 'id': '%s-%s' % (show, date), + 'title': '%s %s' % (title, date), + 'series': title, + 'url': url, + 'release_date': date, + 'description': self._og_search_description(webpage), + } diff --git a/yt_dlp/extractor/rts.py b/yt_dlp/extractor/rts.py new file mode 100644 index 0000000..bce5cba --- /dev/null +++ b/yt_dlp/extractor/rts.py @@ -0,0 +1,232 @@ +import re + +from .srgssr import SRGSSRIE +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, + urljoin, +) + + +class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE + _WORKING = False + IE_DESC = 'RTS.ch' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + + _TESTS = [ + { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': '753b877968ad8afaeddccc374d4256a5', + 'info_dict': { + 'id': '3449373', + 'display_id': 'les-enfants-terribles', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, + { + 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', + 'info_dict': { + 'id': '5624065', + 'title': 'Passe-moi les jumelles', + }, + 'playlist_mincount': 4, + }, + { + 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', + 'info_dict': { + 'id': '5745975', + 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', + 'ext': 'mp4', + 'duration': 48, + 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', + 'description': 'Hockey - Playoff', + 'uploader': 'Hockey', + 'upload_date': '20140403', + 'timestamp': 1396556882, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + 'skip': 'Blocked outside Switzerland', + }, + { + 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', + 'info_dict': { + 'id': '5745356', + 'display_id': 'londres-cachee-par-un-epais-smog', + 'ext': 'mp4', + 'duration': 33, + 'title': 'Londres cachée par un épais smog', + 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', + 'uploader': 'L\'actu en vidéo', + 'upload_date': '20140403', + 'timestamp': 1396537322, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, + { + 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', + 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', + 'info_dict': { + 'id': '5706148', + 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', + 'ext': 'mp3', + 'duration': 123, + 
'title': '"Urban Hippie", de Damien Krisl', + 'description': 'Des Hippies super glam.', + 'upload_date': '20140403', + 'timestamp': 1396551600, + }, + }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + m = self._match_valid_url(url) + media_id = m.group('rts_id') or m.group('id') + display_id = m.group('display_id') or media_id + + def download_json(internal_id): + return self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, + display_id) + + all_info = download_json(media_id) + + # media_id extracted out of URL is not always a real id + if 'video' not in all_info and 'audio' not in all_info: + entries = [] + + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.url).group('id') != media_id: + return self.url_result(urlh.url, 'RTS') + + # article with videos on rhs + videos = re.findall( + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', + page) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) + + internal_id = self._html_search_regex( + r'<(?:video|audio) data-id="([0-9]+)"', page, + 'internal video id') + all_info = download_json(internal_id) + + media_type = 'video' if 'video' in all_info else 'audio' + + # check for errors + self._get_media_data('rts', media_type, media_id) + + info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] + + title = info['title'] + + def extract_bitrate(url): + return int_or_none(self._search_regex( + r'-([0-9]+)k\.', url, 'bitrate', default=None)) + + formats = [] + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: + continue + if format_id == 'hls_sd' and 'hls' in streams: + continue + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' 
not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'tbr': extract_bitrate(format_url), + }) + + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': urljoin(download_base, media_url), + 'tbr': rate or extract_bitrate(media_url), + }) + + self._check_formats(formats, media_id) + + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + + return { + 'id': media_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': info.get('intro'), + 'duration': duration, + 'view_count': int_or_none(info.get('plays')), + 'uploader': info.get('programName'), + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), + } diff --git a/yt_dlp/extractor/rtvcplay.py b/yt_dlp/extractor/rtvcplay.py new file mode 100644 index 0000000..741c472 --- /dev/null +++ b/yt_dlp/extractor/rtvcplay.py @@ -0,0 +1,285 @@ +import re + +from .common import InfoExtractor, ExtractorError +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + float_or_none, + js_to_json, + mimetype2ext, + traverse_obj, + urljoin, + url_or_none, +) + + +class RTVCPlayBaseIE(InfoExtractor): + _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co' + + def _extract_player_config(self, webpage, video_id): + return self._search_json( + r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage), + 'player_config', video_id, transform_source=js_to_json) + + def _extract_formats_and_subtitles_player_config(self, player_config, video_id): + formats, subtitles = [], {} + for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))): + ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source['url'])) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + source['url'], video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': source['url'], + 'ext': ext, + }) + + return formats, subtitles + + +class RTVCPlayIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional', + 'info_dict': { + 'id': 'canal-institucional', + 'title': r're:^Canal Institucional', + 'description': 'md5:eff9e548394175928059320c006031ea', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia', + 'info_dict': { + 'id': 'senal-colombia', + 'title': r're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 
'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional', + 'info_dict': { + 'id': 'radio-nacional', + 'title': r're:^Radio Nacional', + 'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas', + 'md5': '1288ee6f6d1330d880f98bff2ed710a3', + 'info_dict': { + 'id': 'senoritas', + 'title': 'Señoritas', + 'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022', + 'md5': 'f040a7380a269ad633cf837384d5e9fc', + 'info_dict': { + 'id': 'james-regresa-clases-28022022', + 'title': 'James regresa a clases - 28/02/2022', + 'description': 'md5:c5dcdf757c7ab29305e8763c6007e675', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo', + 'info_dict': { + 'id': 'llinas-el-cerebro-y-el-universo', + 'title': 'Llinás, el cerebro y el universo', + 'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa', + 'info_dict': { + 'id': 'profe-en-tu-casa', + 'title': 'Profe en tu casa', + 'description': 'md5:47dbe20e263194413b1db2a2805a4f2e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 537, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'info_dict': { + 'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'title': 'Relato de un náufrago: una travesía del periodismo a la literatura', + 'description': 'md5:6da28fdca4a5a568ea47ef65ef775603', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones', + 'info_dict': { + 'id': 'diez-versiones', + 'title': 'Diez versiones', + 'description': 'md5:997471ed971cb3fd8e41969457675306', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 20, + }] + + def _real_extract(self, url): + video_id, category = self._match_valid_url(url).group('id', 'category') + webpage = self._download_webpage(url, video_id) + + hydration = self._search_json( + r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration', + video_id, transform_source=js_to_json)['content']['currentContent'] + + asset_id = traverse_obj(hydration, ('video', 'assetid')) + if asset_id: + hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id) + else: + hls_url = traverse_obj(hydration, ('channel', 'hls')) + + metadata = traverse_obj(hydration, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'), + }, get_all=False) + + # Probably it's a program's page + if not hls_url: + seasons = traverse_obj( + hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'), + get_all=False) + if not seasons: + podcast_episodes = hydration.get('audios') + if not podcast_episodes: + raise ExtractorError('Could not find asset_id nor 
program playlist nor podcast episodes') + + return self.playlist_result([ + self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'episode_number': ('chapter_number', {float_or_none}, {int_or_none}), + 'season_number': ('season', {int_or_none}), + })) for episode in podcast_episodes], video_id, **metadata) + + entries = [self.url_result( + urljoin(url, episode['slug']), url_transparent=True, + **traverse_obj(season, { + 'season': 'title', + 'season_number': ('season', {int_or_none}), + }), **traverse_obj(episode, { + 'title': 'title', + 'thumbnail': ('image', 'cover', 'path'), + 'episode_number': ('chapter_number', {int_or_none}), + })) for season in seasons for episode in traverse_obj(season, ('contents', ...))] + + return self.playlist_result(entries, video_id, **metadata) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': category == 'en-vivo', + **metadata, + } + + +class RTVCPlayEmbedIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9', + 'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8', + 'info_dict': { + 'id': '72b0e699-248b-4929-a4a8-3782702fa7f9', + 'title': 'Tráiler: Señoritas', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid')) + metadata = {} if not asset_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('image', ..., 'thumbnail', 'path'), + }, get_all=False) + } + + +class RTVCKalturaIE(RTVCPlayBaseIE): + _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html', + 'info_dict': { + 'id': 'indexSC', + 'title': r're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId')) + metadata = {} if not channel_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': 
True, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('channel', 'image', 'logo', 'path'), + }) + } diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py new file mode 100644 index 0000000..a99a266 --- /dev/null +++ b/yt_dlp/extractor/rtve.py @@ -0,0 +1,344 @@ +import base64 +import io +import struct + +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + qualities, + remove_end, + remove_start, + try_get, +) + + +class RTVEALaCartaIE(InfoExtractor): + IE_NAME = 'rtve.es:alacarta' + IE_DESC = 'RTVE a la carta' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'info_dict': { + 'id': '2491869', + 'ext': 'mp4', + 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', + 'duration': 5024.566, + 'series': 'Balonmano', + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }, { + 'note': 'Live stream', + 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', + 'info_dict': { + 'id': '1694255', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', + }, + }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104', + 'duration': 3222.0, + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', + 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, + }] + + def _real_initialize(self): + user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8') + self._manager = self._download_json( + 'http://www.rtve.es/odin/loki/' + user_agent_b64, + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = struct.unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in url_data.decode('iso-8859-1'): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url 
information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + if info['state'] == 'DESPU': + raise ExtractorError('The video is no longer available', expected=True) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) + + subtitles = None + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('image'), + 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), + } + + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + + +class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'rtve.es:audio' + IE_DESC = 'RTVE audio' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', + 'md5': 'ae06d27bff945c4e87a50f89f6ce48ce', + 'info_dict': { + 'id': '5889192', + 'ext': 'mp3', + 'title': 'Códigos informáticos', + 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'duration': 349.440, + 'series': 'A hombros de gigantes', + }, + }, { + 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', + 'md5': '072855ab89a9450e0ba314c717fa5ebc', + 'info_dict': { + 'id': '5791165', + 'ext': 'mp3', + 'title': 'Ignatius Farray', + 'thumbnail': r're:https?://.+/1613243011863.jpg', + 'duration': 3559.559, + 'series': 'En Radio 3' + }, + }, { + 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', + 'md5': '0eadab248cc8dd193fa5765712e84d5c', + 'info_dict': { + 'id': '6082623', + 'ext': 'mp3', + 'title': 'Capítulo 26 y último: La muerte de Victor', + 'thumbnail': r're:https?://.+/1632147445707.jpg', + 'duration': 3174.086, + 'series': 'Frankenstein o el moderno Prometeo' + }, + }] + + def _extract_png_formats(self, audio_id): + """ + This function retrieves media related png thumbnail which obfuscate + valuable information about the media. 
This information is decrypted + via base class _decrypt_url function providing media quality and + media url + """ + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/audios/%s.png' % + (self._manager, audio_id), + audio_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, audio_url in self._decrypt_url(png): + ext = determine_ext(audio_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + audio_url, audio_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + audio_url, audio_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': audio_url, + }) + return formats + + def _real_extract(self, url): + audio_id = self._match_id(url) + info = self._download_json( + 'https://www.rtve.es/api/audios/%s.json' % audio_id, + audio_id)['page']['items'][0] + + return { + 'id': audio_id, + 'title': info['title'].strip(), + 'thumbnail': info.get('thumbnail'), + 'duration': float_or_none(info.get('duration'), 1000), + 'series': try_get(info, lambda x: x['programInfo']['title']), + 'formats': self._extract_png_formats(audio_id), + } + + +class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', + 'duration': 357.958, + }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], + }] + + +class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'rtve.es:live' + IE_DESC = 'RTVE.es live streams' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/directo/la-1/', + 'info_dict': { + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + 'skip_download': 'live stream', + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') + + vidplayer_id = self._search_regex( + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') + + return { + 'id': video_id, + 'title': title, + 'formats': self._extract_png_formats(vidplayer_id), + 'is_live': True, + } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + + _TEST = { + 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, 
page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/rtvs.py b/yt_dlp/extractor/rtvs.py new file mode 100644 index 0000000..a84a78d --- /dev/null +++ b/yt_dlp/extractor/rtvs.py @@ -0,0 +1,85 @@ +import re + +from .common import InfoExtractor + +from ..utils import ( + parse_duration, + traverse_obj, + unified_timestamp, +) + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P<id>\d+)/?(?:[#?]|$)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3', + 'duration': 2854, + 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0000/b1R8.rtvs.jpg', + 'display_id': '135331', + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.', + 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', + 'timestamp': 1428555900, + 'upload_date': '20150409', + 'duration': 4986, + } + }, { + # tv archive + 'url': 'https://www.rtvs.sk/televizia/archiv/18083?utm_source=web&utm_medium=rozcestnik&utm_campaign=Robin', + 'info_dict': { + 'id': '18083', + 'ext': 'mp4', + 'title': 'Robin', + 'description': 'md5:2f70505a7b8364491003d65ff7a0940a', + 'timestamp': 1636652760, + 'display_id': '307655', + 'duration': 831, + 'upload_date': '20211111', + 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + iframe_id = self._search_regex( + r'<iframe[^>]+id\s*=\s*"player_[^_]+_([0-9]+)"', webpage, 'Iframe ID') + iframe_url = self._search_regex( + fr'<iframe[^>]+id\s*=\s*"player_[^_]+_{re.escape(iframe_id)}"[^>]+src\s*=\s*"([^"]+)"', webpage, 'Iframe URL') + + webpage = self._download_webpage(iframe_url, video_id, 'Downloading iframe') + json_url = self._search_regex(r'var\s+url\s*=\s*"([^"]+)"\s*\+\s*ruurl', webpage, 'json URL') + data = self._download_json(f'https:{json_url}b=mozilla&p=win&v=97&f=0&d=1', video_id) + + if data.get('clip'): + data['playlist'] = [data['clip']] + + if traverse_obj(data, ('playlist', 0, 'sources', 0, 'type')) == 'audio/mp3': + formats = [{'url': traverse_obj(data, ('playlist', 0, 'sources', 0, 'src'))}] + else: + formats = self._extract_m3u8_formats(traverse_obj(data, ('playlist', 0, 'sources', 0, 'src')), video_id) + + return { + 'id': video_id, + 'display_id': iframe_id, + 'title': traverse_obj(data, ('playlist', 0, 'title')), + 'description': traverse_obj(data, ('playlist', 0, 'description')), + 'duration': parse_duration(traverse_obj(data, ('playlist', 0, 'length'))), + 'thumbnail': traverse_obj(data, ('playlist', 0, 'image')), + 'timestamp': unified_timestamp(traverse_obj(data, ('playlist', 0, 'datetime_create'))), + 'formats': formats + } diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py new file mode 100644 index 0000000..39ace7c --- /dev/null 
+++ b/yt_dlp/extractor/rtvslo.py @@ -0,0 +1,166 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class RTVSLOIE(InfoExtractor): + IE_NAME = 'rtvslo.si' + _VALID_URL = r'''(?x) + https?://(?: + (?:365|4d)\.rtvslo.si/arhiv/[^/?#&;]+| + (?:www\.)?rtvslo\.si/rtv365/arhiv + )/(?P<id>\d+)''' + _GEO_COUNTRIES = ['SI'] + + _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622' + SUB_LANGS_MAP = {'Slovenski': 'sl'} + + _TESTS = [ + { + 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', + 'info_dict': { + 'id': '174842550', + 'ext': 'mp4', + 'release_timestamp': 1643140032, + 'upload_date': '20220125', + 'series': 'Dnevnik', + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', + 'description': 'md5:76a18692757aeb8f0f51221106277dd2', + 'timestamp': 1643137046, + 'title': 'Dnevnik', + 'series_id': '92', + 'release_date': '20220125', + 'duration': 1789, + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', + 'info_dict': { + 'id': '174843754', + 'ext': 'mp4', + 'series_id': '94', + 'release_date': '20220129', + 'timestamp': 1643484455, + 'title': 'Utrip', + 'duration': 813, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', + 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', + 'release_timestamp': 1643485825, + 'upload_date': '20220129', + 'series': 'Utrip', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', + 'info_dict': { + 'id': '174844609', + 'ext': 'mp3', + 'series_id': '106615841', + 'title': 'Il giornale della sera', + 'duration': 1328, + 'series': 'Il giornale della sera', + 'timestamp': 1643743800, + 'release_timestamp': 1643745424, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', + 'upload_date': '20220201', + 'tbr': 128000, + 'release_date': '20220201', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', + 'info_dict': { + 'id': '148350750', + 'ext': 'mp4', + 'title': 'Prvi šolski dan, mozaična oddaja za mlade', + 'series': 'Razred zase', + 'series_id': '148185730', + 'duration': 1481, + 'upload_date': '20121019', + 'timestamp': 1350672122, + 'release_date': '20121019', + 'release_timestamp': 1350672122, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', + }, + }, { + 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', + 'only_matching': True + } + ] + + def _real_extract(self, url): + v_id = self._match_id(url) + meta = self._download_json(self._API_BASE.format('getRecordingDrm', v_id), v_id)['response'] + + thumbs = [{'id': k, 'url': v, 'http_headers': {'Accept': 'image/jpeg'}} + for k, v in (meta.get('images') or {}).items()] + + subs = {} + for s in traverse_obj(meta, 'subs', 'subtitles', default=[]): + lang = self.SUB_LANGS_MAP.get(s.get('language'), s.get('language') or 'und') + subs.setdefault(lang, []).append({ + 'url': s.get('file'), + 'ext': traverse_obj(s, 'format', expected_type=str.lower), + }) + + jwt = meta.get('jwt') + if not jwt: + raise ExtractorError('Site did not provide an authentication token, cannot proceed.') + + media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response'] + + formats = [] + skip_protocols = ['smil', 'f4m', 'dash'] + adaptive_url = traverse_obj(media, ('addaptiveMedia', 
'hls_sec'), expected_type=url_or_none) + if adaptive_url: + formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols) + + adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none) + if adaptive_url: + for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols): + formats.append({ + **f, + 'format_id': 'sign-' + f['format_id'], + 'format_note': 'Sign language interpretation', 'preference': -10, + 'language': ( + 'slv' if f.get('language') == 'eng' and f.get('acodec') != 'none' + else f.get('language')) + }) + + for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))): + formats.append(traverse_obj(mediafile, { + 'url': ('streams', 'https'), + 'ext': ('mediaType', {str.lower}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('filesize', {int_or_none}), + })) + + for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['hls_sec']))): + formats.extend(self._extract_wowza_formats( + mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols)) + + if any('intermission.mp4' in x['url'] for x in formats): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + if any('dummy_720p.mp4' in x.get('manifest_url', '') for x in formats) and meta.get('stub') == 'error': + raise ExtractorError(f'{self.IE_NAME} said: Clip not available', expected=True) + + return { + 'id': v_id, + 'webpage_url': ''.join(traverse_obj(meta, ('canonical', ('domain', 'path')))), + 'title': meta.get('title'), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': thumbs, + 'description': meta.get('description'), + 'timestamp': unified_timestamp(traverse_obj(meta, 'broadcastDate', ('broadcastDates', 0))), + 'release_timestamp': unified_timestamp(meta.get('recordingDate')), + 'duration': meta.get('duration') or parse_duration(meta.get('length')), + 'tags': meta.get('genre'), + 'series': meta.get('showName'), + 'series_id': meta.get('showId'), + } diff --git a/yt_dlp/extractor/rudovideo.py b/yt_dlp/extractor/rudovideo.py new file mode 100644 index 0000000..1b85955 --- /dev/null +++ b/yt_dlp/extractor/rudovideo.py @@ -0,0 +1,135 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + js_to_json, + traverse_obj, + update_url_query, + url_or_none, +) + + +class RudoVideoIE(InfoExtractor): + _VALID_URL = r'https?://rudo\.video/(?P<type>vod|podcast|live)/(?P<id>[^/?&#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)//rudo\.video/(?:vod|podcast|live)/[^\'"]+)'] + _TESTS = [{ + 'url': 'https://rudo.video/podcast/cz2wrUy8l0o', + 'md5': '28ed82b477708dc5e12e072da2449221', + 'info_dict': { + 'id': 'cz2wrUy8l0o', + 'title': 'Diego Cabot', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/bQkt07', + 'md5': '36b22a9863de0f47f00fc7532a32a898', + 'info_dict': { + 'id': 'bQkt07', + 'title': 'Tubular Bells', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/b42ZUznHX0', + 'md5': 'b91c70d832938871367f8ad10c895821', + 'info_dict': { + 'id': 'b42ZUznHX0', + 'title': 'Columna Ruperto Concha', + 'ext': 'mp3', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/vod/bN5AaJ', + 'md5': '01324a329227e2591530ecb4f555c881', + 'info_dict': 
{ + 'id': 'bN5AaJ', + 'title': 'Ucrania 19.03', + 'creator': 'La Tercera', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/bbtv', + 'info_dict': { + 'id': 'bbtv', + 'ext': 'mp4', + 'creator': 'BioBioTV', + 'live_status': 'is_live', + 'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/c13', + 'info_dict': { + 'id': 'c13', + 'title': 'CANAL13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }, { + 'url': 'https://rudo.video/live/t13-13cl', + 'info_dict': { + 'id': 't13-13cl', + 'title': 'T13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + is_live = type_ == 'live' + + webpage = self._download_webpage(url, video_id) + if 'Streaming is not available in your area' in webpage: + self.raise_geo_restricted() + + media_url = ( + self._search_regex( + r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None) + # Source URL must be used only if streamURL is unavailable + or self._search_regex( + r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None)) + if not media_url: + youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)', + webpage, 'youtube url', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + raise ExtractorError('Unable to extract stream url') + + token_array = self._search_json( + r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=', webpage, 'access token array', video_id, + contains_pattern=r'\[(?s:.+)\]', default=None, transform_source=js_to_json) + if token_array: + token_url = traverse_obj(token_array, (..., {url_or_none}), get_all=False) + if not token_url: + raise ExtractorError('Invalid access token array') + access_token = self._download_json( + token_url, video_id, note='Downloading access token')['data']['authToken'] + media_url = update_url_query(media_url, {'auth-token': access_token}) + + ext = determine_ext(media_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(media_url, video_id, live=is_live) + elif ext == 'mp3': + formats = [{ + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{'url': media_url}] + + return { + 'id': video_id, + 'title': (self._search_regex(r'var\s+titleVideo\s*=\s*[\'"]([^\'"]+)', + webpage, 'title', default=None) + or self._og_search_title(webpage)), + 'creator': self._search_regex(r'var\s+videoAuthor\s*=\s*[\'"]([^?\'"]+)', + webpage, 'videoAuthor', default=None), + 'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)', + webpage, 'thumbnail', default=None) + or self._og_search_thumbnail(webpage)), + 'formats': formats, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py new file mode 100644 index 0000000..11095b2 --- /dev/null +++ b/yt_dlp/extractor/rule34video.py @@ -0,0 +1,123 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_attribute, + get_element_by_class, + get_element_html_by_class, + get_elements_by_class, + int_or_none, + parse_count, + parse_duration, + unescapeHTML, +) +from ..utils.traversal import traverse_obj + + +class Rule34VideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos?/(?P<id>\d+)' + _TESTS = [ + { + 'url': 
'https://rule34video.com/video/3065157/shot-it-mmd-hmv/', + 'md5': 'ffccac2c23799dabbd192621ae4d04f3', + 'info_dict': { + 'id': '3065157', + 'ext': 'mp4', + 'title': 'Shot It-(mmd hmv)', + 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg', + 'duration': 347.0, + 'age_limit': 18, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'timestamp': 1639872000, + 'description': 'https://discord.gg/aBqPrHSHvv', + 'upload_date': '20211219', + 'uploader': 'Sweet HMV', + 'uploader_url': 'https://rule34video.com/members/22119/', + 'categories': ['3D', 'MMD', 'iwara'], + 'tags': 'mincount:10' + } + }, + { + 'url': 'https://rule34video.com/videos/3065296/lara-in-trouble-ep-7-wildeerstudio/', + 'md5': '6bb5169f9f6b38cd70882bf2e64f6b86', + 'info_dict': { + 'id': '3065296', + 'ext': 'mp4', + 'title': 'Lara in Trouble Ep. 7 [WildeerStudio]', + 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg', + 'duration': 938.0, + 'age_limit': 18, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'timestamp': 1640131200, + 'description': '', + 'creators': ['WildeerStudio'], + 'upload_date': '20211222', + 'uploader': 'CerZule', + 'uploader_url': 'https://rule34video.com/members/36281/', + 'categories': ['3D', 'Tomb Raider'], + 'tags': 'mincount:40' + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = [] + + for mobj in re.finditer(r'<a[^>]+href="(?P<video_url>[^"]+download=true[^"]+)".*>(?P<ext>[^\s]+) (?P<quality>[^<]+)p</a>', webpage): + url, ext, quality = mobj.groups() + formats.append({ + 'url': url, + 'ext': ext.lower(), + 'quality': quality, + }) + + categories, creators, uploader, uploader_url = [None] * 4 + for col in get_elements_by_class('col', webpage): + label = clean_html(get_element_by_class('label', col)) + if label == 'Categories:': + categories = list(map(clean_html, get_elements_by_class('item', col))) + elif label == 'Artist:': + creators = list(map(clean_html, get_elements_by_class('item', col))) + elif label == 'Uploaded By:': + uploader = clean_html(get_element_by_class('name', col)) + uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href') + + return { + **traverse_obj(self._search_json_ld(webpage, video_id, default={}), ({ + 'title': 'title', + 'view_count': 'view_count', + 'like_count': 'like_count', + 'duration': 'duration', + 'timestamp': 'timestamp', + 'description': 'description', + 'thumbnail': ('thumbnails', 0, 'url'), + })), + 'id': video_id, + 'formats': formats, + 'title': self._html_extract_title(webpage), + 'thumbnail': self._html_search_regex( + r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None), + 'duration': parse_duration(self._html_search_regex( + r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)), + 'view_count': int_or_none(self._html_search_regex( + r'"icon-eye"></i>\s+<span>([ \d]+)', webpage, 'views', default='').replace(' ', '')), + 'like_count': parse_count(get_element_by_class('voters count', webpage)), + 'comment_count': int_or_none(self._search_regex( + r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)), + 'age_limit': 18, + 'creators': creators, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'categories': categories, + 'tags': list(map(unescapeHTML, re.findall( + r'<a 
class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))), + } diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py new file mode 100644 index 0000000..837a324 --- /dev/null +++ b/yt_dlp/extractor/rumble.py @@ -0,0 +1,390 @@ +import itertools +import re + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + UnsupportedError, + clean_html, + determine_ext, + format_field, + get_element_by_class, + int_or_none, + join_nonempty, + parse_count, + parse_iso8601, + traverse_obj, + unescapeHTML, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + 'channel_url': 'https://rumble.com/c/WMAR', + 'channel': 'WMAR', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', + 'duration': 234, + 'uploader': 'WMAR', + 'live_status': 'not_live', + } + }, { + 'url': 'https://rumble.com/embed/vslb7v', + 'md5': '7418035de1a30a178b8af34dc2b6a52b', + 'info_dict': { + 'id': 'vslb7v', + 'ext': 'mp4', + 'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'', + 'timestamp': 1645142135, + 'upload_date': '20220217', + 'channel_url': 'https://rumble.com/c/CyberTechNews', + 'channel': 'CTNews', + 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'duration': 901, + 'uploader': 'CTNews', + 'live_status': 'not_live', + } + }, { + 'url': 'https://rumble.com/embed/vunh1h', + 'info_dict': { + 'id': 'vunh1h', + 'ext': 'mp4', + 'title': '‘Gideon, op zoek naar de waarheid’ including ENG SUBS', + 'timestamp': 1647197663, + 'upload_date': '20220313', + 'channel_url': 'https://rumble.com/user/BLCKBX', + 'channel': 'BLCKBX', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 5069, + 'uploader': 'BLCKBX', + 'live_status': 'not_live', + 'subtitles': { + 'en': [ + { + 'url': r're:https://.+\.vtt', + 'name': 'English', + 'ext': 'vtt' + } + ] + }, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rumble.com/embed/v1essrt', + 'info_dict': { + 'id': 'v1essrt', + 'ext': 'mp4', + 'title': 'startswith:lofi hip hop radio 📚 - beats to relax/study to', + 'timestamp': 1661519399, + 'upload_date': '20220826', + 'channel_url': 'https://rumble.com/c/LofiGirl', + 'channel': 'Lofi Girl', + 'thumbnail': r're:https://.+\.jpg', + 'uploader': 'Lofi Girl', + 'live_status': 'is_live', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rumble.com/embed/v1amumr', + 'info_dict': { + 'id': 'v1amumr', + 'ext': 'mp4', + 'fps': 60, + 'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live', + 'timestamp': 1658518457, + 'upload_date': '20220722', + 'channel_url': 'https://rumble.com/c/RumbleEvents', + 'channel': 'Rumble Events', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 16427, + 'uploader': 'Rumble Events', + 'live_status': 'was_live', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + _WEBPAGE_TESTS = [ + { + 'note': 'Rumble JS embed', + 'url': 
'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', + 'md5': '4701209ac99095592e73dbba21889690', + 'info_dict': { + 'id': 'v15eqxl', + 'ext': 'mp4', + 'channel': 'Mr Producer Media', + 'duration': 92, + 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh', + 'channel_url': 'https://rumble.com/c/RichSementa', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.qR4e-small-911-Audio-From-The-Man-Who-.jpg', + 'timestamp': 1654892716, + 'uploader': 'Mr Producer Media', + 'upload_date': '20220610', + 'live_status': 'not_live', + } + }, + ] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) + if embeds: + return embeds + return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( + r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{[^}]*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/u3/', video_id, + query={'request': 'video', 'ver': 2, 'v': video_id}) + + sys_msg = traverse_obj(video, ('sys', 'msg')) + if sys_msg: + self.report_warning(sys_msg, video_id=video_id) + + if video.get('live') == 0: + live_status = 'not_live' if video.get('livestream_has_dvr') is None else 'was_live' + elif video.get('live') == 1: + live_status = 'is_upcoming' if video.get('livestream_has_dvr') else 'was_live' + elif video.get('live') == 2: + live_status = 'is_live' + else: + live_status = None + + formats = [] + for ext, ext_info in (video.get('ua') or {}).items(): + if isinstance(ext_info, dict): + for height, video_info in ext_info.items(): + if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): + video_info.setdefault('meta', {})['h'] = height + ext_info = ext_info.values() + + for video_info in ext_info: + meta = video_info.get('meta') or {} + if not video_info.get('url'): + continue + if ext == 'hls': + if meta.get('live') is True and video.get('live') == 1: + live_status = 'post_live' + formats.extend(self._extract_m3u8_formats( + video_info['url'], video_id, + ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) + continue + timeline = ext == 'timeline' + if timeline: + ext = determine_ext(video_info['url']) + formats.append({ + 'ext': ext, + 'acodec': 'none' if timeline else None, + 'url': video_info['url'], + 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if timeline else None, + 'fps': None if timeline else video.get('fps'), + **traverse_obj(meta, { + 'tbr': 'bitrate', + 'filesize': 'size', + 'width': 'w', + 'height': 'h', + }, expected_type=lambda x: int(x) or None) + }) + + subtitles = { + lang: [{ + 'url': sub_info['path'], + 'name': sub_info.get('language') or '', + }] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path') + } + + author = video.get('author') or {} + thumbnails = traverse_obj(video, ('t', ..., {'url': 'i', 'width': 'w', 'height': 'h'})) + if not thumbnails and video.get('i'): + thumbnails = [{'url': video['i']}] + + if live_status in {'is_live', 'post_live'}: + duration = None + else: + duration = int_or_none(video.get('duration')) + + return { + 'id': video_id, + 'title': unescapeHTML(video.get('title')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': 
author.get('name'), + 'channel_url': author.get('url'), + 'duration': duration, + 'uploader': author.get('name'), + 'live_status': live_status, + } + + +class RumbleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$' + _EMBED_REGEX = [ + r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>', + r'<a[^>]+class="videostream__link link"[^>]+href=(?P<url>/v[\w.-]+\.html)[^>]*>'] + _TESTS = [{ + 'add_ie': ['RumbleEmbed'], + 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 'description': 'Moose the dog is more than happy to help with digging out this epic snow fort. Great job, Moose!', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 103, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'live_status': 'not_live', + } + }, { + 'url': 'http://www.rumble.com/vDMUM1?key=value', + 'only_matching': True, + }, { + 'note': 'timeline format', + 'url': 'https://rumble.com/v2ea9qb-the-u.s.-cannot-hide-this-in-ukraine-anymore-redacted-with-natali-and-clayt.html', + 'md5': '40d61fec6c0945bca3d0e1dc1aa53d79', + 'params': {'format': 'wv'}, + 'info_dict': { + 'id': 'v2bou5f', + 'ext': 'mp4', + 'uploader': 'Redacted News', + 'upload_date': '20230322', + 'timestamp': 1679445010, + 'title': 'The U.S. CANNOT hide this in Ukraine anymore | Redacted with Natali and Clayton Morris', + 'duration': 892, + 'channel': 'Redacted News', + 'description': 'md5:aaad0c5c3426d7a361c29bdaaced7c42', + 'channel_url': 'https://rumble.com/c/Redacted', + 'live_status': 'not_live', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + }, + }, { + 'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html', + 'info_dict': { + 'id': 'v2blzyy', + 'ext': 'mp4', + 'live_status': 'was_live', + 'release_timestamp': 1679446804, + 'description': 'md5:2ac4908ccfecfb921f8ffa4b30c1e636', + 'release_date': '20230322', + 'timestamp': 1679445692, + 'duration': 4435, + 'upload_date': '20230322', + 'title': 'The Covid Twitter Files Drop: Protecting Fauci While Censoring The Truth w/Matt Taibbi', + 'uploader': 'Kim Iversen', + 'channel_url': 'https://rumble.com/c/KimIversen', + 'channel': 'Kim Iversen', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + }, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://rumble.com/videos?page=2', + 'playlist_mincount': 24, + 'info_dict': { + 'id': 'videos?page=2', + 'title': 'All videos', + 'description': 'Browse videos uploaded to Rumble.com', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/browse/live', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'live', + 'title': 'Browse', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/search/video?q=rumble&sort=views', + 'playlist_mincount': 24, + 'info_dict': { + 'id': 'video?q=rumble&sort=views', + 'title': 'Search results for: rumble', + 'age_limit': 0, + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = 
self._download_webpage(url, page_id) + url_info = next(RumbleEmbedIE.extract_from_webpage(self._downloader, url, webpage), None) + if not url_info: + raise UnsupportedError(url) + + return { + '_type': 'url_transparent', + 'ie_key': url_info['ie_key'], + 'url': url_info['url'], + 'release_timestamp': parse_iso8601(self._search_regex( + r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', webpage, 'release date', default=None)), + 'view_count': int_or_none(self._search_regex( + r'"userInteractionCount"\s*:\s*(\d+)', webpage, 'view count', default=None)), + 'like_count': parse_count(self._search_regex( + r'<span data-js="rumbles_up_votes">\s*([\d,.KM]+)', webpage, 'like count', default=None)), + 'dislike_count': parse_count(self._search_regex( + r'<span data-js="rumbles_down_votes">\s*([\d,.KM]+)', webpage, 'dislike count', default=None)), + 'description': clean_html(get_element_by_class('media-description', webpage)) + } + + +class RumbleChannelIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))' + + _TESTS = [{ + 'url': 'https://rumble.com/c/Styxhexenhammer666', + 'playlist_mincount': 1160, + 'info_dict': { + 'id': 'Styxhexenhammer666', + }, + }, { + 'url': 'https://rumble.com/user/goldenpoodleharleyeuna', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'goldenpoodleharleyeuna', + }, + }] + + def entries(self, url, playlist_id): + for page in itertools.count(1): + try: + webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: + break + raise + for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): + yield self.url_result('https://rumble.com' + video_url) + + def _real_extract(self, url): + url, playlist_id = self._match_valid_url(url).groups() + return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id) diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py new file mode 100644 index 0000000..287824d --- /dev/null +++ b/yt_dlp/extractor/rutube.py @@ -0,0 +1,365 @@ +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + determine_ext, + bool_or_none, + int_or_none, + parse_qs, + try_get, + unified_timestamp, + url_or_none, +) + + +class RutubeBaseIE(InfoExtractor): + def _download_api_info(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/video/%s/' % video_id, + video_id, 'Downloading video JSON', + 'Unable to download video JSON', query=query) + + def _extract_info(self, video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + description = video.get('description') + duration = int_or_none(video.get('duration')) + + return { + 'id': video.get('id') or video_id if video_id else video['id'], + 'title': title, + 'description': description, + 'thumbnail': video.get('thumbnail_url'), + 'duration': duration, + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': 
unified_timestamp(video.get('created_ts')), + 'categories': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': int_or_none(video.get('comments_count')), + 'is_live': bool_or_none(video.get('is_livestream')), + 'chapters': self._extract_chapters_from_description(description, duration), + } + + def _download_and_extract_info(self, video_id, query=None): + return self._extract_info( + self._download_api_info(video_id, query=query), video_id) + + def _download_api_options(self, video_id, query=None): + if not query: + query = {} + query['format'] = 'json' + return self._download_json( + 'http://rutube.ru/api/play/options/%s/' % video_id, + video_id, 'Downloading options JSON', + 'Unable to download options JSON', + headers=self.geo_verification_headers(), query=query) + + def _extract_formats(self, options, video_id): + formats = [] + for format_id, format_url in options['video_balancer'].items(): + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + return formats + + def _download_and_extract_formats(self, video_id, query=None): + return self._extract_formats( + self._download_api_options(video_id, query=query), video_id) + + +class RutubeIE(RutubeBaseIE): + IE_NAME = 'rutube' + IE_DESC = 'Rutube videos' + _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] + + _TESTS = [{ + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': 'e33ac625efca66aba86cbec9851f2692', + 'info_dict': { + 'id': '3eac3b4561676c17df9132a9a1e62e3e', + 'ext': 'mp4', + 'title': 'Раненный кенгуру забежал в аптеку', + 'description': 'http://www.ntdtv.ru ', + 'duration': 81, + 'uploader': 'NTDRussian', + 'uploader_id': '29790', + 'timestamp': 1381943602, + 'upload_date': '20131016', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', + 'categories': ['Новости и СМИ'], + 'chapters': [], + }, + 'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg', + 'md5': 'd106225f15d625538fe22971158e896f', + 'info_dict': { + 'id': '884fb55f07a97ab673c7d654553e0f48', + 'ext': 'mp4', + 'title': 'Яцуноками, Nioh2', + 'description': 'Nioh2: финал сражения с боссом Яцуноками', + 'duration': 15, + 'uploader': 'mexus', + 'uploader_id': '24222106', + 'timestamp': 1670646232, + 'upload_date': '20221210', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', + 'categories': ['Видеоигры'], + 'chapters': [], + }, + 
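# A note on RutubeBaseIE._extract_formats above: it fans out on the URL's
# apparent container -- HLS manifests go through the m3u8 helper, HDS manifests
# through f4m, and anything else is kept as a plain HTTP format. A toy,
# stdlib-only dispatcher with the same shape (yt-dlp's determine_ext is more
# forgiving than this splitext-based stand-in):
import posixpath
from urllib.parse import urlparse

def classify(format_url):
    ext = posixpath.splitext(urlparse(format_url).path)[1].lstrip('.')
    return {'m3u8': 'hls', 'f4m': 'hds'}.get(ext, 'http')

assert classify('https://example.invalid/play/index.m3u8') == 'hls'
assert classify('https://example.invalid/manifest.f4m') == 'hds'
assert classify('https://example.invalid/video.mp4') == 'http'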
'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/', + 'info_dict': { + 'id': 'c65b465ad0c98c89f3b25cb03dcc87c6', + 'ext': 'mp4', + 'chapters': 'count:4', + 'categories': ['Бизнес и предпринимательство'], + 'description': 'md5:252feac1305257d8c1bab215cedde75d', + 'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', + 'duration': 782, + 'age_limit': 0, + 'uploader_id': '23491359', + 'timestamp': 1677153329, + 'view_count': int, + 'upload_date': '20230223', + 'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании', + 'uploader': 'Стас Быков', + }, + 'expected_warnings': ['Unable to download f4m'], + }] + + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + query = parse_qs(url) + info = self._download_and_extract_info(video_id, query) + info['formats'] = self._download_and_extract_formats(video_id, query) + return info + + +class RutubeEmbedIE(RutubeBaseIE): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'timestamp': 1387830582, + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://rutube.ru/play/embed/8083783', + 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + # Query may contain private videos token and should be passed to API + # requests (see #19163) + query = parse_qs(url) + options = self._download_api_options(embed_id, query) + video_id = options['effective_video'] + formats = self._extract_formats(options, video_id) + info = self._download_and_extract_info(video_id, query) + info.update({ + 'extractor_key': 'Rutube', + 'formats': formats, + }) + return info + + +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = url_or_none(result.get('video_url')) + if not video_url: + continue + entry = self._extract_info(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( 
+ self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeTagsIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:tags' + IE_DESC = 'Rutube tags' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/tags/video/1800/', + 'info_dict': { + 'id': '1800', + }, + 'playlist_mincount': 68, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + + +class RutubeMovieIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:movie' + IE_DESC = 'Rutube movies' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' + + _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + + def _real_extract(self, url): + movie_id = self._match_id(url) + movie = self._download_json( + self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) + + +class RutubePersonIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:person' + IE_DESC = 'Rutube person videos' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/video/person/313878/', + 'info_dict': { + 'id': '313878', + }, + 'playlist_mincount': 37, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + from ..utils import int_or_none, parse_qs + + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = parse_qs(url) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = parse_qs(url) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channel' + _VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos' + _TESTS = [{ + 'url': 'https://rutube.ru/channel/639184/videos/', + 'info_dict': { + 'id': '639184', + }, + 'playlist_mincount': 133, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py new file mode 100644 index 0000000..d7f9a73 --- /dev/null +++ b/yt_dlp/extractor/rutv.py @@ -0,0 +1,203 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int +) + + +class RUTVIE(InfoExtractor): + IE_DESC = 'RUTV.RU' + _VALID_URL = r'''(?x) + https?:// + (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ + (?P<path> + 
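# RutubePlaylistIE.suitable above only claims URLs whose query string carries
# both a pl_type value and a numeric pl_id; everything else falls through to
# RutubeIE. The same predicate restated with the stdlib (yt-dlp's parse_qs is,
# to my understanding, a thin wrapper over urllib's):
from urllib.parse import parse_qs, urlparse

def is_rutube_playlist(url):
    params = parse_qs(urlparse(url).query)
    pl_type = params.get('pl_type', [None])[0]
    pl_id = params.get('pl_id', [None])[0]
    return bool(pl_type) and (pl_id or '').isdigit()

assert is_rutube_playlist('https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag')
assert not is_rutube_playlist('https://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661')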
flash\d+v/container\.swf\?id=| + iframe/(?P<type>swf|video|live)/id/| + index/iframe/cast_id/ + ) + (?P<id>\d+) + ''' + _EMBED_URLS = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + ] + + _TESTS = [ + { + 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724', + 'info_dict': { + 'id': '774471', + 'ext': 'mp4', + 'title': 'Монологи на все времена', + 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5', + 'duration': 2906, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638', + 'info_dict': { + 'id': '774016', + 'ext': 'mp4', + 'title': 'Чужой в семье Сталина', + 'description': '', + 'duration': 2539, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000', + 'info_dict': { + 'id': '766888', + 'ext': 'mp4', + 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', + 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', + 'duration': 279, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169', + 'info_dict': { + 'id': '771852', + 'ext': 'mp4', + 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет', + 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8', + 'duration': 3096, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014', + 'info_dict': { + 'id': '51499', + 'ext': 'flv', + 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', + 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', + }, + 'skip': 'Translation has finished', + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', + 'info_dict': { + 'id': '21', + 'ext': 'mp4', + 'title': 're:^Россия 24. 
Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + video_path = mobj.group('path') + + if re.match(r'flash\d+v', video_path): + video_type = 'video' + elif video_path.startswith('iframe'): + video_type = mobj.group('type') + if video_type == 'swf': + video_type = 'video' + elif video_path.startswith('index/iframe/cast_id'): + video_type = 'live' + + is_live = video_type == 'live' + + json_data = self._download_json( + 'http://player.vgtrk.com/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), + video_id, 'Downloading JSON') + + if json_data['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True) + + playlist = json_data['data']['playlist'] + medialist = playlist['medialist'] + media = medialist[0] + + if media['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) + + view_count = int_or_none(playlist.get('count_views')) + priority_transport = playlist['priority_transport'] + + thumbnail = media['picture'] + width = int_or_none(media['width']) + height = int_or_none(media['height']) + description = media['anons'] + title = media['title'] + duration = int_or_none(media.get('duration')) + + formats = [] + subtitles = {} + + for transport, links in media['sources'].items(): + for quality, url in links.items(): + preference = -1 if priority_transport == transport else -2 + if transport == 'rtmp': + mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url) + if not mobj: + continue + fmt = { + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': 'http://player.rutv.ru', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', + 'rtmp_live': True, + 'ext': 'flv', + 'vbr': str_to_int(quality), + } + elif transport == 'm3u8': + fmt, subs = self._extract_m3u8_formats_and_subtitles( + url, video_id, 'mp4', quality=preference, m3u8_id='hls') + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + continue + else: + fmt = { + 'url': url + } + fmt.update({ + 'width': int_or_none(quality, default=height, invscale=width, scale=height), + 'height': int_or_none(quality, default=height), + 'format_id': '%s-%s' % (transport, quality), + 'source_preference': preference, + }) + formats.append(fmt) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + '_format_sort_fields': ('source', ), + } diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py new file mode 100644 index 0000000..33f6652 --- /dev/null +++ b/yt_dlp/extractor/ruutu.py @@ -0,0 +1,262 @@ +import json +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + ExtractorError, + find_xpath_attr, + int_or_none, + traverse_obj, + try_call, + unified_strdate, + url_or_none, + xpath_attr, + xpath_text, +) + + +class RuutuIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| + 
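# The width/height arithmetic in the RUTV format loop above leans on yt-dlp's
# int_or_none(v, scale=1, default=None, invscale=1), which computes
# int(v) * invscale // scale and falls back to `default` when v is unusable.
# A self-contained restatement with a worked example (quality is the variant's
# vertical resolution; width/height describe the source video):
def scaled_width(quality, width, height):
    try:
        return int(quality) * width // height  # preserve the source aspect ratio
    except (TypeError, ValueError):
        return height  # mirrors the default=height fallback above

# A 1280x720 source offered at quality "480" maps to an 853x480 variant:
assert scaled_width('480', 1280, 720) == 853
assert scaled_width(None, 1280, 720) == 720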
static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= + ) + (?P<id>\d+) + ''' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/video/2058907', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': '2058907', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, + 'upload_date': '20150508', + }, + }, + { + 'url': 'http://www.ruutu.fi/video/2057306', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': '2057306', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, + 'upload_date': '20150507', + 'series': 'Superpesis', + 'categories': ['Urheilu'], + }, + }, + { + 'url': 'http://www.supla.fi/supla/2231370', + 'md5': 'df14e782d49a2c0df03d3be2a54ef949', + 'info_dict': { + 'id': '2231370', + 'ext': 'mp4', + 'title': 'Osa 1: Mikael Jungner', + 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + 'upload_date': '20151012', + 'series': 'Läpivalaisu', + }, + }, + # Episode where <SourceFile> is "NOT-USED", but has other + # downloadable sources available. + { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, + { + # audio podcast + 'url': 'https://www.supla.fi/supla/3382410', + 'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', + 'info_dict': { + 'id': '3382410', + 'ext': 'mp3', + 'title': 'Mikä ihmeen poltergeist?', + 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + 'upload_date': '20190320', + 'series': 'Mysteeritarinat', + 'duration': 1324, + }, + 'expected_warnings': [ + 'HTTP Error 502: Bad Gateway', + 'Failed to download m3u8 information', + ], + }, + { + 'url': 'http://www.supla.fi/audio/2231370', + 'only_matching': True, + }, + { + 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', + 'only_matching': True, + }, + { + # episode + 'url': 'https://www.ruutu.fi/video/3401964', + 'info_dict': { + 'id': '3401964', + 'ext': 'mp4', + 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', + 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2582, + 'age_limit': 12, + 'upload_date': '20190508', + 'series': 'Temptation Island Suomi', + 'season_number': 5, + 'episode_number': 17, + 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'], + }, + 'params': { + 'skip_download': True, + }, + }, + { + # premium + 'url': 'https://www.ruutu.fi/video/3618715', + 'only_matching': True, + }, + ] + _API_BASE = 'https://gatling.nelonenmedia.fi' + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # nelonen.fi + settings = try_call( + lambda: json.loads(re.search( + r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) + if settings: + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] + # hs.fi and is.fi + settings = try_call( + lambda: json.loads(re.search( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage).group(1), strict=False)) + if settings: + 
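# Both embed branches in _extract_embed_urls here follow the same recipe: find
# a JSON blob in the page (jQuery-extended Drupal.settings on nelonen.fi, the
# __NEXT_DATA__ script on hs.fi/is.fi), then read the Ruutu video id from a
# known path. A stdlib miniature of the __NEXT_DATA__ branch; the sample HTML
# below is fabricated for illustration.
import json
import re

def next_data_video_urls(webpage):
    m = re.search(
        r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', webpage)
    if not m:
        return []
    page = json.loads(m.group(1)).get('props', {}).get('pageProps', {}).get('page', {})
    source_id = page.get('assetData', {}).get('mainVideo', {}).get('sourceId')
    return [f'http://www.ruutu.fi/video/{source_id}'] if source_id else []

html = ('<script id="__NEXT_DATA__" type="application/json">'
        '{"props":{"pageProps":{"page":{"assetData":{"mainVideo":{"sourceId":"3618790"}}}}}}'
        '</script>')
assert next_data_video_urls(html) == ['http://www.ruutu.fi/video/3618790']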
video_ids = set(traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'splitBody', ..., 'video', 'sourceId')) or []) + if video_ids: + return [f'http://www.ruutu.fi/video/{v}' for v in video_ids] + video_id = traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'mainVideo', 'sourceId')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_xml = self._download_xml( + '%s/media-xml-cache' % self._API_BASE, video_id, + query={'id': video_id}) + + formats = [] + processed_urls = [] + + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if (not video_url or video_url in processed_urls + or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): + continue + processed_urls.append(video_url) + ext = determine_ext(video_url) + auth_video_url = url_or_none(self._download_webpage( + '%s/auth/access/v2' % self._API_BASE, video_id, + note='Downloading authenticated %s stream URL' % ext, + fatal=False, query={'stream': video_url})) + if auth_video_url: + processed_urls.append(auth_video_url) + video_url = auth_video_url + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mpd': + # video-only and audio-only streams are of different + # duration resulting in out of sync issue + continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp3' or child.tag == 'AudioMediaFile': + formats.append({ + 'format_id': 'audio', + 'url': video_url, + 'vcodec': 'none', + }) + else: + proto = compat_urllib_parse_urlparse(video_url).scheme + if not child.tag.startswith('HTTP') and proto != 'rtmp': + continue + preference = -1 if proto == 'rtmp' else 1 + label = child.get('label') + tbr = int_or_none(child.get('bitrate')) + format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto + if not self._is_valid_url(video_url, video_id, format_id): + continue + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + extract_formats(video_xml.find('./Clip')) + + def pv(name): + value = try_call(lambda: find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name).get('value')) + if value != 'NA': + return value or None + + if not formats: + if (not self.get_param('allow_unplayable_formats') + and xpath_text(video_xml, './Clip/DRM', default=None)): + self.report_drm(video_id) + ns_st_cds = pv('ns_st_cds') + if ns_st_cds != 'free': + raise ExtractorError('This video is %s.' 
% ns_st_cds, expected=True) + + themes = pv('themes') + + return { + 'id': video_id, + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'upload_date': unified_strdate(pv('date_start')), + 'series': pv('series_name'), + 'season_number': int_or_none(pv('season_number')), + 'episode_number': int_or_none(pv('episode_number')), + 'categories': themes.split(',') if themes else None, + 'formats': formats, + } diff --git a/yt_dlp/extractor/ruv.py b/yt_dlp/extractor/ruv.py new file mode 100644 index 0000000..12499d6 --- /dev/null +++ b/yt_dlp/extractor/ruv.py @@ -0,0 +1,186 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_duration, + traverse_obj, + unified_timestamp, +) + + +class RuvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)' + _TESTS = [{ + # m3u8 + 'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516', + 'md5': '66347652f4e13e71936817102acc1724', + 'info_dict': { + 'id': '1144499', + 'display_id': 'fh-valur/20170516', + 'ext': 'mp4', + 'title': 'FH - Valur', + 'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', + 'timestamp': 1494963600, + 'upload_date': '20170516', + }, + }, { + # mp3 + 'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', + 'md5': '395ea250c8a13e5fdb39d4670ef85378', + 'info_dict': { + 'id': '1153630', + 'display_id': 'morgunutvarpid/20170619', + 'ext': 'mp3', + 'title': 'Morgunútvarpið', + 'description': 'md5:a4cf1202c0a1645ca096b06525915418', + 'timestamp': 1497855000, + 'upload_date': '20170619', + }, + }, { + 'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', + 'only_matching': True, + }, { + 'url': 'http://www.ruv.is/node/1151854', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1' + + media_url = self._html_search_regex( + FIELD_RE % 'src', webpage, 'video URL', group='url') + + video_id = self._search_regex( + r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', + webpage, 'video id', default=display_id) + + ext = determine_ext(media_url) + + if ext == 'm3u8': + formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + elif ext == 'mp3': + formats = [{ + 'format_id': 'mp3', + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{ + 'url': media_url, + }] + + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 
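# RuvIE above lifts the media URL and poster straight from the player's
# JavaScript assignments (video.src = "...", video.poster = "..."). The field
# regex in isolation, run against a fabricated page snippet:
import re

FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'

def player_field(webpage, name):
    m = re.search(FIELD_RE % name, webpage)
    return m.group('url') if m else None

page = 'video.src = "https://example.invalid/stream.m3u8"; video.poster = "/p.jpg";'
assert player_field(page, 'src') == 'https://example.invalid/stream.m3u8'
assert player_field(page, 'poster') == '/p.jpg'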
'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } + + +class RuvSpilaIE(InfoExtractor): + IE_NAME = 'ruv.is:spila' + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:(?:sjon|ut)varp|(?:krakka|ung)ruv)/spila/.+/(?P<series_id>[0-9]+)/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://www.ruv.is/sjonvarp/spila/ithrottir/30657/9jcnd4', + 'info_dict': { + 'id': '9jcnd4', + 'ext': 'mp4', + 'title': '01.02.2022', + 'chapters': 'count:4', + 'timestamp': 1643743500, + 'upload_date': '20220201', + 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/94boog-iti3jg.jpg', + 'description': 'Íþróttafréttir.', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.ruv.is/utvarp/spila/i-ljosi-sogunnar/23795/7hqkre', + 'info_dict': { + 'id': '7hqkre', + 'ext': 'mp3', + 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/7hqkre-7uepao.jpg', + 'description': 'md5:8d7046549daff35e9a3190dc9901a120', + 'chapters': [], + 'upload_date': '20220204', + 'timestamp': 1643965500, + 'title': 'Nellie Bly II', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.ruv.is/ungruv/spila/ungruv/28046/8beuph', + 'only_matching': True + }, { + 'url': 'https://www.ruv.is/krakkaruv/spila/krakkafrettir/30712/9jbgb0', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id, series_id = self._match_valid_url(url).group('id', 'series_id') + program = self._download_json( + 'https://www.ruv.is/gql/', display_id, query={'query': '''{ + Program(id: %s){ + title image description short_description + episodes(id: {value: "%s"}) { + rating title duration file image firstrun description + clips { + time text + } + subtitles { + name value + } + } + } + }''' % (series_id, display_id)})['data']['Program'] + episode = program['episodes'][0] + + subs = {} + for trk in episode.get('subtitles'): + if trk.get('name') and trk.get('value'): + subs.setdefault(trk['name'], []).append({'url': trk['value'], 'ext': 'vtt'}) + + media_url = episode['file'] + if determine_ext(media_url) == 'm3u8': + formats = self._extract_m3u8_formats(media_url, display_id) + else: + formats = [{'url': media_url}] + + clips = [ + {'start_time': parse_duration(c.get('time')), 'title': c.get('text')} + for c in episode.get('clips') or []] + + return { + 'id': display_id, + 'title': traverse_obj(program, ('episodes', 0, 'title'), 'title'), + 'description': traverse_obj( + program, ('episodes', 0, 'description'), 'description', 'short_description', + expected_type=lambda x: x or None), + 'subtitles': subs, + 'thumbnail': episode.get('image', '').replace('$$IMAGESIZE$$', '1960') or None, + 'timestamp': unified_timestamp(episode.get('firstrun')), + 'formats': formats, + 'age_limit': episode.get('rating'), + 'chapters': clips + } diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py new file mode 100644 index 0000000..67eff72 --- /dev/null +++ b/yt_dlp/extractor/s4c.py @@ -0,0 +1,103 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, url_or_none + + +class S4CIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/programme/861362209', + 'info_dict': { + 'id': '861362209', + 'ext': 'mp4', + 'title': 'Y Swn', + 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', + 'duration': 5340, + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg' + }, + 
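# RuvSpilaIE above asks RUV's GraphQL endpoint for everything in one query,
# splicing the numeric series id and the episode's display id into the query
# text, which the extractor then sends as a `query` URL parameter. A sketch of
# assembling that parameter (the ids are examples from the tests above):
def ruv_gql_query(series_id, display_id):
    return '''{
      Program(id: %s){
        title image description short_description
        episodes(id: {value: "%s"}) {
          rating title duration file image firstrun description
          clips { time text }
          subtitles { name value }
        }
      }
    }''' % (series_id, display_id)

params = {'query': ruv_gql_query(30657, '9jcnd4')}  # as passed via _download_json(query=...)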
}, { + 'url': 'https://www.s4c.cymru/clic/programme/856636948', + 'info_dict': { + 'id': '856636948', + 'ext': 'mp4', + 'title': 'Am Dro', + 'duration': 2880, + 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg' + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + details = self._download_json( + f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', + video_id, fatal=False) + + player_config = self._download_json( + 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ + 'programme_id': video_id, + 'signed': '0', + 'lang': 'en', + 'mode': 'od', + 'appId': 'clic', + 'streamName': '', + }, note='Downloading player config JSON') + subtitles = {} + for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))): + subtitles.setdefault(sub.get('3', 'en'), []).append({ + 'url': sub['0'], + 'name': sub.get('1'), + }) + m3u8_url = self._download_json( + 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ + 'mode': 'od', + 'application': 'clic', + 'region': 'WW', + 'extra': 'false', + 'thirdParty': 'false', + 'filename': player_config['filename'], + }, note='Downloading streaming urls JSON')['hls'] + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'), + 'subtitles': subtitles, + 'thumbnail': url_or_none(player_config.get('poster')), + **traverse_obj(details, ('full_prog_details', 0, { + 'title': (('programme_title', 'series_title'), {str}), + 'description': ('full_billing', {str.strip}), + 'duration': ('duration', {lambda x: int(x) * 60}), + }), get_all=False), + } + + +class S4CSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/series/864982911', + 'playlist_mincount': 6, + 'info_dict': { + 'id': '864982911', + 'title': 'Iaith ar Daith', + }, + }, { + 'url': 'https://www.s4c.cymru/clic/series/866852587', + 'playlist_mincount': 8, + 'info_dict': { + 'id': '866852587', + 'title': 'FFIT Cymru', + }, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + series_details = self._download_json( + 'https://www.s4c.cymru/df/series_details', series_id, query={ + 'lang': 'e', + 'series_id': series_id, + 'show_prog_in_series': 'Y' + }, note='Downloading series details JSON') + + return self.playlist_result( + [self.url_result(f'https://www.s4c.cymru/clic/programme/{episode_id}', S4CIE, episode_id) + for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id'))], + series_id, traverse_obj(series_details, ('full_prog_details', 0, 'series_title', {str}))) diff --git a/yt_dlp/extractor/safari.py b/yt_dlp/extractor/safari.py new file mode 100644 index 0000000..8d322d7 --- /dev/null +++ b/yt_dlp/extractor/safari.py @@ -0,0 +1,259 @@ +import json +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + update_url_query, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://learning.oreilly.com/api/v1' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _perform_login(self, username, password): + _, urlh = self._download_webpage_handle( + 'https://learning.oreilly.com/accounts/login-check/', None, + 'Downloading 
login page') + + def is_logged(urlh): + return 'learning.oreilly.com/home/' in urlh.url + + if is_logged(urlh): + self.LOGGED_IN = True + return + + redirect_url = urlh.url + parsed_url = compat_urlparse.urlparse(redirect_url) + qs = compat_parse_qs(parsed_url.query) + next_uri = compat_urlparse.urljoin( + 'https://api.oreilly.com', qs['next'][0]) + + auth, urlh = self._download_json_handle( + 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', + data=json.dumps({ + 'email': username, + 'password': password, + 'redirect_uri': next_uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': redirect_url, + }, expected_status=400) + + credentials = auth.get('credentials') + if (not auth.get('logged_in') and not auth.get('redirect_uri') + and credentials): + raise ExtractorError( + 'Unable to login: %s' % credentials, expected=True) + + # oreilly serves two same instances of the following cookies + # in Set-Cookie header and expects first one to be actually set + for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): + self._apply_first_set_cookie_header(urlh, cookie) + + _, urlh = self._download_webpage_handle( + auth.get('redirect_uri') or next_uri, None, 'Completing login',) + + if is_logged(urlh): + self.LOGGED_IN = True + return + + raise ExtractorError('Unable to log in') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+) + ) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': 'dcc5a425e79f2564148652616af1f2a3', + 'info_dict': { + 'id': '0_qbqx90ic', + 'ext': 'mp4', + 'title': 'Introduction to Hadoop Fundamentals LiveLessons', + 'timestamp': 1437758058, + 'upload_date': '20150724', + 'uploader_id': 'stork', + }, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', + 'only_matching': True, + }] + + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.url) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + 
r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') + + query = { + 'wid': '_%s' % partner_id, + 'uiconf_id': ui_id, + 'flashvars[referenceId]': reference_id, + } + + if self.LOGGED_IN: + kaltura_session = self._download_json( + '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), + video_id, 'Downloading kaltura session JSON', + 'Unable to download kaltura session JSON', fatal=False, + headers={'Accept': 'application/json'}) + if kaltura_session: + session = kaltura_session.get('session') + if session: + query['flashvars[ks]'] = session + + return self.url_result(update_url_query( + 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), + 'Kaltura') + + +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + web_url = part['web_url'] + if 'library/view' in web_url: + web_url = web_url.replace('library/view', 'videos') + natural_keys = part['natural_key'] + web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}' + return self.url_result(web_url, SafariIE.ie_key()) + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| + techbus\.safaribooksonline\.com + ) + /(?P<id>[^/]+) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }, { + 'url': 'http://techbus.safaribooksonline.com/9780134426365', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course 
JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, SafariApiIE.ie_key()) + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/yt_dlp/extractor/saitosan.py b/yt_dlp/extractor/saitosan.py new file mode 100644 index 0000000..a5f05e1 --- /dev/null +++ b/yt_dlp/extractor/saitosan.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, try_get + + +class SaitosanIE(InfoExtractor): + _WORKING = False + IE_NAME = 'Saitosan' + _VALID_URL = r'https?://(?:www\.)?saitosan\.net/bview.html\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.saitosan.net/bview.html?id=10031846', + 'info_dict': { + 'id': '10031846', + 'ext': 'mp4', + 'title': '井下原 和弥', + 'uploader': '井下原 和弥', + 'thumbnail': 'http://111.171.196.85:8088/921f916f-7f55-4c97-b92e-5d9d0fef8f5f/thumb', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Broadcasts are ephemeral', + }, + { + 'url': 'http://www.saitosan.net/bview.html?id=10031795', + 'info_dict': { + 'id': '10031795', + 'ext': 'mp4', + 'title': '橋本', + 'uploader': '橋本', + 'thumbnail': 'http://111.171.196.85:8088/1a3933e1-a01a-483b-8931-af15f37f8082/thumb', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Broadcasts are ephemeral', + }] + + def _real_extract(self, url): + b_id = self._match_id(url) + + base = 'http://hankachi.saitosan-api.net:8002/socket.io/?transport=polling&EIO=3' + sid = self._download_socket_json(base, b_id, note='Opening socket').get('sid') + base += '&sid=' + sid + + self._download_webpage(base, b_id, note='Polling socket') + payload = '420["room_start_join",{"room_id":"%s"}]' % b_id + payload = '%s:%s' % (len(payload), payload) + + self._download_webpage(base, b_id, data=payload, note='Polling socket with payload') + response = self._download_socket_json(base, b_id, note='Polling socket') + if not response.get('ok'): + err = response.get('error') or {} + raise ExtractorError( + '%s said: %s - %s' % (self.IE_NAME, err.get('code', '?'), err.get('msg', 'Unknown')) if err + else 'The socket reported that the broadcast could not be joined. 
Maybe it\'s offline or the URL is incorrect', + expected=True, video_id=b_id) + + self._download_webpage(base, b_id, data='26:421["room_finish_join",{}]', note='Polling socket') + b_data = self._download_socket_json(base, b_id, note='Getting broadcast metadata from socket') + m3u8_url = b_data.get('url') + + self._download_webpage(base, b_id, data='1:1', note='Closing socket', fatal=False) + + return { + 'id': b_id, + 'title': b_data.get('name'), + 'formats': self._extract_m3u8_formats(m3u8_url, b_id, 'mp4', live=True), + 'thumbnail': m3u8_url.replace('av.m3u8', 'thumb'), + 'uploader': try_get(b_data, lambda x: x['broadcast_user']['name']), # same as title + 'is_live': True + } diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py new file mode 100644 index 0000000..e9f5c22 --- /dev/null +++ b/yt_dlp/extractor/samplefocus.py @@ -0,0 +1,97 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'<h1>(.+?)</h1>', webpage, 'title') + + mp3_url = self._search_regex( + r'<input[^>]+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P<url>(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r'<meta[^>]+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r'<img[^>]+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P<url>(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)<p[^>]+class="comment-author"><a[^>]+href="/users/([^"]+)">([^"]+)</a>.+?<p[^>]+class="comment-body">([^>]+)</p>', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By <a[^>]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r'<span[^>]+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r'<span[^>]+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, 
fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r'<a[^>]+href=(["\'])/license\1[^>]*>(?P<license>[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } diff --git a/yt_dlp/extractor/sapo.py b/yt_dlp/extractor/sapo.py new file mode 100644 index 0000000..beffaee --- /dev/null +++ b/yt_dlp/extractor/sapo.py @@ -0,0 +1,114 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', + 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 
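# Just below, the SAPO extractor derives an HD variant by rewriting the tail
# of the SD file URL from /mov/1 to /mov/39 whenever the RSS item flags HD.
# The rewrite in isolation (the URL is illustrative):
import re

def hd_variant(sd_url):
    return re.sub(r'/mov/1$', '/mov/39', sd_url)

assert hd_variant('http://rd3.videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi/mov/1') == \
    'http://rd3.videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi/mov/39'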
'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py new file mode 100644 index 0000000..8d61e22 --- /dev/null +++ b/yt_dlp/extractor/sbs.py @@ -0,0 +1,156 @@ +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, + traverse_obj, + update_url_query, + url_or_none, +) + + +class SBSIE(InfoExtractor): + IE_DESC = 'sbs.com.au' + _VALID_URL = r'''(?x) + https?://(?:www\.)?sbs\.com\.au/(?: + ondemand(?: + /video/(?:single/)?| + /(?:movie|tv-program)/[^/]+/| + /(?:tv|news)-series/(?:[^/]+/){3}| + .*?\bplay=|/watch/ + )|news/(?:embeds/)?video/ + )(?P<id>[0-9]+)''' + _EMBED_REGEX = [r'''(?x)] + (?: + <meta\s+property="og:video"\s+content=| + <iframe[^>]+?src= + ) + (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1'''] + + _TESTS = [{ + # Original URL is handled by the generic IE which finds the iframe: + # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation + 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', + 'md5': '31f84a7a19b53635db63c73f8ab0c4a7', + 'info_dict': { + 'id': '320403011771', # '_rFBPRPO4pMR', + 'ext': 'mp4', + 'title': 'Dingo Conservation (The Feed)', + 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', + 'only_matching': True, + }, { + 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931', + 'only_matching': True, + }, { + 'note': 'Live stream', + 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/news-series/dateline/dateline-2022/dateline-s2022-ep26/2072245827515', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602', + 'only_matching': True, + }] + + _GEO_COUNTRIES = ['AU'] + _AUS_TV_PARENTAL_GUIDELINES = { + 'P': 0, + 'C': 7, 
+ 'G': 0, + 'PG': 0, + 'M': 14, + 'MA15+': 15, + 'MAV15+': 15, + 'R18+': 18, + } + _PLAYER_API = 'https://www.sbs.com.au/api/v3' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats, subtitles = self._extract_smil_formats_and_subtitles( + update_url_query(f'{self._PLAYER_API}/video_smil', {'id': video_id}), video_id) + + if not formats: + urlh = self._request_webpage( + HEADRequest('https://sbs-vod-prod-01.akamaized.net/'), video_id, + note='Checking geo-restriction', fatal=False, expected_status=403) + if urlh: + error_reasons = urlh.headers.get_all('x-error-reason') or [] + if 'geo-blocked' in error_reasons: + self.raise_geo_restricted(countries=['AU']) + self.raise_no_formats('No formats are available', video_id=video_id) + + media = traverse_obj(self._download_json( + f'{self._PLAYER_API}/video_stream', video_id, fatal=False, + query={'id': video_id, 'context': 'tv'}), ('video_object', {dict})) or {} + + media.update(self._download_json( + f'https://catalogue.pr.sbsod.com/mpx-media/{video_id}', + video_id, fatal=not media) or {}) + + # For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'. + if traverse_obj(media, ('partOfSeries', {dict})): + media['epName'] = traverse_obj(media, ('title', {str})) + + return { + 'id': video_id, + **traverse_obj(media, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'channel': ('taxonomy', 'channel', 'name', {str}), + 'series': ((('partOfSeries', 'name'), 'seriesTitle'), {str}), + 'series_id': ((('partOfSeries', 'uuid'), 'seriesID'), {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('epName', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'timestamp': (('datePublished', ('publication', 'startDate')), {parse_iso8601}), + 'release_year': ('releaseYear', {int_or_none}), + 'duration': ('duration', ({float_or_none}, {parse_duration})), + 'is_live': ('liveStream', {bool}), + 'age_limit': (('classificationID', 'contentRating'), {str.upper}, { + lambda x: self._AUS_TV_PARENTAL_GUIDELINES.get(x)}), # dict.get is unhashable in py3.7 + }, get_all=False), + **traverse_obj(media, { + 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}), + 'tags': (('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), ..., {str}), + 'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['contentUrl']), { + 'id': ('name', {str}), + 'url': 'contentUrl', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + 'formats': formats, + 'subtitles': subtitles, + 'uploader': 'SBSC', + } diff --git a/yt_dlp/extractor/sbscokr.py b/yt_dlp/extractor/sbscokr.py new file mode 100644 index 0000000..001d19e --- /dev/null +++ b/yt_dlp/extractor/sbscokr.py @@ -0,0 +1,200 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_iso8601, + parse_resolution, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class SBSCoKrIE(InfoExtractor): + IE_NAME = 'sbs.co.kr' + _VALID_URL = [r'https?://allvod\.sbs\.co\.kr/allvod/vod(?:Package)?EndPage\.do\?(?:[^#]+&)?mdaId=(?P<id>\d+)', + r'https?://programs\.sbs\.co\.kr/(?:enter|drama|culture|sports|plus|mtv|kth)/[a-z0-9]+/(?:vod|clip|movie)/\d+/(?P<id>(?:OC)?\d+)'] + + _TESTS = [{ + 'url': 'https://programs.sbs.co.kr/enter/dongsang2/clip/52007/OC467706746?div=main_pop_clip', + 'md5': 'c3f6d45e1fb5682039d94cda23c36f19', + 'info_dict': { + 'id': 'OC467706746', + 'ext': 'mp4', + 'title': '‘아슬아슬’ 박군♥한영의 새 집 
인테리어 대첩♨', + 'description': 'md5:6a71eb1979ee4a94ea380310068ccab4', + 'thumbnail': 'https://img2.sbs.co.kr/ops_clip_img/2023/10/10/34c4c0f9-a9a5-4ff6-a92e-9bb4b5f6fa65915w1280.jpg', + 'release_timestamp': 1696889400, + 'release_date': '20231009', + 'view_count': int, + 'like_count': int, + 'duration': 238, + 'age_limit': 15, + 'series': '동상이몽2_너는 내 운명', + 'episode': '레이디제인, ‘혼전임신설’ ‘3개월’ 앞당긴 결혼식 비하인드 스토리 최초 공개!', + 'episode_number': 311, + }, + }, { + 'url': 'https://allvod.sbs.co.kr/allvod/vodPackageEndPage.do?mdaId=22000489324&combiId=PA000000284&packageType=A&isFreeYN=', + 'md5': 'bf46b2e89fda7ae7de01f5743cef7236', + 'info_dict': { + 'id': '22000489324', + 'ext': 'mp4', + 'title': '[다시보기] 트롤리 15회', + 'description': 'md5:0e55d74bef1ac55c61ae90c73ac485f4', + 'thumbnail': 'https://img2.sbs.co.kr/img/sbs_cms/WE/2023/02/14/arC1676333794938-1280-720.jpg', + 'release_timestamp': 1676325600, + 'release_date': '20230213', + 'view_count': int, + 'like_count': int, + 'duration': 5931, + 'age_limit': 15, + 'series': '트롤리', + 'episode': '이거 다 거짓말이야', + 'episode_number': 15, + }, + }, { + 'url': 'https://programs.sbs.co.kr/enter/fourman/vod/69625/22000508948', + 'md5': '41e8ae4cc6c8424f4e4d76661a4becbf', + 'info_dict': { + 'id': '22000508948', + 'ext': 'mp4', + 'title': '[다시보기] 신발 벗고 돌싱포맨 104회', + 'description': 'md5:c6a247383c4dd661e4b956bf4d3b586e', + 'thumbnail': 'https://img2.sbs.co.kr/img/sbs_cms/WE/2023/08/30/2vb1693355446261-1280-720.jpg', + 'release_timestamp': 1693342800, + 'release_date': '20230829', + 'view_count': int, + 'like_count': int, + 'duration': 7036, + 'age_limit': 15, + 'series': '신발 벗고 돌싱포맨', + 'episode': '돌싱포맨 저격수들 등장!', + 'episode_number': 104, + }, + }] + + def _call_api(self, video_id, rscuse=''): + return self._download_json( + f'https://api.play.sbs.co.kr/1.0/sbs_vodall/{video_id}', video_id, + note=f'Downloading m3u8 information {rscuse}', + query={ + 'platform': 'pcweb', + 'protocol': 'download', + 'absolute_show': 'Y', + 'service': 'program', + 'ssl': 'Y', + 'rscuse': rscuse, + }) + + def _real_extract(self, url): + video_id = self._match_id(url) + + details = self._call_api(video_id) + source = traverse_obj(details, ('vod', 'source', 'mediasource', {dict})) or {} + + formats = [] + for stream in traverse_obj(details, ( + 'vod', 'source', 'mediasourcelist', lambda _, v: v['mediaurl'] or v['mediarscuse'] + ), default=[source]): + if not stream.get('mediaurl'): + new_source = traverse_obj( + self._call_api(video_id, rscuse=stream['mediarscuse']), + ('vod', 'source', 'mediasource', {dict})) or {} + if new_source.get('mediarscuse') == source.get('mediarscuse') or not new_source.get('mediaurl'): + continue + stream = new_source + formats.append({ + 'url': stream['mediaurl'], + 'format_id': stream.get('mediarscuse'), + 'format_note': stream.get('medianame'), + **parse_resolution(stream.get('quality')), + 'preference': int_or_none(stream.get('mediarscuse')) + }) + + caption_url = traverse_obj(details, ('vod', 'source', 'subtitle', {url_or_none})) + + return { + 'id': video_id, + **traverse_obj(details, ('vod', { + 'title': ('info', 'title'), + 'duration': ('info', 'duration', {int_or_none}), + 'view_count': ('info', 'viewcount', {int_or_none}), + 'like_count': ('info', 'likecount', {int_or_none}), + 'description': ('info', 'synopsis', {clean_html}), + 'episode': ('info', 'content', ('contenttitle', 'title')), + 'episode_number': ('info', 'content', 'number', {int_or_none}), + 'series': ('info', 'program', 'programtitle'), + 'age_limit': ('info', 'targetage', {int_or_none}), + 
'release_timestamp': ('info', 'broaddate', {parse_iso8601}), + 'thumbnail': ('source', 'thumbnail', 'origin', {url_or_none}), + }), get_all=False), + 'formats': formats, + 'subtitles': {'ko': [{'url': caption_url}]} if caption_url else None, + } + + +class SBSCoKrAllvodProgramIE(InfoExtractor): + IE_NAME = 'sbs.co.kr:allvod_program' + _VALID_URL = r'https?://allvod\.sbs\.co\.kr/allvod/vod(?:Free)?ProgramDetail\.do\?(?:[^#]+&)?pgmId=(?P<id>P?\d+)' + + _TESTS = [{ + 'url': 'https://allvod.sbs.co.kr/allvod/vodFreeProgramDetail.do?type=legend&pgmId=22000010159&listOrder=vodCntAsc', + 'info_dict': { + '_type': 'playlist', + 'id': '22000010159', + }, + 'playlist_count': 18, + }, { + 'url': 'https://allvod.sbs.co.kr/allvod/vodProgramDetail.do?pgmId=P460810577', + 'info_dict': { + '_type': 'playlist', + 'id': 'P460810577', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + details = self._download_json( + 'https://allvod.sbs.co.kr/allvod/vodProgramDetail/vodProgramDetailAjax.do', + program_id, note='Downloading program details', + query={ + 'pgmId': program_id, + 'currentCount': '10000', + }) + + return self.playlist_result( + [self.url_result(f'https://allvod.sbs.co.kr/allvod/vodEndPage.do?mdaId={video_id}', SBSCoKrIE) + for video_id in traverse_obj(details, ('list', ..., 'mdaId'))], program_id) + + +class SBSCoKrProgramsVodIE(InfoExtractor): + IE_NAME = 'sbs.co.kr:programs_vod' + _VALID_URL = r'https?://programs\.sbs\.co\.kr/(?:enter|drama|culture|sports|plus|mtv)/(?P<id>[a-z0-9]+)/vods' + + _TESTS = [{ + 'url': 'https://programs.sbs.co.kr/culture/morningwide/vods/65007', + 'info_dict': { + '_type': 'playlist', + 'id': '00000210215', + }, + 'playlist_mincount': 9782, + }, { + 'url': 'https://programs.sbs.co.kr/enter/dongsang2/vods/52006', + 'info_dict': { + '_type': 'playlist', + 'id': '22000010476', + }, + 'playlist_mincount': 312, + }] + + def _real_extract(self, url): + program_slug = self._match_id(url) + + program_id = self._download_json( + f'https://static.apis.sbs.co.kr/program-api/1.0/menu/{program_slug}', program_slug, + note='Downloading program menu data')['program']['programid'] + + return self.url_result( + f'https://allvod.sbs.co.kr/allvod/vodProgramDetail.do?pgmId={program_id}', SBSCoKrAllvodProgramIE) diff --git a/yt_dlp/extractor/screen9.py b/yt_dlp/extractor/screen9.py new file mode 100644 index 0000000..5ab0b6c --- /dev/null +++ b/yt_dlp/extractor/screen9.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class Screen9IE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.screen9\.(?:tv|com)|play\.su\.se)/(?:embed|media)/(?P<id>[^?#/]+)' + _TESTS = [ + { + 'url': 'https://api.screen9.com/embed/8kTNEjvoXGM33dmWwF0uDA', + 'md5': 'd60d23f8980583b930724b01fa6ddb41', + 'info_dict': { + 'id': '8kTNEjvoXGM33dmWwF0uDA', + 'ext': 'mp4', + 'title': 'Östersjön i förändrat klimat', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://folkhogskolekanalen.screen9.tv/media/gy35PKLHe-5K29RYHga2bw/ett-starkare-samhalle-en-snabbguide-om-sveriges-folkhogskolor', + 'md5': 'c9389806e78573ea34fc48b6f94465dc', + 'info_dict': { + 'id': 'gy35PKLHe-5K29RYHga2bw', + 'ext': 'mp4', + 'title': 'Ett starkare samhälle - en snabbguide om Sveriges folkhögskolor', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://play.su.se/media/H1YA0EYNCxiesrSU1kaRBQ/baltic-breakfast', + 'md5': '2b817647c3058002526269deff4c0683', + 'info_dict': { + 'id': 'H1YA0EYNCxiesrSU1kaRBQ', + 'ext': 
'mp4', + 'title': 'Baltic Breakfast', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://api.screen9.com/embed/{video_id}', video_id) + config = self._search_json(r'var\s+config\s*=', webpage, 'config', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(config, ('src', lambda _, v: v['type'] == 'application/x-mpegURL', 'src'), get_all=False), + video_id, ext='mp4') + formats.append({ + 'url': traverse_obj(config, ('src', lambda _, v: v['type'] == 'video/mp4', 'src'), get_all=False), + 'format': 'mp4', + }) + + return { + 'id': video_id, + 'title': traverse_obj( + config, + ('plugins', (('title', 'title'), ('googleAnalytics', 'title'), ('share', 'mediaTitle'))), + get_all=False), + 'description': traverse_obj(config, ('plugins', 'title', 'description')), + 'thumbnail': traverse_obj(config, ('poster')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/screencast.py b/yt_dlp/extractor/screencast.py new file mode 100644 index 0000000..df5e79b --- /dev/null +++ b/yt_dlp/extractor/screencast.py @@ -0,0 +1,117 @@ +import urllib.request + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class ScreencastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://www.screencast.com/t/3ZEjQXlT', + 'md5': '917df1c13798a3e96211dd1561fded83', + 'info_dict': { + 'id': '3ZEjQXlT', + 'ext': 'm4v', + 'title': 'Color Measurement with Ocean Optics Spectrometers', + 'description': 'md5:240369cde69d8bed61349a199c5fb153', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/V2uXehPJa1ZI', + 'md5': 'e8e4b375a7660a9e7e35c33973410d34', + 'info_dict': { + 'id': 'V2uXehPJa1ZI', + 'ext': 'mov', + 'title': 'The Amadeus Spectrometer', + 'description': 're:^In this video, our friends at.*To learn more about Amadeus, visit', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/aAB3iowa', + 'md5': 'dedb2734ed00c9755761ccaee88527cd', + 'info_dict': { + 'id': 'aAB3iowa', + 'ext': 'mp4', + 'title': 'Google Earth Export', + 'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/X3ddTrYh', + 'md5': '669ee55ff9c51988b4ebc0877cc8b159', + 'info_dict': { + 'id': 'X3ddTrYh', + 'ext': 'wmv', + 'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression', + 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'<embed name="Video".*?src="([^"]+)"', webpage, + 'QuickTime embed', default=None) + + if video_url is None: + flash_vars_s = self._html_search_regex( + r'<param name="flashVars" value="([^"]+)"', webpage, 'flash vars', + default=None) + if not flash_vars_s: + flash_vars_s = self._html_search_regex( + r'<param name="initParams" value="([^"]+)"', webpage, 'flash vars', + default=None) + if flash_vars_s: + flash_vars_s = flash_vars_s.replace(',', '&') + if 
flash_vars_s:
+                flash_vars = compat_parse_qs(flash_vars_s)
+                video_url_raw = urllib.request.quote(
+                    flash_vars['content'][0])
+                video_url = video_url_raw.replace('http%3A', 'http:')
+
+        if video_url is None:
+            video_meta = self._html_search_meta(
+                'og:video', webpage, default=None)
+            if video_meta:
+                video_url = self._search_regex(
+                    r'src=(.*?)(?:$|&)', video_meta,
+                    'meta tag video URL', default=None)
+
+        if video_url is None:
+            video_url = self._html_search_regex(
+                r'MediaContentUrl["\']\s*:(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage, 'video url', default=None, group='url')
+
+        if video_url is None:
+            video_url = self._html_search_meta(
+                'og:video', webpage, default=None)
+
+        if video_url is None:
+            raise ExtractorError('Cannot find video')
+
+        title = self._og_search_title(webpage, default=None)
+        if title is None:
+            title = self._html_search_regex(
+                [r'<b>Title:</b> ([^<]+)</div>',
+                 r'class="tabSeperator">></span><span class="tabText">(.+?)<',
+                 r'<title>([^<]+)'],
+                webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage, default=None)
+        if description is None:
+            description = self._html_search_meta('description', webpage)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/yt_dlp/extractor/screencastify.py b/yt_dlp/extractor/screencastify.py
new file mode 100644
index 0000000..3c43043
--- /dev/null
+++ b/yt_dlp/extractor/screencastify.py
@@ -0,0 +1,70 @@
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import traverse_obj, update_url_query
+
+
+class ScreencastifyIE(InfoExtractor):
+    _VALID_URL = [
+        r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)',
+        r'https?://app\.screencastify\.com/v[23]/watch/(?P<id>[^/?#]+)',
+    ]
+    _TESTS = [{
+        'url': 'https://watch.screencastify.com/v/sYVkZip3quLKhHw4Ybk8',
+        'info_dict': {
+            'id': 'sYVkZip3quLKhHw4Ybk8',
+            'ext': 'mp4',
+            'title': 'Inserting and Aligning the Case Top and Bottom',
+            'description': '',
+            'uploader': 'Paul Gunn',
+            'extra_param_to_segment_url': str,
+        },
+        'params': {
+            'skip_download': 'm3u8',
+        },
+    }, {
+        'url': 'https://app.screencastify.com/v3/watch/J5N7H11wofDN1jZUCr3t',
+        'info_dict': {
+            'id': 'J5N7H11wofDN1jZUCr3t',
+            'ext': 'mp4',
+            'uploader': 'Scott Piesen',
+            'description': '',
+            'title': 'Lesson Recording 1-17 Burrr...',
+        },
+        'params': {
+            'skip_download': 'm3u8',
+        },
+    }, {
+        'url': 'https://app.screencastify.com/v2/watch/BQ26VbUdfbQLhKzkktOk',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        info = self._download_json(
+            f'https://umbrella.svc.screencastify.com/api/umbrellaService/watch/{video_id}', video_id)
+
+        query_string = traverse_obj(info, ('manifest', 'auth', 'query'))
+        query = urllib.parse.parse_qs(query_string)
+        formats = []
+        dash_manifest_url = traverse_obj(info, ('manifest', 'url'))
+        if dash_manifest_url:
+            formats.extend(
+                self._extract_mpd_formats(
+                    dash_manifest_url, video_id, mpd_id='dash', query=query, fatal=False))
+        hls_manifest_url = traverse_obj(info, ('manifest', 'hlsUrl'))
+        if hls_manifest_url:
+            formats.extend(
+                self._extract_m3u8_formats(
+                    hls_manifest_url, video_id, ext='mp4', m3u8_id='hls', query=query, fatal=False))
+        for f in formats:
+            f['url'] = update_url_query(f['url'], query)
+
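+        # The signed auth query (e.g. 'Policy=...&Signature=...', values illustrative)
+        # must accompany every request: `query` was appended to the manifest URLs
+        # above, and `extra_param_to_segment_url` below makes the downloader append
+        # the same string to each media segment URL.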
+        return {
+            'id': video_id,
+            'title': info.get('title'),
+            'description': info.get('description'),
+            'uploader': info.get('userName'),
+            'formats': formats,
+            'extra_param_to_segment_url': query_string,
+        }
diff --git a/yt_dlp/extractor/screencastomatic.py b/yt_dlp/extractor/screencastomatic.py
new file mode 100644
index 0000000..28e25e9
--- /dev/null
+++ b/yt_dlp/extractor/screencastomatic.py
@@ -0,0 +1,72 @@
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_class,
+    int_or_none,
+    remove_start,
+    strip_or_none,
+    unified_strdate,
+    urlencode_postdata,
+)
+
+
+class ScreencastOMaticIE(InfoExtractor):
+    _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
+        'md5': '483583cb80d92588f15ccbedd90f0c18',
+        'info_dict': {
+            'id': 'c2lD3BeOPl',
+            'ext': 'mp4',
+            'title': 'Welcome to 3-4 Philosophy @ DECV!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
+            'duration': 369,
+            'upload_date': '20141216',
+        }
+    }, {
+        'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl',
+        'only_matching': True,
+    }, {
+        'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'https://screencast-o-matic.com/player/' + video_id, video_id)
+
+        if (self._html_extract_title(webpage) == 'Protected Content'
+                or 'This video is private and requires a password' in webpage):
+            password = self.get_param('videopassword')
+
+            if not password:
+                raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
+
+            form = self._search_regex(
+                r'(?is)<form[^>]*>(?P<form>.+?)</form>', webpage, 'login form', group='form')
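+            # The password gate is an ordinary HTML form: re-POST its hidden fields
+            # plus the user-supplied password as 'scPassword' and the unlocked player
+            # page is returned.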
+            form_data = self._hidden_inputs(form)
+            form_data.update({
+                'scPassword': password,
+            })
+
+            webpage = self._download_webpage(
+                'https://screencast-o-matic.com/player/password', video_id, 'Logging in',
+                data=urlencode_postdata(form_data))
+
+            if 'Invalid password' in webpage:
+                raise ExtractorError('Unable to login: Invalid password', expected=True)
+
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        info.update({
+            'id': video_id,
+            'title': get_element_by_class('overlayTitle', webpage),
+            'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None,
+            'duration': int_or_none(self._search_regex(
+                r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};',
+                webpage, 'duration', default=None)),
+            'upload_date': unified_strdate(remove_start(
+                get_element_by_class('overlayPublished', webpage), 'Published: ')),
+        })
+        return info
diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py
new file mode 100644
index 0000000..3912f77
--- /dev/null
+++ b/yt_dlp/extractor/scrippsnetworks.py
@@ -0,0 +1,155 @@
+import json
+import hashlib
+
+from .aws import AWSIE
+from .anvato import AnvatoIE
+from .common import InfoExtractor
+from ..utils import (
+    smuggle_url,
+    urlencode_postdata,
+    xpath_text,
+)
+
+
+class ScrippsNetworksWatchIE(AWSIE):
+    IE_NAME = 'scrippsnetworks:watch'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        watch\.
+                        (?P<site>geniuskitchen)\.com/
+                        (?:
+                            player\.[A-Z0-9]+\.html\#|
+                            show/(?:[^/]+/){2}|
+                            player/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/',
+        'info_dict': {
+            'id': '4194875',
+            'ext': 'mp4',
+            'title': 'Ample Hills Ice Cream Bike',
+            'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.',
+            'uploader': 'ANV',
+            'upload_date': '20171011',
+            'timestamp': 1507698000,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [AnvatoIE.ie_key()],
+        'skip': '404 Not Found',
+    }]
+
+    _SNI_TABLE = {
+        'geniuskitchen': 'genius',
+    }
+
+    _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1'
+    _AWS_PROXY_HOST = 'web.api.video.snidigital.com'
+
+    _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback'
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        site_id, video_id = mobj.group('site', 'id')
+
+        aws_identity_id_json = json.dumps({
+            'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION
+        }).encode('utf-8')
+        token = self._download_json(
+            'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id,
+            data=aws_identity_id_json,
+            headers={
+                'Accept': '*/*',
+                'Content-Type': 'application/x-amz-json-1.1',
+                'Referer': url,
+                'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(),
+                'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken',
+                'X-Amz-User-Agent': self._AWS_USER_AGENT,
+            })['Token']
+
+        sts = self._download_xml(
+            'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({
+                'Action': 'AssumeRoleWithWebIdentity',
+                'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role',
+                'RoleSessionName': 'web-identity',
+                'Version': '2011-06-15',
+                'WebIdentityToken': token,
+            }), headers={
+                'Referer': url,
+                'X-Amz-User-Agent': self._AWS_USER_AGENT,
+                'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+            })
+
+        def get(key):
+            return xpath_text(
+                sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key,
+                fatal=True)
+
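+        # Three-step anonymous AWS auth: Cognito issues an OpenID token for a fixed
+        # identity pool, STS trades it for temporary credentials, and _aws_execute_api
+        # (from AWSIE) SigV4-signs the media-API request below using the
+        # AccessKeyId/SecretAccessKey/SessionToken extracted above.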
+        mcp_id = self._aws_execute_api({
+            'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id),
+            'access_key': get('AccessKeyId'),
+            'secret_key': get('SecretAccessKey'),
+            'session_token': get('SessionToken'),
+        }, video_id)['results'][0]['mcpId']
+
+        return self.url_result(
+            smuggle_url(
+                'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id,
+                {'geo_countries': ['US']}),
+            AnvatoIE.ie_key(), video_id=mcp_id)
+
+
+class ScrippsNetworksIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338',
+        'info_dict': {
+            'id': '0260338',
+            'ext': 'mp4',
+            'title': 'The Best of the Best',
+            'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.',
+            'timestamp': 1475678834,
+            'upload_date': '20161005',
+            'uploader': 'SCNI-SCND',
+            'tags': 'count:10',
+            'creator': 'Cooking Channel',
+            'duration': 29.995,
+            'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}],
+            'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',
+        },
+        'add_ie': ['ThePlatform'],
+        'expected_warnings': ['No HLS formats found'],
+    }, {
+        'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368',
+        'only_matching': True,
+    }]
+    _ACCOUNT_MAP = {
+        'cookingchanneltv': 2433005105,
+        'discovery': 2706091867,
+        'diynetwork': 2433004575,
+        'foodnetwork': 2433005105,
+        'hgtv': 2433004575,
+        'travelchannel': 2433005739,
+    }
+    _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true'
+
+    def _real_extract(self, url):
+        site, guid = self._match_valid_url(url).groups()
+        return self.url_result(smuggle_url(
+            self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid),
+            {'force_smil_url': True}), 'ThePlatform', guid)
diff --git a/yt_dlp/extractor/scrolller.py b/yt_dlp/extractor/scrolller.py
new file mode 100644
index 0000000..4f9fa14
--- /dev/null
+++ b/yt_dlp/extractor/scrolller.py
@@ -0,0 +1,102 @@
+import json
+
+from .common import InfoExtractor
+from ..utils import determine_ext, int_or_none
+
+
+class ScrolllerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?scrolller\.com/(?P<id>[\w-]+)'
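+    # A single getSubredditPost GraphQL query returns every transcode of a post in
+    # mediaSources, images and videos alike; stills become thumbnails, the rest formats.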
+    _TESTS = [{
+        'url': 'https://scrolller.com/a-helping-hand-1k9pxikxkw',
+        'info_dict': {
+            'id': 'a-helping-hand-1k9pxikxkw',
+            'ext': 'mp4',
+            'thumbnail': 'https://zepto.scrolller.com/a-helping-hand-3ty9q8x094-540x960.jpg',
+            'title': 'A helping hand',
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://scrolller.com/tigers-chasing-a-drone-c5d1f2so6j',
+        'info_dict': {
+            'id': 'tigers-chasing-a-drone-c5d1f2so6j',
+            'ext': 'mp4',
+            'thumbnail': 'https://zepto.scrolller.com/tigers-chasing-a-drone-az9pkpguwe-540x303.jpg',
+            'title': 'Tigers chasing a drone',
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://scrolller.com/baby-rhino-smells-something-9chhugsv9p',
+        'info_dict': {
+            'id': 'baby-rhino-smells-something-9chhugsv9p',
+            'ext': 'mp4',
+            'thumbnail': 'https://atto.scrolller.com/hmm-whats-that-smell-bh54mf2c52-300x224.jpg',
+            'title': 'Baby rhino smells something',
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://scrolller.com/its-all-fun-and-games-cco8jjmoh7',
+        'info_dict': {
+            'id': 'its-all-fun-and-games-cco8jjmoh7',
+            'ext': 'mp4',
+            'thumbnail': 'https://atto.scrolller.com/its-all-fun-and-games-3amk9vg7m3-540x649.jpg',
+            'title': 'It\'s all fun and games...',
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://scrolller.com/may-the-force-be-with-you-octokuro-yeytg1fs7a',
+        'info_dict': {
+            'id': 'may-the-force-be-with-you-octokuro-yeytg1fs7a',
+            'ext': 'mp4',
+            'thumbnail': 'https://thumbs2.redgifs.com/DarkStarchyNautilus-poster.jpg',
+            'title': 'May the force be with you (Octokuro)',
+            'age_limit': 18,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        query = {
+            'query': '''{
+                getSubredditPost(url:"/%s"){
+                    id
+                    title
+                    isNsfw
+                    mediaSources{
+                        url
+                        width
+                        height
+                    }
+                }
+            }''' % video_id
+        }
+
+        video_data = self._download_json(
+            'https://api.scrolller.com/api/v2/graphql', video_id, data=json.dumps(query).encode(),
+            headers={'Content-Type': 'application/json'})['data']['getSubredditPost']
+
+        formats, thumbnails = [], []
+        for source in video_data['mediaSources']:
+            if determine_ext(source.get('url')) in ('jpg', 'png'):
+                thumbnails.append({
+                    'url': source['url'],
+                    'width': int_or_none(source.get('width')),
+                    'height': int_or_none(source.get('height')),
+                })
+            elif source.get('url'):
+                formats.append({
+                    'url': source['url'],
+                    'width': int_or_none(source.get('width')),
+                    'height': int_or_none(source.get('height')),
+                })
+
+        if not formats:
+            self.raise_no_formats('There is no video.', expected=True, video_id=video_id)
+
+        return {
+            'id': video_id,
+            'title': video_data.get('title'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'age_limit': 18 if video_data.get('isNsfw') else 0
+        }
diff --git a/yt_dlp/extractor/scte.py b/yt_dlp/extractor/scte.py
new file mode 100644
index 0000000..9c2ca8c
--- /dev/null
+++ b/yt_dlp/extractor/scte.py
@@ -0,0 +1,137 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    ExtractorError,
+    urlencode_postdata,
+)
+
+
+class SCTEBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
+    _NETRC_MACHINE = 'scte'
+
+    def _perform_login(self, username, password):
+        login_popup = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login popup')
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'class=["\']welcome\b', r'>Sign Out<'))
+
+        # already logged in
+        if is_logged(login_popup):
+            return
+
+        login_form = self._hidden_inputs(login_popup)
+
+        login_form.update({
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
+        })
+
+        response = self._download_webpage(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form))
+
+        if '|pageRedirect|' not in response and not is_logged(response):
+            error = self._html_search_regex(
+                r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</td>',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class SCTEIE(SCTEBaseIE):
+    _WORKING = False
+    _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
+        'info_dict': {
+            'title': 'Introduction to DOCSIS Engineering Professional',
+            'id': '31484',
+        },
+        'playlist_count': 5,
+        'skip': 'Requires account credentials',
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+
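+        # data.js below ships Dean Edwards-style packed JavaScript; decode_packed_codes()
+        # unpacks it to expose a CreateData("...") call whose escaped argument is the
+        # SCORM manifest XML listing the module's video assets.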
+        context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
+        content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
+        context = decode_packed_codes(self._download_webpage(
+            '%smobile/data.js' % content_base, video_id))
+
+        data = self._parse_xml(
+            self._search_regex(
+                r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
+            video_id)
+
+        entries = []
+        for asset in data.findall('.//asset'):
+            asset_url = asset.get('url')
+            if not asset_url or not asset_url.endswith('.mp4'):
+                continue
+            asset_id = self._search_regex(
+                r'video_([^_]+)_', asset_url, 'asset id', default=None)
+            if not asset_id:
+                continue
+            entries.append({
+                'id': asset_id,
+                'title': title,
+                'url': content_base + asset_url,
+            })
+
+        return self.playlist_result(entries, video_id, title)
+
+
+class SCTECourseIE(SCTEBaseIE):
+    _WORKING = False
+    _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.scte.org/course/view.php?id=3639',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.scte.org/course/view.php?id=3073',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_id)
+
+        title = self._search_regex(
+            r'<h1>(.+?)</h1>', webpage, 'title', default=None)
+
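+        # A course page links each part as /mod/scorm/... (a SCORM video module) or
+        # /mod/subcourse/... (a nested course); both are crawled below, with
+        # subcourses recursing through SCTECourseIE itself.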
+        entries = []
+        for mobj in re.finditer(
+                r'''(?x)
+                    <a[^>]+
+                        href=(["\'])
+                        (?P<url>
+                            https?://learning\.scte\.org/mod/
+                            (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
+                            \bid=\d+
+                        )
+                ''',
+                webpage):
+            item_url = mobj.group('url')
+            if item_url == url:
+                continue
+            ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
+                  else SCTECourseIE.ie_key())
+            entries.append(self.url_result(item_url, ie=ie))
+
+        return self.playlist_result(entries, course_id, title)
diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py
new file mode 100644
index 0000000..29cb015
--- /dev/null
+++ b/yt_dlp/extractor/sejmpl.py
@@ -0,0 +1,218 @@
+import datetime
+
+from .common import InfoExtractor
+from .redge import RedCDNLivxIE
+from ..utils import (
+    clean_html,
+    join_nonempty,
+    js_to_json,
+    strip_or_none,
+    update_url_query,
+)
+from ..utils.traversal import traverse_obj
+
+
+def is_dst(date):
+    last_march = datetime.datetime(date.year, 3, 31)
+    last_october = datetime.datetime(date.year, 10, 31)
+    last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
+    last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
+    return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
+
+
+def rfc3339_to_atende(date):
+    date = datetime.datetime.fromisoformat(date)
+    date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
+    return int((date.timestamp() - 978307200) * 1000)
+
+
+class SejmIE(InfoExtractor):
+    _VALID_URL = (
+        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
+        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
+        r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
+    )
+    IE_NAME = 'sejm'
+
+    _TESTS = [{
+        # multiple cameras, Polish SL interpreter
+        'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
+        'info_dict': {
+            'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
+            'title': '1. posiedzenie Sejmu X kadencji',
+            'duration': 20145,
+            'live_status': 'was_live',
+            'location': 'Sala Posiedzeń',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'ENC01-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC01',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC30-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC30',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC31-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC31',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC32-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC32',
+                'live_status': 'was_live',
+            },
+        }, {
+            # sign lang interpreter
+            'info_dict': {
+                'id': 'Migacz-ENC01-1-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. 
posiedzenie Sejmu X kadencji - Migacz-ENC01', + 'live_status': 'was_live', + }, + }], + }, { + 'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2', + 'info_dict': { + 'id': '9377A9D65518E9A5C125808E002E9FF2', + 'title': 'Debata "Lepsza Polska: obywatelska"', + 'description': 'KP .Nowoczesna', + 'duration': 8770, + 'live_status': 'was_live', + 'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ENC08-1-503831270000-503840040000', + 'ext': 'mp4', + 'duration': 8770, + 'title': 'Debata "Lepsza Polska: obywatelska" - ENC08', + 'live_status': 'was_live', + }, + }], + }, { + # 7th term is very special, since it does not use redcdn livx + 'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F', + 'info_dict': { + 'id': 'A6E6D475ECCC6FE5C1257EF90034817F', + 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu', + 'description': 'SLD - Biuro Prasowe Klubu', + 'duration': 514, + 'location': 'sala 101/bud. C', + 'live_status': 'was_live', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'A6E6D475ECCC6FE5C1257EF90034817F', + 'ext': 'mp4', + 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu', + 'duration': 514, + }, + }], + }, { + 'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492', + 'only_matching': True, + }] + + def _real_extract(self, url): + term, video_id = self._match_valid_url(url).group('term', 'id') + frame = self._download_webpage( + f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}', + video_id) + # despite it says "transmisje_arch", it works for live streams too! + data = self._download_json( + f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}', + video_id) + params = data['params'] + + title = strip_or_none(data.get('title')) + + if data.get('status') == 'VIDEO_ENDED': + live_status = 'was_live' + elif data.get('status') == 'VIDEO_PLAYING': + live_status = 'is_live' + else: + live_status = None + self.report_warning(f'unknown status: {data.get("status")}') + + start_time = rfc3339_to_atende(params['start']) + # current streams have a stop time of *expected* end of session, but actual times + # can change during the transmission. setting a stop_time would artificially + # end the stream at that time, while the session actually keeps going. 
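+        # rfc3339_to_atende: e.g. '2023-11-13T10:00:00+01:00' is parsed with
+        # fromisoformat(), shifted +1h while DST is in effect, and converted to
+        # milliseconds since 2001-01-01 00:00 UTC (epoch 978307200), the timebase
+        # the startTime/stopTime query parameters below expect.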
+        if live_status == 'was_live':
+            stop_time = rfc3339_to_atende(params['stop'])
+            duration = (stop_time - start_time) // 1000
+        else:
+            stop_time, duration = None, None
+
+        entries = []
+
+        def add_entry(file, legacy_file=False):
+            if not file:
+                return
+            file = self._proto_relative_url(file)
+            if not legacy_file:
+                file = update_url_query(file, {'startTime': start_time})
+                if stop_time is not None:
+                    file = update_url_query(file, {'stopTime': stop_time})
+                stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
+            common_info = {
+                'url': file,
+                'duration': duration,
+            }
+            if legacy_file:
+                entries.append({
+                    **common_info,
+                    'id': video_id,
+                    'title': title,
+                })
+            else:
+                entries.append({
+                    **common_info,
+                    '_type': 'url_transparent',
+                    'ie_key': RedCDNLivxIE.ie_key(),
+                    'id': stream_id,
+                    'title': join_nonempty(title, stream_id, delim=' - '),
+                })
+
+        cameras = self._search_json(
+            r'var\s+cameras\s*=', frame, 'camera list', video_id,
+            contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
+            fatal=False) or []
+        for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
+            if camera_file.get('flv'):
+                add_entry(camera_file['flv'])
+            elif camera_file.get('mp4'):
+                # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
+                add_entry(camera_file['mp4'], legacy_file=True)
+            else:
+                self.report_warning('Unknown camera stream type found')
+
+        if params.get('mig'):
+            add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': video_id,
+            'title': title,
+            'description': clean_html(data.get('desc')) or None,
+            'duration': duration,
+            'live_status': live_status,
+            'location': strip_or_none(data.get('location')),
+        }
diff --git a/yt_dlp/extractor/senalcolombia.py b/yt_dlp/extractor/senalcolombia.py
new file mode 100644
index 0000000..b2f354f
--- /dev/null
+++ b/yt_dlp/extractor/senalcolombia.py
@@ -0,0 +1,32 @@
+from .common import InfoExtractor
+from .rtvcplay import RTVCKalturaIE
+
+
+class SenalColombiaLiveIE(InfoExtractor):
+    _WORKING = False
+    _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)'
+
+    _TESTS = [{
+        'url': 'https://www.senalcolombia.tv/senal-en-vivo',
+        'info_dict': {
+            'id': 'indexSC',
+            'title': 're:^Señal Colombia',
+            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'live_status': 'is_live',
+            'ext': 'mp4',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        hydration = self._search_json(
+            r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>',
+            webpage, 'hydration', display_id)
+
+        return self.url_result(hydration['envivosrc'], RTVCKalturaIE, display_id)
diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py
new file mode 100644
index 0000000..7ff0cf5
--- /dev/null
+++ b/yt_dlp/extractor/senategov.py
@@ -0,0 +1,200 @@
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    parse_qs,
+    unsmuggle_url,
+)
+
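+# Maps each committee slug to its Akamai stream number and HD-network hostname;
+# archive ('arch') playback uses ussenate-f.akamaihd.net with no stream number.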
+_COMMITTEES = {
+    'ag': ('76440', 'http://ag-f.akamaihd.net'),
+    'aging': ('76442', 'http://aging-f.akamaihd.net'),
+    'approps': ('76441', 'http://approps-f.akamaihd.net'),
+    'arch': ('', 'http://ussenate-f.akamaihd.net'),
+    'armed': ('76445', 'http://armed-f.akamaihd.net'),
+    'banking': ('76446', 'http://banking-f.akamaihd.net'),
+    'budget': ('76447', 'http://budget-f.akamaihd.net'),
+    'cecc': ('76486', 'http://srs-f.akamaihd.net'),
+    'commerce': ('80177', 'http://commerce1-f.akamaihd.net'),
+    'csce': ('75229', 'http://srs-f.akamaihd.net'),
+    'dpc': ('76590', 'http://dpc-f.akamaihd.net'),
+    'energy': ('76448', 'http://energy-f.akamaihd.net'),
+    'epw': ('76478', 'http://epw-f.akamaihd.net'),
+    'ethics': ('76449', 'http://ethics-f.akamaihd.net'),
+    'finance': ('76450', 'http://finance-f.akamaihd.net'),
+    'foreign': ('76451', 'http://foreign-f.akamaihd.net'),
+    'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'),
+    'help': ('76452', 'http://help-f.akamaihd.net'),
+    'indian': ('76455', 'http://indian-f.akamaihd.net'),
+    'intel': ('76456', 'http://intel-f.akamaihd.net'),
+    'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'),
+    'jccic': ('85180', 'http://jccic-f.akamaihd.net'),
+    'jec': ('76458', 'http://jec-f.akamaihd.net'),
+    'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'),
+    'rpc': ('76591', 'http://rpc-f.akamaihd.net'),
+    'rules': ('76460', 'http://rules-f.akamaihd.net'),
+    'saa': ('76489', 'http://srs-f.akamaihd.net'),
+    'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'),
+    'srs': ('75229', 'http://srs-f.akamaihd.net'),
+    'uscc': ('76487', 'http://srs-f.akamaihd.net'),
+    'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'),
+}
+
+
+class SenateISVPIE(InfoExtractor):
+    _IE_NAME = 'senate.gov:isvp'
+    _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
+    _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"]
+
+    _TESTS = [{
+        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        qs = compat_parse_qs(self._match_valid_url(url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = re.sub(r'.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        if smuggled_data.get('force_title'):
+            title = smuggled_data['force_title']
+        else:
+            title = self._html_extract_title(webpage)
+        poster = qs.get('poster')
+        thumbnail = poster[0] if poster else None
+
+        video_type = qs['type'][0]
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+
+        stream_num, domain = _COMMITTEES[committee]
+
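+        # e.g. committee 'judiciary' -> ('76459', 'http://judiciary-f.akamaihd.net');
+        # archives resolve to {domain}/i/{filename}/master.m3u8, live streams to
+        # {domain}/z/{id}_1@{stream_num}/manifest.f4m?hdcore=3.1.0 (HDS) and
+        # {domain}/i/{id}_1@{stream_num}/master.m3u8 (HLS), as built below.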
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8')
+            formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8')
+        else:
+            hdcore_sign = 'hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
+
+
+class SenateGovIE(InfoExtractor):
+    _IE_NAME = 'senate.gov'
+    _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov'
+    _TESTS = [{
+        'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health',
+        'info_dict': {
+            'id': 'help090920',
+            'display_id': 'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health',
+            'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health',
+            'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD',
+        'info_dict': {
+            'id': 'appropsA051518',
+            'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD',
+            'title': 'Review of the FY2019 Budget Request for the U.S. Army',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization',
+        'info_dict': {
+            'id': 'banking041521',
+            'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization',
+            'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization',
+            'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._generic_id(url)
+        webpage = self._download_webpage(url, display_id)
+        parse_info = parse_qs(self._search_regex(
+            r'