You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

786 lines
28 KiB

10 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import itertools
  4. import re
  5. from .common import (
  6. InfoExtractor,
  7. SearchInfoExtractor
  8. )
  9. from ..compat import (
  10. compat_str,
  11. compat_urlparse,
  12. )
  13. from ..utils import (
  14. ExtractorError,
  15. float_or_none,
  16. HEADRequest,
  17. int_or_none,
  18. KNOWN_EXTENSIONS,
  19. mimetype2ext,
  20. str_or_none,
  21. try_get,
  22. unified_timestamp,
  23. update_url_query,
  24. url_or_none,
  25. )
  26. class SoundcloudEmbedIE(InfoExtractor):
  27. _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?url=(?P<id>.*)'
  28. @staticmethod
  29. def _extract_urls(webpage):
  30. return [m.group('url') for m in re.finditer(
  31. r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
  32. webpage)]
  33. def _real_extract(self, url):
  34. return self.url_result(compat_urlparse.parse_qs(
  35. compat_urlparse.urlparse(url).query)['url'][0])
  36. class SoundcloudIE(InfoExtractor):
  37. """Information extractor for soundcloud.com
  38. To access the media, the uid of the song and a stream token
  39. must be extracted from the page source and the script must make
  40. a request to media.soundcloud.com/crossdomain.xml. Then
  41. the media can be grabbed by requesting from an url composed
  42. of the stream token and uid
  43. """
  44. _VALID_URL = r'''(?x)^(?:https?://)?
  45. (?:(?:(?:www\.|m\.)?soundcloud\.com/
  46. (?!stations/track)
  47. (?P<uploader>[\w\d-]+)/
  48. (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
  49. (?P<title>[\w\d-]+)/?
  50. (?P<token>[^?]+?)?(?:[?].*)?$)
  51. |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
  52. (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
  53. )
  54. '''
  55. IE_NAME = 'soundcloud'
  56. _TESTS = [
  57. {
  58. 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
  59. 'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
  60. 'info_dict': {
  61. 'id': '62986583',
  62. 'ext': 'mp3',
  63. 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
  64. 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
  65. 'uploader': 'E.T. ExTerrestrial Music',
  66. 'uploader_id': '1571244',
  67. 'timestamp': 1349920598,
  68. 'upload_date': '20121011',
  69. 'duration': 143.216,
  70. 'license': 'all-rights-reserved',
  71. 'view_count': int,
  72. 'like_count': int,
  73. 'comment_count': int,
  74. 'repost_count': int,
  75. }
  76. },
  77. # not streamable song
  78. {
  79. 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
  80. 'info_dict': {
  81. 'id': '47127627',
  82. 'ext': 'mp3',
  83. 'title': 'Goldrushed',
  84. 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
  85. 'uploader': 'The Royal Concept',
  86. 'uploader_id': '9615865',
  87. 'timestamp': 1337635207,
  88. 'upload_date': '20120521',
  89. 'duration': 30,
  90. 'license': 'all-rights-reserved',
  91. 'view_count': int,
  92. 'like_count': int,
  93. 'comment_count': int,
  94. 'repost_count': int,
  95. },
  96. 'params': {
  97. # rtmp
  98. 'skip_download': True,
  99. },
  100. 'skip': 'Preview',
  101. },
  102. # private link
  103. {
  104. 'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
  105. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  106. 'info_dict': {
  107. 'id': '123998367',
  108. 'ext': 'mp3',
  109. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  110. 'description': 'test chars: \"\'/\\ä↭',
  111. 'uploader': 'jaimeMF',
  112. 'uploader_id': '69767071',
  113. 'timestamp': 1386604920,
  114. 'upload_date': '20131209',
  115. 'duration': 9.927,
  116. 'license': 'all-rights-reserved',
  117. 'view_count': int,
  118. 'like_count': int,
  119. 'comment_count': int,
  120. 'repost_count': int,
  121. },
  122. },
  123. # private link (alt format)
  124. {
  125. 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
  126. 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
  127. 'info_dict': {
  128. 'id': '123998367',
  129. 'ext': 'mp3',
  130. 'title': 'Youtube - Dl Test Video \'\' Ä↭',
  131. 'description': 'test chars: \"\'/\\ä↭',
  132. 'uploader': 'jaimeMF',
  133. 'uploader_id': '69767071',
  134. 'timestamp': 1386604920,
  135. 'upload_date': '20131209',
  136. 'duration': 9.927,
  137. 'license': 'all-rights-reserved',
  138. 'view_count': int,
  139. 'like_count': int,
  140. 'comment_count': int,
  141. 'repost_count': int,
  142. },
  143. },
  144. # downloadable song
  145. {
  146. 'url': 'https://soundcloud.com/oddsamples/bus-brakes',
  147. 'md5': '7624f2351f8a3b2e7cd51522496e7631',
  148. 'info_dict': {
  149. 'id': '128590877',
  150. 'ext': 'mp3',
  151. 'title': 'Bus Brakes',
  152. 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
  153. 'uploader': 'oddsamples',
  154. 'uploader_id': '73680509',
  155. 'timestamp': 1389232924,
  156. 'upload_date': '20140109',
  157. 'duration': 17.346,
  158. 'license': 'cc-by-sa',
  159. 'view_count': int,
  160. 'like_count': int,
  161. 'comment_count': int,
  162. 'repost_count': int,
  163. },
  164. },
  165. # private link, downloadable format
  166. {
  167. 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
  168. 'md5': '64a60b16e617d41d0bef032b7f55441e',
  169. 'info_dict': {
  170. 'id': '340344461',
  171. 'ext': 'wav',
  172. 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
  173. 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
  174. 'uploader': 'Ori Uplift Music',
  175. 'uploader_id': '12563093',
  176. 'timestamp': 1504206263,
  177. 'upload_date': '20170831',
  178. 'duration': 7449.096,
  179. 'license': 'all-rights-reserved',
  180. 'view_count': int,
  181. 'like_count': int,
  182. 'comment_count': int,
  183. 'repost_count': int,
  184. },
  185. },
  186. # no album art, use avatar pic for thumbnail
  187. {
  188. 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
  189. 'md5': '59c7872bc44e5d99b7211891664760c2',
  190. 'info_dict': {
  191. 'id': '309699954',
  192. 'ext': 'mp3',
  193. 'title': 'Sideways (Prod. Mad Real)',
  194. 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
  195. 'uploader': 'garyvee',
  196. 'uploader_id': '2366352',
  197. 'timestamp': 1488152409,
  198. 'upload_date': '20170226',
  199. 'duration': 207.012,
  200. 'thumbnail': r're:https?://.*\.jpg',
  201. 'license': 'all-rights-reserved',
  202. 'view_count': int,
  203. 'like_count': int,
  204. 'comment_count': int,
  205. 'repost_count': int,
  206. },
  207. 'params': {
  208. 'skip_download': True,
  209. },
  210. },
  211. # not available via api.soundcloud.com/i1/tracks/id/streams
  212. {
  213. 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
  214. 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
  215. 'info_dict': {
  216. 'id': '583011102',
  217. 'ext': 'mp3',
  218. 'title': 'Mezzo Valzer',
  219. 'description': 'md5:4138d582f81866a530317bae316e8b61',
  220. 'uploader': 'Giovanni Sarani',
  221. 'uploader_id': '3352531',
  222. 'timestamp': 1551394171,
  223. 'upload_date': '20190228',
  224. 'duration': 180.157,
  225. 'thumbnail': r're:https?://.*\.jpg',
  226. 'license': 'all-rights-reserved',
  227. 'view_count': int,
  228. 'like_count': int,
  229. 'comment_count': int,
  230. 'repost_count': int,
  231. },
  232. 'expected_warnings': ['Unable to download JSON metadata'],
  233. }
  234. ]
  235. _API_BASE = 'https://api.soundcloud.com/'
  236. _API_V2_BASE = 'https://api-v2.soundcloud.com/'
  237. _BASE_URL = 'https://soundcloud.com/'
  238. _CLIENT_ID = 'UW9ajvMgVdMMW3cdeBi8lPfN6dvOVGji'
  239. _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
  240. _ARTWORK_MAP = {
  241. 'mini': 16,
  242. 'tiny': 20,
  243. 'small': 32,
  244. 'badge': 47,
  245. 't67x67': 67,
  246. 'large': 100,
  247. 't300x300': 300,
  248. 'crop': 400,
  249. 't500x500': 500,
  250. 'original': 0,
  251. }
  252. @classmethod
  253. def _resolv_url(cls, url):
  254. return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID
  255. def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2):
  256. track_id = compat_str(info['id'])
  257. title = info['title']
  258. track_base_url = self._API_BASE + 'tracks/%s' % track_id
  259. format_urls = set()
  260. formats = []
  261. query = {'client_id': self._CLIENT_ID}
  262. if secret_token:
  263. query['secret_token'] = secret_token
  264. if info.get('downloadable') and info.get('has_downloads_left'):
  265. format_url = update_url_query(
  266. info.get('download_url') or track_base_url + '/download', query)
  267. format_urls.add(format_url)
  268. if version == 2:
  269. v1_info = self._download_json(
  270. track_base_url, track_id, query=query, fatal=False) or {}
  271. else:
  272. v1_info = info
  273. formats.append({
  274. 'format_id': 'download',
  275. 'ext': v1_info.get('original_format') or 'mp3',
  276. 'filesize': int_or_none(v1_info.get('original_content_size')),
  277. 'url': format_url,
  278. 'preference': 10,
  279. })
  280. def invalid_url(url):
  281. return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url)
  282. def add_format(f, protocol):
  283. mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
  284. if mobj:
  285. for k, v in mobj.groupdict().items():
  286. if not f.get(k):
  287. f[k] = v
  288. format_id_list = []
  289. if protocol:
  290. format_id_list.append(protocol)
  291. for k in ('ext', 'abr'):
  292. v = f.get(k)
  293. if v:
  294. format_id_list.append(v)
  295. abr = f.get('abr')
  296. if abr:
  297. f['abr'] = int(abr)
  298. f.update({
  299. 'format_id': '_'.join(format_id_list),
  300. 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
  301. })
  302. formats.append(f)
  303. # New API
  304. transcodings = try_get(
  305. info, lambda x: x['media']['transcodings'], list) or []
  306. for t in transcodings:
  307. if not isinstance(t, dict):
  308. continue
  309. format_url = url_or_none(t.get('url'))
  310. if not format_url or t.get('snipped') or '/preview/' in format_url:
  311. continue
  312. stream = self._download_json(
  313. format_url, track_id, query=query, fatal=False)
  314. if not isinstance(stream, dict):
  315. continue
  316. stream_url = url_or_none(stream.get('url'))
  317. if invalid_url(stream_url):
  318. continue
  319. format_urls.add(stream_url)
  320. stream_format = t.get('format') or {}
  321. protocol = stream_format.get('protocol')
  322. if protocol != 'hls' and '/hls' in format_url:
  323. protocol = 'hls'
  324. ext = None
  325. preset = str_or_none(t.get('preset'))
  326. if preset:
  327. ext = preset.split('_')[0]
  328. if ext not in KNOWN_EXTENSIONS:
  329. ext = mimetype2ext(stream_format.get('mime_type'))
  330. add_format({
  331. 'url': stream_url,
  332. 'ext': ext,
  333. }, 'http' if protocol == 'progressive' else protocol)
  334. if not formats:
  335. # Old API, does not work for some tracks (e.g.
  336. # https://soundcloud.com/giovannisarani/mezzo-valzer)
  337. # and might serve preview URLs (e.g.
  338. # http://www.soundcloud.com/snbrn/ele)
  339. format_dict = self._download_json(
  340. track_base_url + '/streams', track_id,
  341. 'Downloading track url', query=query, fatal=False) or {}
  342. for key, stream_url in format_dict.items():
  343. if invalid_url(stream_url):
  344. continue
  345. format_urls.add(stream_url)
  346. mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key)
  347. if mobj:
  348. protocol, ext, abr = mobj.groups()
  349. add_format({
  350. 'abr': abr,
  351. 'ext': ext,
  352. 'url': stream_url,
  353. }, protocol)
  354. if not formats:
  355. # We fallback to the stream_url in the original info, this
  356. # cannot be always used, sometimes it can give an HTTP 404 error
  357. urlh = self._request_webpage(
  358. HEADRequest(info.get('stream_url') or track_base_url + '/stream'),
  359. track_id, query=query, fatal=False)
  360. if urlh:
  361. stream_url = urlh.geturl()
  362. if not invalid_url(stream_url):
  363. add_format({'url': stream_url}, 'http')
  364. for f in formats:
  365. f['vcodec'] = 'none'
  366. self._sort_formats(formats)
  367. user = info.get('user') or {}
  368. thumbnails = []
  369. artwork_url = info.get('artwork_url')
  370. thumbnail = artwork_url or user.get('avatar_url')
  371. if isinstance(thumbnail, compat_str):
  372. if re.search(self._IMAGE_REPL_RE, thumbnail):
  373. for image_id, size in self._ARTWORK_MAP.items():
  374. i = {
  375. 'id': image_id,
  376. 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
  377. }
  378. if image_id == 'tiny' and not artwork_url:
  379. size = 18
  380. elif image_id == 'original':
  381. i['preference'] = 10
  382. if size:
  383. i.update({
  384. 'width': size,
  385. 'height': size,
  386. })
  387. thumbnails.append(i)
  388. else:
  389. thumbnails = [{'url': thumbnail}]
  390. def extract_count(key):
  391. return int_or_none(info.get('%s_count' % key))
  392. return {
  393. 'id': track_id,
  394. 'uploader': user.get('username'),
  395. 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
  396. 'uploader_url': user.get('permalink_url'),
  397. 'timestamp': unified_timestamp(info.get('created_at')),
  398. 'title': title,
  399. 'description': info.get('description'),
  400. 'thumbnails': thumbnails,
  401. 'duration': float_or_none(info.get('duration'), 1000),
  402. 'webpage_url': info.get('permalink_url'),
  403. 'license': info.get('license'),
  404. 'view_count': extract_count('playback'),
  405. 'like_count': extract_count('favoritings') or extract_count('likes'),
  406. 'comment_count': extract_count('comment'),
  407. 'repost_count': extract_count('reposts'),
  408. 'genre': info.get('genre'),
  409. 'formats': formats
  410. }
  411. def _real_extract(self, url):
  412. mobj = re.match(self._VALID_URL, url)
  413. track_id = mobj.group('track_id')
  414. query = {
  415. 'client_id': self._CLIENT_ID,
  416. }
  417. if track_id:
  418. info_json_url = self._API_V2_BASE + 'tracks/' + track_id
  419. full_title = track_id
  420. token = mobj.group('secret_token')
  421. if token:
  422. query['secret_token'] = token
  423. else:
  424. full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
  425. token = mobj.group('token')
  426. if token:
  427. resolve_title += '/%s' % token
  428. info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
  429. version = 2
  430. info = self._download_json(
  431. info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False)
  432. if not info:
  433. info = self._download_json(
  434. info_json_url.replace(self._API_V2_BASE, self._API_BASE),
  435. full_title, 'Downloading info JSON', query=query)
  436. version = 1
  437. return self._extract_info_dict(info, full_title, token, version)
  438. class SoundcloudPlaylistBaseIE(SoundcloudIE):
  439. def _extract_track_entries(self, tracks, token=None):
  440. entries = []
  441. for track in tracks:
  442. track_id = str_or_none(track.get('id'))
  443. url = track.get('permalink_url')
  444. if not url:
  445. if not track_id:
  446. continue
  447. url = self._API_V2_BASE + 'tracks/' + track_id
  448. if token:
  449. url += '?secret_token=' + token
  450. entries.append(self.url_result(
  451. url, SoundcloudIE.ie_key(), track_id))
  452. return entries
  453. class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
  454. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
  455. IE_NAME = 'soundcloud:set'
  456. _TESTS = [{
  457. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
  458. 'info_dict': {
  459. 'id': '2284613',
  460. 'title': 'The Royal Concept EP',
  461. },
  462. 'playlist_mincount': 5,
  463. }, {
  464. 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
  465. 'only_matching': True,
  466. }]
  467. def _real_extract(self, url):
  468. mobj = re.match(self._VALID_URL, url)
  469. full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
  470. token = mobj.group('token')
  471. if token:
  472. full_title += '/' + token
  473. info = self._download_json(self._resolv_url(
  474. self._BASE_URL + full_title), full_title)
  475. if 'errors' in info:
  476. msgs = (compat_str(err['error_message']) for err in info['errors'])
  477. raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
  478. entries = self._extract_track_entries(info['tracks'], token)
  479. return self.playlist_result(
  480. entries, str_or_none(info.get('id')), info.get('title'))
  481. class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
  482. def _extract_playlist(self, base_url, playlist_id, playlist_title):
  483. COMMON_QUERY = {
  484. 'limit': 2000000000,
  485. 'client_id': self._CLIENT_ID,
  486. 'linked_partitioning': '1',
  487. }
  488. query = COMMON_QUERY.copy()
  489. query['offset'] = 0
  490. next_href = base_url
  491. entries = []
  492. for i in itertools.count():
  493. response = self._download_json(
  494. next_href, playlist_id,
  495. 'Downloading track page %s' % (i + 1), query=query)
  496. collection = response['collection']
  497. if not isinstance(collection, list):
  498. collection = []
  499. # Empty collection may be returned, in this case we proceed
  500. # straight to next_href
  501. def resolve_entry(candidates):
  502. for cand in candidates:
  503. if not isinstance(cand, dict):
  504. continue
  505. permalink_url = url_or_none(cand.get('permalink_url'))
  506. if not permalink_url:
  507. continue
  508. return self.url_result(
  509. permalink_url,
  510. SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
  511. str_or_none(cand.get('id')), cand.get('title'))
  512. for e in collection:
  513. entry = resolve_entry((e, e.get('track'), e.get('playlist')))
  514. if entry:
  515. entries.append(entry)
  516. next_href = response.get('next_href')
  517. if not next_href:
  518. break
  519. next_href = response['next_href']
  520. parsed_next_href = compat_urlparse.urlparse(next_href)
  521. query = compat_urlparse.parse_qs(parsed_next_href.query)
  522. query.update(COMMON_QUERY)
  523. return {
  524. '_type': 'playlist',
  525. 'id': playlist_id,
  526. 'title': playlist_title,
  527. 'entries': entries,
  528. }
  529. class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
  530. _VALID_URL = r'''(?x)
  531. https?://
  532. (?:(?:www|m)\.)?soundcloud\.com/
  533. (?P<user>[^/]+)
  534. (?:/
  535. (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
  536. )?
  537. /?(?:[?#].*)?$
  538. '''
  539. IE_NAME = 'soundcloud:user'
  540. _TESTS = [{
  541. 'url': 'https://soundcloud.com/soft-cell-official',
  542. 'info_dict': {
  543. 'id': '207965082',
  544. 'title': 'Soft Cell (All)',
  545. },
  546. 'playlist_mincount': 28,
  547. }, {
  548. 'url': 'https://soundcloud.com/soft-cell-official/tracks',
  549. 'info_dict': {
  550. 'id': '207965082',
  551. 'title': 'Soft Cell (Tracks)',
  552. },
  553. 'playlist_mincount': 27,
  554. }, {
  555. 'url': 'https://soundcloud.com/soft-cell-official/albums',
  556. 'info_dict': {
  557. 'id': '207965082',
  558. 'title': 'Soft Cell (Albums)',
  559. },
  560. 'playlist_mincount': 1,
  561. }, {
  562. 'url': 'https://soundcloud.com/jcv246/sets',
  563. 'info_dict': {
  564. 'id': '12982173',
  565. 'title': 'Jordi / cv (Sets)',
  566. },
  567. 'playlist_mincount': 2,
  568. }, {
  569. 'url': 'https://soundcloud.com/jcv246/reposts',
  570. 'info_dict': {
  571. 'id': '12982173',
  572. 'title': 'Jordi / cv (Reposts)',
  573. },
  574. 'playlist_mincount': 6,
  575. }, {
  576. 'url': 'https://soundcloud.com/clalberg/likes',
  577. 'info_dict': {
  578. 'id': '11817582',
  579. 'title': 'clalberg (Likes)',
  580. },
  581. 'playlist_mincount': 5,
  582. }, {
  583. 'url': 'https://soundcloud.com/grynpyret/spotlight',
  584. 'info_dict': {
  585. 'id': '7098329',
  586. 'title': 'Grynpyret (Spotlight)',
  587. },
  588. 'playlist_mincount': 1,
  589. }]
  590. _BASE_URL_MAP = {
  591. 'all': 'stream/users/%s',
  592. 'tracks': 'users/%s/tracks',
  593. 'albums': 'users/%s/albums',
  594. 'sets': 'users/%s/playlists',
  595. 'reposts': 'stream/users/%s/reposts',
  596. 'likes': 'users/%s/likes',
  597. 'spotlight': 'users/%s/spotlight',
  598. }
  599. def _real_extract(self, url):
  600. mobj = re.match(self._VALID_URL, url)
  601. uploader = mobj.group('user')
  602. user = self._download_json(
  603. self._resolv_url(self._BASE_URL + uploader),
  604. uploader, 'Downloading user info')
  605. resource = mobj.group('rsrc') or 'all'
  606. return self._extract_playlist(
  607. self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
  608. str_or_none(user.get('id')),
  609. '%s (%s)' % (user['username'], resource.capitalize()))
  610. class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
  611. _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
  612. IE_NAME = 'soundcloud:trackstation'
  613. _TESTS = [{
  614. 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
  615. 'info_dict': {
  616. 'id': '286017854',
  617. 'title': 'Track station: your text',
  618. },
  619. 'playlist_mincount': 47,
  620. }]
  621. def _real_extract(self, url):
  622. track_name = self._match_id(url)
  623. track = self._download_json(self._resolv_url(url), track_name)
  624. track_id = self._search_regex(
  625. r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
  626. return self._extract_playlist(
  627. self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
  628. track_id, 'Track station: %s' % track['title'])
  629. class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
  630. _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
  631. IE_NAME = 'soundcloud:playlist'
  632. _TESTS = [{
  633. 'url': 'https://api.soundcloud.com/playlists/4110309',
  634. 'info_dict': {
  635. 'id': '4110309',
  636. 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
  637. 'description': 're:.*?TILT Brass - Bowery Poetry Club',
  638. },
  639. 'playlist_count': 6,
  640. }]
  641. def _real_extract(self, url):
  642. mobj = re.match(self._VALID_URL, url)
  643. playlist_id = mobj.group('id')
  644. query = {
  645. 'client_id': self._CLIENT_ID,
  646. }
  647. token = mobj.group('token')
  648. if token:
  649. query['secret_token'] = token
  650. data = self._download_json(
  651. self._API_V2_BASE + 'playlists/' + playlist_id,
  652. playlist_id, 'Downloading playlist', query=query)
  653. entries = self._extract_track_entries(data['tracks'], token)
  654. return self.playlist_result(
  655. entries, playlist_id, data.get('title'), data.get('description'))
  656. class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
  657. IE_NAME = 'soundcloud:search'
  658. IE_DESC = 'Soundcloud search'
  659. _MAX_RESULTS = float('inf')
  660. _TESTS = [{
  661. 'url': 'scsearch15:post-avant jazzcore',
  662. 'info_dict': {
  663. 'title': 'post-avant jazzcore',
  664. },
  665. 'playlist_count': 15,
  666. }]
  667. _SEARCH_KEY = 'scsearch'
  668. _MAX_RESULTS_PER_PAGE = 200
  669. _DEFAULT_RESULTS_PER_PAGE = 50
  670. def _get_collection(self, endpoint, collection_id, **query):
  671. limit = min(
  672. query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
  673. self._MAX_RESULTS_PER_PAGE)
  674. query.update({
  675. 'limit': limit,
  676. 'client_id': self._CLIENT_ID,
  677. 'linked_partitioning': 1,
  678. 'offset': 0,
  679. })
  680. next_url = update_url_query(self._API_V2_BASE + endpoint, query)
  681. collected_results = 0
  682. for i in itertools.count(1):
  683. response = self._download_json(
  684. next_url, collection_id, 'Downloading page {0}'.format(i),
  685. 'Unable to download API page')
  686. collection = response.get('collection', [])
  687. if not collection:
  688. break
  689. collection = list(filter(bool, collection))
  690. collected_results += len(collection)
  691. for item in collection:
  692. yield self.url_result(item['uri'], SoundcloudIE.ie_key())
  693. if not collection or collected_results >= limit:
  694. break
  695. next_url = response.get('next_href')
  696. if not next_url:
  697. break
  698. def _get_n_results(self, query, n):
  699. tracks = self._get_collection('search/tracks', query, limit=n, q=query)
  700. return self.playlist_result(tracks, playlist_title=query)