You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

354 lines
12 KiB

  1. from __future__ import unicode_literals
  2. import json
  3. import random
  4. import re
  5. import time
  6. from .common import InfoExtractor
  7. from ..compat import (
  8. compat_str,
  9. compat_urlparse,
  10. )
  11. from ..utils import (
  12. ExtractorError,
  13. float_or_none,
  14. int_or_none,
  15. KNOWN_EXTENSIONS,
  16. parse_filesize,
  17. unescapeHTML,
  18. update_url_query,
  19. unified_strdate,
  20. )
  21. class BandcampIE(InfoExtractor):
  22. _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
  23. _TESTS = [{
  24. 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  25. 'md5': 'c557841d5e50261777a6585648adf439',
  26. 'info_dict': {
  27. 'id': '1812978515',
  28. 'ext': 'mp3',
  29. 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
  30. 'duration': 9.8485,
  31. },
  32. '_skip': 'There is a limit of 200 free downloads / month for the test song'
  33. }, {
  34. 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
  35. 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
  36. 'info_dict': {
  37. 'id': '2650410135',
  38. 'ext': 'aiff',
  39. 'title': 'Ben Prunty - Lanius (Battle)',
  40. 'uploader': 'Ben Prunty',
  41. },
  42. }]
  43. def _real_extract(self, url):
  44. mobj = re.match(self._VALID_URL, url)
  45. title = mobj.group('title')
  46. webpage = self._download_webpage(url, title)
  47. thumbnail = self._html_search_meta('og:image', webpage, default=None)
  48. m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
  49. if not m_download:
  50. m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
  51. if m_trackinfo:
  52. json_code = m_trackinfo.group(1)
  53. data = json.loads(json_code)[0]
  54. track_id = compat_str(data['id'])
  55. if not data.get('file'):
  56. raise ExtractorError('Not streamable', video_id=track_id, expected=True)
  57. formats = []
  58. for format_id, format_url in data['file'].items():
  59. ext, abr_str = format_id.split('-', 1)
  60. formats.append({
  61. 'format_id': format_id,
  62. 'url': self._proto_relative_url(format_url, 'http:'),
  63. 'ext': ext,
  64. 'vcodec': 'none',
  65. 'acodec': ext,
  66. 'abr': int_or_none(abr_str),
  67. })
  68. self._sort_formats(formats)
  69. return {
  70. 'id': track_id,
  71. 'title': data['title'],
  72. 'thumbnail': thumbnail,
  73. 'formats': formats,
  74. 'duration': float_or_none(data.get('duration')),
  75. }
  76. else:
  77. raise ExtractorError('No free songs found')
  78. download_link = m_download.group(1)
  79. video_id = self._search_regex(
  80. r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
  81. webpage, 'video id')
  82. download_webpage = self._download_webpage(
  83. download_link, video_id, 'Downloading free downloads page')
  84. blob = self._parse_json(
  85. self._search_regex(
  86. r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
  87. 'blob', group='blob'),
  88. video_id, transform_source=unescapeHTML)
  89. info = blob['digital_items'][0]
  90. downloads = info['downloads']
  91. track = info['title']
  92. artist = info.get('artist')
  93. title = '%s - %s' % (artist, track) if artist else track
  94. download_formats = {}
  95. for f in blob['download_formats']:
  96. name, ext = f.get('name'), f.get('file_extension')
  97. if all(isinstance(x, compat_str) for x in (name, ext)):
  98. download_formats[name] = ext.strip('.')
  99. formats = []
  100. for format_id, f in downloads.items():
  101. format_url = f.get('url')
  102. if not format_url:
  103. continue
  104. # Stat URL generation algorithm is reverse engineered from
  105. # download_*_bundle_*.js
  106. stat_url = update_url_query(
  107. format_url.replace('/download/', '/statdownload/'), {
  108. '.rand': int(time.time() * 1000 * random.random()),
  109. })
  110. format_id = f.get('encoding_name') or format_id
  111. stat = self._download_json(
  112. stat_url, video_id, 'Downloading %s JSON' % format_id,
  113. transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
  114. fatal=False)
  115. if not stat:
  116. continue
  117. retry_url = stat.get('retry_url')
  118. if not isinstance(retry_url, compat_str):
  119. continue
  120. formats.append({
  121. 'url': self._proto_relative_url(retry_url, 'http:'),
  122. 'ext': download_formats.get(format_id),
  123. 'format_id': format_id,
  124. 'format_note': f.get('description'),
  125. 'filesize': parse_filesize(f.get('size_mb')),
  126. 'vcodec': 'none',
  127. })
  128. self._sort_formats(formats)
  129. return {
  130. 'id': video_id,
  131. 'title': title,
  132. 'thumbnail': info.get('thumb_url') or thumbnail,
  133. 'uploader': info.get('artist'),
  134. 'artist': artist,
  135. 'track': track,
  136. 'formats': formats,
  137. }
  138. class BandcampAlbumIE(InfoExtractor):
  139. IE_NAME = 'Bandcamp:album'
  140. _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
  141. _TESTS = [{
  142. 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  143. 'playlist': [
  144. {
  145. 'md5': '39bc1eded3476e927c724321ddf116cf',
  146. 'info_dict': {
  147. 'id': '1353101989',
  148. 'ext': 'mp3',
  149. 'title': 'Intro',
  150. }
  151. },
  152. {
  153. 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
  154. 'info_dict': {
  155. 'id': '38097443',
  156. 'ext': 'mp3',
  157. 'title': 'Kero One - Keep It Alive (Blazo remix)',
  158. }
  159. },
  160. ],
  161. 'info_dict': {
  162. 'title': 'Jazz Format Mixtape vol.1',
  163. 'id': 'jazz-format-mixtape-vol-1',
  164. 'uploader_id': 'blazo',
  165. },
  166. 'params': {
  167. 'playlistend': 2
  168. },
  169. 'skip': 'Bandcamp imposes download limits.'
  170. }, {
  171. 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
  172. 'info_dict': {
  173. 'title': 'Hierophany of the Open Grave',
  174. 'uploader_id': 'nightbringer',
  175. 'id': 'hierophany-of-the-open-grave',
  176. },
  177. 'playlist_mincount': 9,
  178. }, {
  179. 'url': 'http://dotscale.bandcamp.com',
  180. 'info_dict': {
  181. 'title': 'Loom',
  182. 'id': 'dotscale',
  183. 'uploader_id': 'dotscale',
  184. },
  185. 'playlist_mincount': 7,
  186. }, {
  187. # with escaped quote in title
  188. 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
  189. 'info_dict': {
  190. 'title': '"Entropy" EP',
  191. 'uploader_id': 'jstrecords',
  192. 'id': 'entropy-ep',
  193. },
  194. 'playlist_mincount': 3,
  195. }, {
  196. # not all tracks have songs
  197. 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
  198. 'info_dict': {
  199. 'id': 'we-are-the-plague',
  200. 'title': 'WE ARE THE PLAGUE',
  201. 'uploader_id': 'insulters',
  202. },
  203. 'playlist_count': 2,
  204. }]
  205. @classmethod
  206. def suitable(cls, url):
  207. return (False
  208. if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
  209. else super(BandcampAlbumIE, cls).suitable(url))
  210. def _real_extract(self, url):
  211. mobj = re.match(self._VALID_URL, url)
  212. uploader_id = mobj.group('subdomain')
  213. album_id = mobj.group('album_id')
  214. playlist_id = album_id or uploader_id
  215. webpage = self._download_webpage(url, playlist_id)
  216. track_elements = re.findall(
  217. r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
  218. if not track_elements:
  219. raise ExtractorError('The page doesn\'t contain any tracks')
  220. # Only tracks with duration info have songs
  221. entries = [
  222. self.url_result(
  223. compat_urlparse.urljoin(url, t_path),
  224. ie=BandcampIE.ie_key(),
  225. video_title=self._search_regex(
  226. r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
  227. elem_content, 'track title', fatal=False))
  228. for elem_content, t_path in track_elements
  229. if self._html_search_meta('duration', elem_content, default=None)]
  230. title = self._html_search_regex(
  231. r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
  232. webpage, 'title', fatal=False)
  233. if title:
  234. title = title.replace(r'\"', '"')
  235. return {
  236. '_type': 'playlist',
  237. 'uploader_id': uploader_id,
  238. 'id': playlist_id,
  239. 'title': title,
  240. 'entries': entries,
  241. }
  242. class BandcampWeeklyIE(InfoExtractor):
  243. IE_NAME = 'Bandcamp:weekly'
  244. _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
  245. _TESTS = [{
  246. 'url': 'https://bandcamp.com/?show=224',
  247. 'md5': 'b00df799c733cf7e0c567ed187dea0fd',
  248. 'info_dict': {
  249. 'id': '224',
  250. 'ext': 'opus',
  251. 'title': 'BC Weekly April 4th 2017 - Magic Moments',
  252. 'description': 'md5:5d48150916e8e02d030623a48512c874',
  253. 'duration': 5829.77,
  254. 'release_date': '20170404',
  255. 'series': 'Bandcamp Weekly',
  256. 'episode': 'Magic Moments',
  257. 'episode_number': 208,
  258. 'episode_id': '224',
  259. }
  260. }, {
  261. 'url': 'https://bandcamp.com/?blah/blah@&show=228',
  262. 'only_matching': True
  263. }]
  264. def _real_extract(self, url):
  265. video_id = self._match_id(url)
  266. webpage = self._download_webpage(url, video_id)
  267. blob = self._parse_json(
  268. self._search_regex(
  269. r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
  270. 'blob', group='blob'),
  271. video_id, transform_source=unescapeHTML)
  272. show = blob['bcw_show']
  273. # This is desired because any invalid show id redirects to `bandcamp.com`
  274. # which happens to expose the latest Bandcamp Weekly episode.
  275. show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
  276. formats = []
  277. for format_id, format_url in show['audio_stream'].items():
  278. if not isinstance(format_url, compat_str):
  279. continue
  280. for known_ext in KNOWN_EXTENSIONS:
  281. if known_ext in format_id:
  282. ext = known_ext
  283. break
  284. else:
  285. ext = None
  286. formats.append({
  287. 'format_id': format_id,
  288. 'url': format_url,
  289. 'ext': ext,
  290. 'vcodec': 'none',
  291. })
  292. self._sort_formats(formats)
  293. title = show.get('audio_title') or 'Bandcamp Weekly'
  294. subtitle = show.get('subtitle')
  295. if subtitle:
  296. title += ' - %s' % subtitle
  297. episode_number = None
  298. seq = blob.get('bcw_seq')
  299. if seq and isinstance(seq, list):
  300. try:
  301. episode_number = next(
  302. int_or_none(e.get('episode_number'))
  303. for e in seq
  304. if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
  305. except StopIteration:
  306. pass
  307. return {
  308. 'id': video_id,
  309. 'title': title,
  310. 'description': show.get('desc') or show.get('short_desc'),
  311. 'duration': float_or_none(show.get('audio_duration')),
  312. 'is_live': False,
  313. 'release_date': unified_strdate(show.get('published_date')),
  314. 'series': 'Bandcamp Weekly',
  315. 'episode': show.get('subtitle'),
  316. 'episode_number': episode_number,
  317. 'episode_id': compat_str(video_id),
  318. 'formats': formats
  319. }