You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

187 lines
6.9 KiB

10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import (
  6. compat_str,
  7. compat_urlparse,
  8. )
  9. from ..utils import (
  10. ExtractorError,
  11. float_or_none,
  12. int_or_none,
  13. )
  14. class BandcampIE(InfoExtractor):
  15. _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
  16. _TESTS = [{
  17. 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  18. 'md5': 'c557841d5e50261777a6585648adf439',
  19. 'info_dict': {
  20. 'id': '1812978515',
  21. 'ext': 'mp3',
  22. 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
  23. 'duration': 9.8485,
  24. },
  25. '_skip': 'There is a limit of 200 free downloads / month for the test song'
  26. }, {
  27. 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
  28. 'md5': '73d0b3171568232574e45652f8720b5c',
  29. 'info_dict': {
  30. 'id': '2650410135',
  31. 'ext': 'mp3',
  32. 'title': 'Lanius (Battle)',
  33. 'uploader': 'Ben Prunty Music',
  34. },
  35. }]
  36. def _real_extract(self, url):
  37. mobj = re.match(self._VALID_URL, url)
  38. title = mobj.group('title')
  39. webpage = self._download_webpage(url, title)
  40. m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
  41. if not m_download:
  42. m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
  43. if m_trackinfo:
  44. json_code = m_trackinfo.group(1)
  45. data = json.loads(json_code)[0]
  46. track_id = compat_str(data['id'])
  47. if not data.get('file'):
  48. raise ExtractorError('Not streamable', video_id=track_id, expected=True)
  49. formats = []
  50. for format_id, format_url in data['file'].items():
  51. ext, abr_str = format_id.split('-', 1)
  52. formats.append({
  53. 'format_id': format_id,
  54. 'url': self._proto_relative_url(format_url, 'http:'),
  55. 'ext': ext,
  56. 'vcodec': 'none',
  57. 'acodec': ext,
  58. 'abr': int_or_none(abr_str),
  59. })
  60. self._sort_formats(formats)
  61. return {
  62. 'id': track_id,
  63. 'title': data['title'],
  64. 'formats': formats,
  65. 'duration': float_or_none(data.get('duration')),
  66. }
  67. else:
  68. raise ExtractorError('No free songs found')
  69. download_link = m_download.group(1)
  70. video_id = self._search_regex(
  71. r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
  72. webpage, 'video id')
  73. download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
  74. # We get the dictionary of the track from some javascript code
  75. all_info = self._parse_json(self._search_regex(
  76. r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id)
  77. info = all_info[0]
  78. # We pick mp3-320 for now, until format selection can be easily implemented.
  79. mp3_info = info['downloads']['mp3-320']
  80. # If we try to use this url it says the link has expired
  81. initial_url = mp3_info['url']
  82. m_url = re.match(
  83. r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
  84. initial_url)
  85. # We build the url we will use to get the final track url
  86. # This url is build in Bandcamp in the script download_bunde_*.js
  87. request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
  88. final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
  89. # If we could correctly generate the .rand field the url would be
  90. # in the "download_url" key
  91. final_url = self._proto_relative_url(self._search_regex(
  92. r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:')
  93. return {
  94. 'id': video_id,
  95. 'title': info['title'],
  96. 'ext': 'mp3',
  97. 'vcodec': 'none',
  98. 'url': final_url,
  99. 'thumbnail': info.get('thumb_url'),
  100. 'uploader': info.get('artist'),
  101. }
  102. class BandcampAlbumIE(InfoExtractor):
  103. IE_NAME = 'Bandcamp:album'
  104. _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
  105. _TESTS = [{
  106. 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  107. 'playlist': [
  108. {
  109. 'md5': '39bc1eded3476e927c724321ddf116cf',
  110. 'info_dict': {
  111. 'id': '1353101989',
  112. 'ext': 'mp3',
  113. 'title': 'Intro',
  114. }
  115. },
  116. {
  117. 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
  118. 'info_dict': {
  119. 'id': '38097443',
  120. 'ext': 'mp3',
  121. 'title': 'Kero One - Keep It Alive (Blazo remix)',
  122. }
  123. },
  124. ],
  125. 'info_dict': {
  126. 'title': 'Jazz Format Mixtape vol.1',
  127. 'id': 'jazz-format-mixtape-vol-1',
  128. 'uploader_id': 'blazo',
  129. },
  130. 'params': {
  131. 'playlistend': 2
  132. },
  133. 'skip': 'Bandcamp imposes download limits.'
  134. }, {
  135. 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
  136. 'info_dict': {
  137. 'title': 'Hierophany of the Open Grave',
  138. 'uploader_id': 'nightbringer',
  139. 'id': 'hierophany-of-the-open-grave',
  140. },
  141. 'playlist_mincount': 9,
  142. }, {
  143. 'url': 'http://dotscale.bandcamp.com',
  144. 'info_dict': {
  145. 'title': 'Loom',
  146. 'id': 'dotscale',
  147. 'uploader_id': 'dotscale',
  148. },
  149. 'playlist_mincount': 7,
  150. }]
  151. def _real_extract(self, url):
  152. mobj = re.match(self._VALID_URL, url)
  153. uploader_id = mobj.group('subdomain')
  154. album_id = mobj.group('album_id')
  155. playlist_id = album_id or uploader_id
  156. webpage = self._download_webpage(url, playlist_id)
  157. tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
  158. if not tracks_paths:
  159. raise ExtractorError('The page doesn\'t contain any tracks')
  160. entries = [
  161. self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
  162. for t_path in tracks_paths]
  163. title = self._search_regex(
  164. r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
  165. return {
  166. '_type': 'playlist',
  167. 'uploader_id': uploader_id,
  168. 'id': playlist_id,
  169. 'title': title,
  170. 'entries': entries,
  171. }