You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

183 lines
6.7 KiB

10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import (
  6. compat_str,
  7. compat_urlparse,
  8. )
  9. from ..utils import (
  10. ExtractorError,
  11. float_or_none,
  12. int_or_none,
  13. )
  14. class BandcampIE(InfoExtractor):
  15. _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
  16. _TESTS = [{
  17. 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  18. 'md5': 'c557841d5e50261777a6585648adf439',
  19. 'info_dict': {
  20. 'id': '1812978515',
  21. 'ext': 'mp3',
  22. 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
  23. 'duration': 9.8485,
  24. },
  25. '_skip': 'There is a limit of 200 free downloads / month for the test song'
  26. }, {
  27. 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
  28. 'md5': '2b68e5851514c20efdff2afc5603b8b4',
  29. 'info_dict': {
  30. 'id': '2650410135',
  31. 'ext': 'mp3',
  32. 'title': 'Lanius (Battle)',
  33. 'uploader': 'Ben Prunty Music',
  34. },
  35. }]
  36. def _real_extract(self, url):
  37. mobj = re.match(self._VALID_URL, url)
  38. title = mobj.group('title')
  39. webpage = self._download_webpage(url, title)
  40. m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
  41. if not m_download:
  42. m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
  43. if m_trackinfo:
  44. json_code = m_trackinfo.group(1)
  45. data = json.loads(json_code)[0]
  46. formats = []
  47. for format_id, format_url in data['file'].items():
  48. ext, abr_str = format_id.split('-', 1)
  49. formats.append({
  50. 'format_id': format_id,
  51. 'url': self._proto_relative_url(format_url, 'http:'),
  52. 'ext': ext,
  53. 'vcodec': 'none',
  54. 'acodec': ext,
  55. 'abr': int_or_none(abr_str),
  56. })
  57. self._sort_formats(formats)
  58. return {
  59. 'id': compat_str(data['id']),
  60. 'title': data['title'],
  61. 'formats': formats,
  62. 'duration': float_or_none(data.get('duration')),
  63. }
  64. else:
  65. raise ExtractorError('No free songs found')
  66. download_link = m_download.group(1)
  67. video_id = self._search_regex(
  68. r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
  69. webpage, 'video id')
  70. download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
  71. # We get the dictionary of the track from some javascript code
  72. all_info = self._parse_json(self._search_regex(
  73. r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id)
  74. info = all_info[0]
  75. # We pick mp3-320 for now, until format selection can be easily implemented.
  76. mp3_info = info['downloads']['mp3-320']
  77. # If we try to use this url it says the link has expired
  78. initial_url = mp3_info['url']
  79. m_url = re.match(
  80. r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
  81. initial_url)
  82. # We build the url we will use to get the final track url
  83. # This url is build in Bandcamp in the script download_bunde_*.js
  84. request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
  85. final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
  86. # If we could correctly generate the .rand field the url would be
  87. # in the "download_url" key
  88. final_url = self._proto_relative_url(self._search_regex(
  89. r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:')
  90. return {
  91. 'id': video_id,
  92. 'title': info['title'],
  93. 'ext': 'mp3',
  94. 'vcodec': 'none',
  95. 'url': final_url,
  96. 'thumbnail': info.get('thumb_url'),
  97. 'uploader': info.get('artist'),
  98. }
  99. class BandcampAlbumIE(InfoExtractor):
  100. IE_NAME = 'Bandcamp:album'
  101. _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
  102. _TESTS = [{
  103. 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  104. 'playlist': [
  105. {
  106. 'md5': '39bc1eded3476e927c724321ddf116cf',
  107. 'info_dict': {
  108. 'id': '1353101989',
  109. 'ext': 'mp3',
  110. 'title': 'Intro',
  111. }
  112. },
  113. {
  114. 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
  115. 'info_dict': {
  116. 'id': '38097443',
  117. 'ext': 'mp3',
  118. 'title': 'Kero One - Keep It Alive (Blazo remix)',
  119. }
  120. },
  121. ],
  122. 'info_dict': {
  123. 'title': 'Jazz Format Mixtape vol.1',
  124. 'id': 'jazz-format-mixtape-vol-1',
  125. 'uploader_id': 'blazo',
  126. },
  127. 'params': {
  128. 'playlistend': 2
  129. },
  130. 'skip': 'Bandcamp imposes download limits.'
  131. }, {
  132. 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
  133. 'info_dict': {
  134. 'title': 'Hierophany of the Open Grave',
  135. 'uploader_id': 'nightbringer',
  136. 'id': 'hierophany-of-the-open-grave',
  137. },
  138. 'playlist_mincount': 9,
  139. }, {
  140. 'url': 'http://dotscale.bandcamp.com',
  141. 'info_dict': {
  142. 'title': 'Loom',
  143. 'id': 'dotscale',
  144. 'uploader_id': 'dotscale',
  145. },
  146. 'playlist_mincount': 7,
  147. }]
  148. def _real_extract(self, url):
  149. mobj = re.match(self._VALID_URL, url)
  150. uploader_id = mobj.group('subdomain')
  151. album_id = mobj.group('album_id')
  152. playlist_id = album_id or uploader_id
  153. webpage = self._download_webpage(url, playlist_id)
  154. tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
  155. if not tracks_paths:
  156. raise ExtractorError('The page doesn\'t contain any tracks')
  157. entries = [
  158. self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
  159. for t_path in tracks_paths]
  160. title = self._search_regex(
  161. r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
  162. return {
  163. '_type': 'playlist',
  164. 'uploader_id': uploader_id,
  165. 'id': playlist_id,
  166. 'title': title,
  167. 'entries': entries,
  168. }