You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

65 lines
2.1 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. from .common import InfoExtractor
  4. from ..utils import determine_ext
  5. class LibraryOfCongressIE(InfoExtractor):
  6. _VALID_URL = r'https?://(?:www\.)?loc\.gov/item/(?P<id>[0-9]+)'
  7. _TESTS = [{
  8. 'url': 'http://loc.gov/item/90716351/',
  9. 'info_dict': {
  10. 'id': '90716351',
  11. 'ext': 'mp4',
  12. 'title': 'Pa\'s trip to Mars /'
  13. },
  14. 'params': {
  15. # m3u8 download
  16. 'skip_download': True,
  17. }
  18. }, {
  19. 'url': 'https://www.loc.gov/item/97516576/',
  20. 'only_matching': True,
  21. }]
  22. def _real_extract(self, url):
  23. video_id = self._match_id(url)
  24. webpage = self._download_webpage(url, video_id)
  25. self.report_extraction(video_id)
  26. json_id = self._search_regex('media-player-([0-9A-Z]{32})', webpage, 'json id')
  27. data = self._parse_json(self._download_webpage(
  28. 'https://media.loc.gov/services/v1/media?id=%s' % json_id,
  29. video_id), video_id)
  30. data = data['mediaObject']
  31. media_url = data['derivatives'][0]['derivativeUrl']
  32. media_url = media_url.replace('rtmp', 'https')
  33. is_video = data['mediaType'].lower() == 'v'
  34. if not determine_ext(media_url) in ('mp4', 'mp3'):
  35. media_url += '.mp4' if is_video else '.mp3'
  36. if media_url.index('vod/mp4:') > -1:
  37. media_url = media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8'
  38. elif url.index('vod/mp3:') > -1:
  39. media_url = media_url.replace('vod/mp3:', '')
  40. formats = []
  41. if determine_ext(media_url) == 'm3u8':
  42. formats = self._extract_m3u8_formats(media_url, video_id, ext='mp4')
  43. elif determine_ext(media_url) is 'mp3':
  44. formats.append({
  45. 'url': media_url,
  46. 'ext': 'mp3',
  47. })
  48. return {
  49. 'id': video_id,
  50. 'thumbnail': self._og_search_thumbnail(webpage),
  51. 'title': self._og_search_title(webpage),
  52. 'formats': formats,
  53. }