You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

74 lines
2.7 KiB

  1. import re
  2. import socket
  3. import xml.etree.ElementTree
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. compat_http_client,
  7. compat_str,
  8. compat_urllib_error,
  9. compat_urllib_parse_urlparse,
  10. compat_urllib_request,
  11. ExtractorError,
  12. )
  13. class CollegeHumorIE(InfoExtractor):
  14. _WORKING = False
  15. _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
  16. def report_manifest(self, video_id):
  17. """Report information extraction."""
  18. self.to_screen(u'%s: Downloading XML manifest' % video_id)
  19. def _real_extract(self, url):
  20. mobj = re.match(self._VALID_URL, url)
  21. if mobj is None:
  22. raise ExtractorError(u'Invalid URL: %s' % url)
  23. video_id = mobj.group('videoid')
  24. info = {
  25. 'id': video_id,
  26. 'uploader': None,
  27. 'upload_date': None,
  28. }
  29. self.report_extraction(video_id)
  30. xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
  31. try:
  32. metaXml = compat_urllib_request.urlopen(xmlUrl).read()
  33. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  34. raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
  35. mdoc = xml.etree.ElementTree.fromstring(metaXml)
  36. try:
  37. videoNode = mdoc.findall('./video')[0]
  38. info['description'] = videoNode.findall('./description')[0].text
  39. info['title'] = videoNode.findall('./caption')[0].text
  40. info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
  41. manifest_url = videoNode.findall('./file')[0].text
  42. except IndexError:
  43. raise ExtractorError(u'Invalid metadata XML file')
  44. manifest_url += '?hdcore=2.10.3'
  45. self.report_manifest(video_id)
  46. try:
  47. manifestXml = compat_urllib_request.urlopen(manifest_url).read()
  48. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  49. raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
  50. adoc = xml.etree.ElementTree.fromstring(manifestXml)
  51. try:
  52. media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
  53. node_id = media_node.attrib['url']
  54. video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
  55. except IndexError as err:
  56. raise ExtractorError(u'Invalid manifest file')
  57. url_pr = compat_urllib_parse_urlparse(manifest_url)
  58. url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
  59. info['url'] = url
  60. info['ext'] = 'f4f'
  61. return [info]