import re
import os
import socket

from .common import InfoExtractor
from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    ExtractorError,
)


class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
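
A minimal sketch (not part of the file) of how _VALID_URL is intended to match a depositfiles download link; the example URL and the youtube_dl module path used in the import are assumptions:

import re
from youtube_dl.extractor.depositfiles import DepositFilesIE  # assumed location of this module

# The capture group carries the file identifier that _real_extract() later
# re-derives by splitting the URL; the two-letter locale segment ('en/') is optional.
mobj = re.match(DepositFilesIE._VALID_URL, 'http://depositfiles.com/en/files/abc123')
print(mobj.group(1))  # -> 'abc123'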