Browse Source

[vshare] Fix extraction (closes #14473)

master-ytdl-org
Timendum 7 years ago
committed by Sergey M
parent
commit
0987f2ddb2
2 changed files with 44 additions and 8 deletions
  1. +16
    -0
      youtube_dl/extractor/generic.py
  2. +28
    -8
      youtube_dl/extractor/vshare.py

+ 16
- 0
youtube_dl/extractor/generic.py View File

@ -102,6 +102,7 @@ from .joj import JojIE
from .megaphone import MegaphoneIE from .megaphone import MegaphoneIE
from .vzaar import VzaarIE from .vzaar import VzaarIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .vshare import VShareIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1921,6 +1922,16 @@ class GenericIE(InfoExtractor):
'title': 'Rescue Kit 14 Free Edition - Getting started', 'title': 'Rescue Kit 14 Free Edition - Getting started',
}, },
'playlist_count': 4, 'playlist_count': 4,
},
{
# vshare embed
'url': 'https://youtube-dl-demo.neocities.org/vshare.html',
'md5': '17b39f55b5497ae8b59f5fbce8e35886',
'info_dict': {
'id': '0f64ce6',
'title': 'vl14062007715967',
'ext': 'mp4',
}
} }
# { # {
# # TODO: find another test # # TODO: find another test
@ -2879,6 +2890,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) channel9_urls, video_id, video_title, ie=Channel9IE.ie_key())
vshare_urls = VShareIE._extract_urls(webpage)
if vshare_urls:
return self.playlist_from_matches(
vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}
for k, v in dict1.items(): for k, v in dict1.items():


+ 28
- 8
youtube_dl/extractor/vshare.py View File

@ -1,14 +1,18 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_chr
from ..utils import decode_packed_codes
class VShareIE(InfoExtractor): class VShareIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://vshare.io/d/0f64ce6', 'url': 'https://vshare.io/d/0f64ce6',
'md5': '16d7b8fef58846db47419199ff1ab3e7',
'md5': '17b39f55b5497ae8b59f5fbce8e35886',
'info_dict': { 'info_dict': {
'id': '0f64ce6', 'id': '0f64ce6',
'title': 'vl14062007715967', 'title': 'vl14062007715967',
@ -19,20 +23,36 @@ class VShareIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _extract_packed(self, webpage):
    """Decode the obfuscated markup carried by a packed script in the page.

    The page is searched for a packed ``eval(function...`` script, which is
    handed to ``decode_packed_codes``. The unpacked text is expected to
    contain a bracketed, comma-separated list of integers and a
    ``fromCharCode(...)`` call ending in a number; that number is subtracted
    from every integer and the results are turned into characters.

    :param webpage: text of the downloaded vshare page.
    :return: the decoded characters joined into one string.
    """
    packed = self._search_regex(r'(eval\(function.+)', webpage, 'packed code')
    unpacked = decode_packed_codes(packed)
    digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits')
    key_digit = self._search_regex(r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit')
    # Convert the offset once instead of once per decoded character.
    offset = int(key_digit)
    # The digits pattern also accepts a trailing comma ("1,2,"), which
    # leaves an empty final field after split(); skip empty fields so
    # int() is never fed an empty string.
    codes = [int(digit) for digit in digits.split(',') if digit]
    return ''.join(compat_chr(code - offset) for code in codes)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( webpage = self._download_webpage(
'https://vshare.io/d/%s' % video_id, video_id)
'https://vshare.io/v/%s/width-650/height-430/1' % video_id, video_id)
title = self._html_search_regex(
r'(?s)<div id="root-container">(.+?)<br/>', webpage, 'title')
video_url = self._search_regex(
r'<a[^>]+href=(["\'])(?P<url>(?:https?:)?//.+?)\1[^>]*>[Cc]lick\s+here',
webpage, 'video url', group='url')
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
title = title.split(' - ')[0]
unpacked = self._extract_packed(webpage)
video_urls = re.findall(r'<source src="([^"]+)', unpacked)
formats = [{'url': video_url} for video_url in video_urls]
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'url': video_url,
'formats': formats,
} }
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
webpage)

Loading…
Cancel
Save