From 5425626790a46f9b5bdecf4e33bb254c4c2423ea Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 18 Jul 2014 10:24:28 +0200 Subject: [PATCH] [youtube] Move swfinterp into its own file --- test/test_youtube_signature.py | 12 +- youtube_dl/extractor/youtube.py | 454 +--------------------------- youtube_dl/swfinterp.py | 503 ++++++++++++++++++++++++++++++++ 3 files changed, 516 insertions(+), 453 deletions(-) create mode 100644 youtube_dl/swfinterp.py diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index d95533959..e443e0be8 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -45,6 +45,12 @@ _TESTS = [ u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', u'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), + ( + u'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf', + u'swf', + 86, + u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"' + ), ] @@ -57,12 +63,12 @@ class TestSignature(unittest.TestCase): def make_tfunc(url, stype, sig_input, expected_sig): - basename = url.rpartition('/')[2] - m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) - assert m, '%r should follow URL format' % basename + m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3)?\.[a-z]+$', url) + assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): + basename = 'player-%s.%s' % (test_id, stype) fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 16f4a047d..623056bd9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,6 +14,7 @@ import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter +from ..swfinterp import SWFInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -450,457 +451,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - u'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError(u'Unsupported compression format %r' % - file_contents[:1]) - - def extract_tags(content): - pos = 0 - while pos < len(content): - header16 = struct.unpack('> 6 - tag_len = header16 & 0x3f - if tag_len == 0x3f: - tag_len = struct.unpack('= 0x80) else b'\x00' - return struct.unpack('!i', first_byte + bs) - - def read_string(reader=None): - if reader is None: - reader = code_reader - slen = u30(reader) - resb = reader.read(slen) - assert len(resb) == slen - return resb.decode('utf-8') - - def read_bytes(count, reader=None): - if reader is None: - reader = code_reader - resb = reader.read(count) - assert len(resb) == count - return resb - - def read_byte(reader=None): - resb = read_bytes(1, reader=reader) - res = struct.unpack('> 4 - methods = {} - if kind in [0x00, 0x06]: # Slot or Const - u30() # Slot id - u30() # type_name_idx - vindex = u30() - if vindex != 0: - read_byte() # vkind - elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - u30() # disp_id - method_idx = u30() - methods[multinames[trait_name_idx]] = method_idx - elif kind == 0x04: # Class - u30() # slot_id - u30() # classi - elif kind == 0x05: # Function - u30() # slot_id - function_idx = u30() - methods[function_idx] = multinames[trait_name_idx] - else: - raise ExtractorError(u'Unsupported trait kind %d' % kind) - - if attrs & 0x4 != 0: # Metadata present - metadata_count = u30() - for _c3 in range(metadata_count): - u30() # metadata index - - return methods - - class AVMClass(object): - def __init__(self, name_idx): - self.name_idx = name_idx - self.method_names = {} - self.method_idxs = {} - self.methods = {} - self.method_pyfunctions = {} - self.variables = {} - - @property - def name(self): - return multinames[self.name_idx] - - # Classes - class_count = u30() - classes = [] - for class_id in range(class_count): - name_idx = u30() - classes.append(AVMClass(name_idx)) - u30() # super_name idx - flags = read_byte() - if flags & 0x08 != 0: # Protected namespace is present - u30() # protected_ns_idx - intrf_count = u30() - for _c2 in range(intrf_count): - u30() - u30() # iinit - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - assert len(classes) == class_count - + swfi = SWFInterpreter(file_contents) TARGET_CLASSNAME = u'SignatureDecipher' - searched_class = next( - c for c in classes if c.name == TARGET_CLASSNAME) - if searched_class is None: - raise ExtractorError(u'Target class %r not found' % - TARGET_CLASSNAME) - - for avm_class in classes: - u30() # cinit - trait_count = u30() - for _c2 in range(trait_count): - trait_methods = parse_traits_info() - avm_class.method_names.update(trait_methods.items()) - avm_class.method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) - - # Scripts - script_count = u30() - for _c in range(script_count): - u30() # init - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - # Method bodies - method_body_count = u30() - Method = collections.namedtuple('Method', ['code', 'local_count']) - for _c in range(method_body_count): - method_idx = u30() - u30() # max_stack - local_count = u30() - u30() # init_scope_depth - u30() # max_scope_depth - code_length = u30() - code = read_bytes(code_length) - for avm_class in classes: - if method_idx in avm_class.method_idxs: - m = Method(code, local_count) - avm_class.methods[avm_class.method_idxs[method_idx]] = m - exception_count = u30() - for _c2 in range(exception_count): - u30() # from - u30() # to - u30() # target - u30() # exc_type - u30() # var_name - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - assert p + code_reader.tell() == len(code_tag) - - def extract_function(avm_class, func_name): - if func_name in avm_class.method_pyfunctions: - return avm_class.method_pyfunctions[func_name] - if func_name not in avm_class.methods: - raise ExtractorError(u'Cannot find function %r' % func_name) - m = avm_class.methods[func_name] - - def resfunc(args): - registers = ['(this)'] + list(args) + [None] * m.local_count - stack = [] - coder = io.BytesIO(m.code) - while True: - opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 17: # iftrue - offset = s24(coder) - value = stack.pop() - if value: - coder.seek(coder.tell() + offset) - elif opcode == 36: # pushbyte - v = struct.unpack('!B', coder.read(1))[0] - stack.append(v) - elif opcode == 44: # pushstring - idx = u30(coder) - stack.append(constant_strings[idx]) - elif opcode == 48: # pushscope - # We don't implement the scope register, so we'll just - # ignore the popped value - stack.pop() - elif opcode == 70: # callproperty - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, compat_str) - if args[0] == u'': - res = list(obj) - else: - res = obj.split(args[0]) - stack.append(res) - elif mname == u'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - assert isinstance(obj, list) - res = obj[args[0]:] - stack.append(res) - elif mname == u'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, list) - res = args[0].join(obj) - stack.append(res) - elif mname in avm_class.method_pyfunctions: - stack.append(avm_class.method_pyfunctions[mname](args)) - else: - raise NotImplementedError( - u'Unsupported property %r on %r' - % (mname, obj)) - elif opcode == 72: # returnvalue - res = stack.pop() - return res - elif opcode == 79: # callpropvoid - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'reverse': - assert isinstance(obj, list) - obj.reverse() - else: - raise NotImplementedError( - u'Unsupported (void) property %r on %r' - % (mname, obj)) - elif opcode == 86: # newarray - arg_count = u30(coder) - arr = [] - for i in range(arg_count): - arr.append(stack.pop()) - arr = arr[::-1] - stack.append(arr) - elif opcode == 93: # findpropstrict - index = u30(coder) - mname = multinames[index] - res = extract_function(avm_class, mname) - stack.append(res) - elif opcode == 94: # findproperty - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 96: # getlex - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 97: # setproperty - index = u30(coder) - value = stack.pop() - idx = stack.pop() - obj = stack.pop() - assert isinstance(obj, list) - assert isinstance(idx, int) - obj[idx] = value - elif opcode == 98: # getlocal - index = u30(coder) - stack.append(registers[index]) - elif opcode == 99: # setlocal - index = u30(coder) - value = stack.pop() - registers[index] = value - elif opcode == 102: # getproperty - index = u30(coder) - pname = multinames[index] - if pname == u'length': - obj = stack.pop() - assert isinstance(obj, list) - stack.append(len(obj)) - else: # Assume attribute access - idx = stack.pop() - assert isinstance(idx, int) - obj = stack.pop() - assert isinstance(obj, list) - stack.append(obj[idx]) - elif opcode == 128: # coerce - u30(coder) - elif opcode == 133: # coerce_s - assert isinstance(stack[-1], (type(None), compat_str)) - elif opcode == 164: # modulo - value2 = stack.pop() - value1 = stack.pop() - res = value1 % value2 - stack.append(res) - elif opcode == 175: # greaterequals - value2 = stack.pop() - value1 = stack.pop() - result = value1 >= value2 - stack.append(result) - elif opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 211: # getlocal_3 - stack.append(registers[3]) - elif opcode == 214: # setlocal_2 - registers[2] = stack.pop() - elif opcode == 215: # setlocal_3 - registers[3] = stack.pop() - else: - raise NotImplementedError( - u'Unsupported opcode %d' % opcode) - - avm_class.method_pyfunctions[func_name] = resfunc - return resfunc - - initial_function = extract_function(searched_class, u'decipher') + searched_class = swfi.extract_class(TARGET_CLASSNAME) + initial_function = swfi.extract_function(searched_class, u'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py new file mode 100644 index 000000000..1cd292138 --- /dev/null +++ b/youtube_dl/swfinterp.py @@ -0,0 +1,503 @@ +from __future__ import unicode_literals + +import collections +import io +import struct +import zlib + +from .utils import ExtractorError + + +def _extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('= 0x80) else b'\x00' + return struct.unpack('!i', first_byte + bs) + + +def _read_string(reader): + slen = _u30(reader) + resb = reader.read(slen) + assert len(resb) == slen + return resb.decode('utf-8') + + +def _read_bytes(count, reader): + if reader is None: + reader = code_reader + resb = reader.read(count) + assert len(resb) == count + return resb + + +def _read_byte(reader): + resb = _read_bytes(1, reader=reader) + res = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[self.multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = self.multinames[trait_name_idx] + else: + raise ExtractorError('Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + class_count = u30() + classes = [] + for class_id in range(class_count): + name_idx = u30() + classes.append(_AVMClass(name_idx, self.multinames[name_idx])) + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + assert len(classes) == class_count + self._classes_by_name = dict((c.name, c) for c in classes) + + for avm_class in classes: + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + avm_class.method_names.update(trait_methods.items()) + avm_class.method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + for avm_class in classes: + if method_idx in avm_class.method_idxs: + m = Method(code, local_count) + avm_class.methods[avm_class.method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + + def extract_class(self, class_name): + try: + return self._classes_by_name[class_name] + except KeyError: + raise ExtractorError('Class %r not found' % class_name) + + def extract_function(self, avm_class, func_name): + if func_name in avm_class.method_pyfunctions: + return avm_class.method_pyfunctions[func_name] + if func_name in self._classes_by_name: + return self._classes_by_name[func_name].make_object() + if func_name not in avm_class.methods: + raise ExtractorError('Cannot find function %r' % func_name) + m = avm_class.methods[func_name] + + def resfunc(args): + # Helper functions + coder = io.BytesIO(m.code) + s24 = lambda: _s24(coder) + u30 = lambda: _u30(coder) + + print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + while True: + opcode = _read_byte(coder) + print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) + if opcode == 17: # iftrue + offset = s24() + value = stack.pop() + if value: + coder.seek(coder.tell() + offset) + elif opcode == 36: # pushbyte + v = _read_byte(coder) + stack.append(v) + elif opcode == 44: # pushstring + idx = u30() + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + new_scope = stack.pop() + elif opcode == 70: # callproperty + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == '': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == 'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == 'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in avm_class.method_pyfunctions: + stack.append(avm_class.method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + 'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 74: # constructproperty + index = u30() + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + + mname = self.multinames[index] + construct_method = self.extract_function( + obj.avm_class, mname) + # We do not actually call the constructor for now; + # we just pretend it does nothing + stack.append(obj) + elif opcode == 79: # callpropvoid + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + 'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 86: # newarray + arg_count = u30() + arr = [] + for i in range(arg_count): + arr.append(stack.pop()) + arr = arr[::-1] + stack.append(arr) + elif opcode == 93: # findpropstrict + index = u30() + mname = self.multinames[index] + res = self.extract_function(avm_class, mname) + stack.append(res) + elif opcode == 94: # findproperty + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname) + stack.append(res) + elif opcode == 96: # getlex + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname, None) + stack.append(res) + elif opcode == 97: # setproperty + index = u30() + value = stack.pop() + idx = self.multinames[index] + obj = stack.pop() + obj[idx] = value + elif opcode == 98: # getlocal + index = u30() + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30() + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30() + pname = self.multinames[index] + if pname == 'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30() + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 175: # greaterequals + value2 = stack.pop() + value1 = stack.pop() + result = value1 >= value2 + stack.append(result) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + 'Unsupported opcode %d' % opcode) + + avm_class.method_pyfunctions[func_name] = resfunc + return resfunc +