From 6a6774373e1548f30c4c22c03ef8c4c0e13adff7 Mon Sep 17 00:00:00 2001 From: Suzen Fylke Date: Fri, 26 Jun 2020 10:53:28 -0400 Subject: [PATCH 1/2] Update cloudpickle to v1.4.1 --- python/pyspark/cloudpickle.py | 1076 +++++++++++++++++---------------- 1 file changed, 549 insertions(+), 527 deletions(-) diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 09d3a5e7cfb6f..eac09dd99feb1 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -42,27 +42,42 @@ """ from __future__ import print_function +import abc +import builtins import dis -from functools import partial import io import itertools import logging import opcode import operator import pickle +import platform import struct import sys -import traceback import types import weakref import uuid import threading +import typing +from enum import Enum + +from typing import Generic, Union, Tuple, Callable +from pickle import _Pickler as Pickler +from pickle import _getattribute +from io import BytesIO +from importlib._bootstrap import _find_spec + +try: # pragma: no branch + import typing_extensions as _typing_extensions + from typing_extensions import Literal, Final +except ImportError: + _typing_extensions = Literal = Final = None +if sys.version_info >= (3, 5, 3): + from typing import ClassVar +else: # pragma: no cover + ClassVar = None -try: - from enum import Enum -except ImportError: - Enum = None # cloudpickle is meant for inter process communication: we expect all # communicating processes to run the same Python version hence we favor @@ -76,29 +91,17 @@ _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() _DYNAMIC_CLASS_TRACKER_LOCK = threading.Lock() -if sys.version_info[0] < 3: # pragma: no branch - from pickle import Pickler - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - string_types = (basestring,) # noqa - PY3 = False - PY2 = True - PY2_WRAPPER_DESCRIPTOR_TYPE = type(object.__init__) - PY2_METHOD_WRAPPER_TYPE = type(object.__eq__) - PY2_CLASS_DICT_BLACKLIST = (PY2_METHOD_WRAPPER_TYPE, - PY2_WRAPPER_DESCRIPTOR_TYPE) -else: - types.ClassType = type - from pickle import _Pickler as Pickler - from io import BytesIO as StringIO - string_types = (str,) - PY3 = True - PY2 = False - - -def _ensure_tracking(class_def): +PYPY = platform.python_implementation() == "PyPy" + +builtin_code_type = None +if PYPY: + # builtin-code objects only exist in pypy + builtin_code_type = type(float.__new__.__code__) + +_extract_code_globals_cache = weakref.WeakKeyDictionary() + + +def _get_or_create_tracker_id(class_def): with _DYNAMIC_CLASS_TRACKER_LOCK: class_tracker_id = _DYNAMIC_CLASS_TRACKER_BY_CLASS.get(class_def) if class_tracker_id is None: @@ -117,109 +120,262 @@ def _lookup_class_or_track(class_tracker_id, class_def): return class_def -def _make_cell_set_template_code(): - """Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF +def _whichmodule(obj, name): + """Find the module an object belongs to. - Notes - ----- - In Python 3, we could use an easier function: + This function differs from ``pickle.whichmodule`` in two ways: + - it does not mangle the cases where obj's module is __main__ and obj was + not found in any module. + - Errors arising during module introspection are ignored, as those errors + are considered unwanted side effects. 
+ """ + if sys.version_info[:2] < (3, 7) and isinstance(obj, typing.TypeVar): # pragma: no branch # noqa + # Workaround bug in old Python versions: prior to Python 3.7, + # T.__module__ would always be set to "typing" even when the TypeVar T + # would be defined in a different module. + # + # For such older Python versions, we ignore the __module__ attribute of + # TypeVar instances and instead exhaustively lookup those instances in + # all currently imported modules. + module_name = None + else: + module_name = getattr(obj, '__module__', None) + + if module_name is not None: + return module_name + # Protect the iteration by using a copy of sys.modules against dynamic + # modules that trigger imports of other modules upon calls to getattr or + # other threads importing at the same time. + for module_name, module in sys.modules.copy().items(): + # Some modules such as coverage can inject non-module objects inside + # sys.modules + if ( + module_name == '__main__' or + module is None or + not isinstance(module, types.ModuleType) + ): + continue + try: + if _getattribute(module, name)[0] is obj: + return module_name + except Exception: + pass + return None + + +def _is_importable_by_name(obj, name=None): + """Determine if obj can be pickled as attribute of a file-backed module""" + return _lookup_module_and_qualname(obj, name=name) is not None - .. code-block:: python - def f(): - cell = None +def _lookup_module_and_qualname(obj, name=None): + if name is None: + name = getattr(obj, '__qualname__', None) + if name is None: # pragma: no cover + # This used to be needed for Python 2.7 support but is probably not + # needed anymore. However we keep the __name__ introspection in case + # users of cloudpickle rely on this old behavior for unknown reasons. + name = getattr(obj, '__name__', None) - def _stub(value): - nonlocal cell - cell = value + module_name = _whichmodule(obj, name) - return _stub + if module_name is None: + # In this case, obj.__module__ is None AND obj was not found in any + # imported module. obj is thus treated as dynamic. + return None + + if module_name == "__main__": + return None + + module = sys.modules.get(module_name, None) + if module is None: + # The main reason why obj's module would not be imported is that this + # module has been dynamically created, using for example + # types.ModuleType. The other possibility is that module was removed + # from sys.modules after obj was created/imported. But this case is not + # supported, as the standard pickle does not support it either. + return None + + # module has been added to sys.modules, but it can still be dynamic. + if _is_dynamic(module): + return None + + try: + obj2, parent = _getattribute(module, name) + except AttributeError: + # obj was not found inside the module it points to + return None + if obj2 is not obj: + return None + return module, name - _cell_set_template_code = f().__code__ - This function is _only_ a LOAD_FAST(arg); STORE_DEREF, but that is - invalid syntax on Python 2. 
If we use this function we also don't need
-    to do the weird freevars/cellvars swap below
+def _extract_code_globals(co):
     """
-    def inner(value):
-        lambda: cell  # make ``cell`` a closure so that we get a STORE_DEREF
-        cell = value
+    Find all global names read or written to by code block co
+    """
+    out_names = _extract_code_globals_cache.get(co)
+    if out_names is None:
+        names = co.co_names
+        out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}

-    co = inner.__code__
-
-    # NOTE: we are marking the cell variable as a free variable intentionally
-    # so that we simulate an inner function instead of the outer function. This
-    # is what gives us the ``nonlocal`` behavior in a Python 2 compatible way.
-    if PY2:  # pragma: no branch
-        return types.CodeType(
-            co.co_argcount,
-            co.co_nlocals,
-            co.co_stacksize,
-            co.co_flags,
-            co.co_code,
-            co.co_consts,
-            co.co_names,
-            co.co_varnames,
-            co.co_filename,
-            co.co_name,
-            co.co_firstlineno,
-            co.co_lnotab,
-            co.co_cellvars,  # this is the trickery
-            (),
-        )
-    else:
-        if hasattr(types.CodeType, "co_posonlyargcount"):  # pragma: no branch
-            return types.CodeType(
-                co.co_argcount,
-                co.co_posonlyargcount,  # Python3.8 with PEP570
-                co.co_kwonlyargcount,
-                co.co_nlocals,
-                co.co_stacksize,
-                co.co_flags,
-                co.co_code,
-                co.co_consts,
-                co.co_names,
-                co.co_varnames,
-                co.co_filename,
-                co.co_name,
-                co.co_firstlineno,
-                co.co_lnotab,
-                co.co_cellvars,  # this is the trickery
-                (),
-            )
-        else:
-            return types.CodeType(
-                co.co_argcount,
-                co.co_kwonlyargcount,
-                co.co_nlocals,
-                co.co_stacksize,
-                co.co_flags,
-                co.co_code,
-                co.co_consts,
-                co.co_names,
-                co.co_varnames,
-                co.co_filename,
-                co.co_name,
-                co.co_firstlineno,
-                co.co_lnotab,
-                co.co_cellvars,  # this is the trickery
-                (),
-            )
+        # Declaring a function inside another one using the "def ..."
+        # syntax generates a constant code object corresponding to the
+        # nested function's body. As the nested function may itself need
+        # global variables, we need to introspect its code, extract its
+        # globals (by looking for code objects in its co_consts attribute)
+        # and add the result to code_globals
+        if co.co_consts:
+            for const in co.co_consts:
+                if isinstance(const, types.CodeType):
+                    out_names |= _extract_code_globals(const)
+
+        _extract_code_globals_cache[co] = out_names
+
+    return out_names


-_cell_set_template_code = _make_cell_set_template_code()
+
+def _find_imported_submodules(code, top_level_dependencies):
+    """
+    Find currently imported submodules used by a function.
+
+    Submodules used by a function need to be detected and referenced for the
+    function to work correctly at depickling time. Because submodules can be
+    referenced as attributes of their parent package (``package.submodule``),
+    we need a special introspection technique that does not rely on
+    GLOBAL-related opcodes to find references of them in a code object.
+
+    Example:
+    ```
+    import concurrent.futures
+    import cloudpickle
+    def func():
+        x = concurrent.futures.ThreadPoolExecutor
+    if __name__ == '__main__':
+        cloudpickle.dumps(func)
+    ```
+    The globals extracted by cloudpickle in the function's state include the
+    concurrent package, but not its submodule (here, concurrent.futures),
+    which is the module used by func. _find_imported_submodules will detect
+    the usage of concurrent.futures.
+    Saving this module alongside func will ensure
+    that calling func once depickled does not fail due to concurrent.futures
+    not being imported.
+    """
+
+    subimports = []
+    # check if any known dependency is an imported package
+    for x in top_level_dependencies:
+        if (isinstance(x, types.ModuleType) and
+                hasattr(x, '__package__') and x.__package__):
+            # check if the package has any currently loaded sub-imports
+            prefix = x.__name__ + '.'
+            # A concurrent thread could mutate sys.modules,
+            # make sure we iterate over a copy to avoid exceptions
+            for name in list(sys.modules):
+                # Older versions of pytest will add a "None" module to
+                # sys.modules.
+                if name is not None and name.startswith(prefix):
+                    # check whether the function can address the sub-module
+                    tokens = set(name[len(prefix):].split('.'))
+                    if not tokens - set(code.co_names):
+                        subimports.append(sys.modules[name])
+    return subimports


 def cell_set(cell, value):
     """Set the value of a closure cell.
+
+    The point of this function is to set the cell_contents attribute of a cell
+    after its creation. This operation is necessary in case the cell contains
+    a reference to the function the cell belongs to, as when calling the
+    function's constructor
+    ``f = types.FunctionType(code, globals, name, argdefs, closure)``,
+    closure will not be able to contain the yet-to-be-created f.
+
+    In Python 3.7, cell_contents is writable, so setting the contents of a
+    cell can be done simply using
+    >>> cell.cell_contents = value
+
+    In earlier Python 3 versions, the cell_contents attribute of a cell is
+    read only, but this limitation can be worked around by leveraging the
+    Python 3 ``nonlocal`` keyword.
+
+    In Python 2 however, this attribute is read only, and there is no
+    ``nonlocal`` keyword. For this reason, we need to come up with more
+    complicated hacks to set this attribute.
+
+    The chosen approach is to create a function with a STORE_DEREF opcode,
+    which sets the content of a closure variable. Typically:
+
+    >>> def inner(value):
+    ...     lambda: cell  # the lambda makes cell a closure
+    ...     cell = value  # cell is a closure, so this triggers a STORE_DEREF
+
+    (Note that in Python 2, a STORE_DEREF can never be triggered from an inner
+    function. The function g for example here
+    >>> def f(var):
+    ...     def g():
+    ...         var += 1
+    ...     return g
+
+    will not modify the closure variable ``var`` in place, but instead try to
+    load a local variable var and increment it. As g does not assign the local
+    variable ``var`` any initial value, calling f(1)() will fail at runtime.)
+
+    Our objective is to set the value of a given cell ``cell``. So we need to
+    somehow reference our ``cell`` object in the ``inner`` function so that
+    this object (and not the throwaway cell of the lambda function) gets
+    affected by the STORE_DEREF operation.
+
+    In inner, ``cell`` is referenced as a cell variable (an enclosing variable
+    that is referenced by the inner function). If we create a new function
+    cell_set with the exact same code as ``inner``, but with ``cell`` marked
+    as a free variable instead, the STORE_DEREF will be applied to its
+    closure, ``cell``, which we can specify explicitly during construction!
+    The new cell_set variable thus actually sets the contents of a specified
+    cell!
+
+    Note: we do not make use of the ``nonlocal`` keyword to set the contents
+    of a cell in early Python 3 versions to limit possible syntax errors in
+    case test and checker libraries decide to parse the whole file.
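+
+    Example (illustrative sketch, assuming a cell obtained from an ordinary
+    closure):
+
+    >>> def make_cell():
+    ...     x = None
+    ...     return (lambda: x).__closure__[0]
+    >>> cell = make_cell()
+    >>> cell_set(cell, 42)
+    >>> cell.cell_contents
+    42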
""" - return types.FunctionType( - _cell_set_template_code, - {}, - '_cell_set_inner', - (), - (cell,), - )(value) + if sys.version_info[:2] >= (3, 7): # pragma: no branch + cell.cell_contents = value + else: + _cell_set = types.FunctionType( + _cell_set_template_code, {}, '_cell_set', (), (cell,),) + _cell_set(value) + + +def _make_cell_set_template_code(): + def _cell_set_factory(value): + lambda: cell + cell = value + + co = _cell_set_factory.__code__ + + _cell_set_template_code = types.CodeType( + co.co_argcount, + co.co_kwonlyargcount, # Python 3 only argument + co.co_nlocals, + co.co_stacksize, + co.co_flags, + co.co_code, + co.co_consts, + co.co_names, + co.co_varnames, + co.co_filename, + co.co_name, + co.co_firstlineno, + co.co_lnotab, + co.co_cellvars, # co_freevars is initialized with co_cellvars + (), # co_cellvars is made empty + ) + return _cell_set_template_code + + +if sys.version_info[:2] < (3, 7): + _cell_set_template_code = _make_cell_set_template_code() # relevant opcodes STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] @@ -230,10 +386,6 @@ def cell_set(cell, value): EXTENDED_ARG = dis.EXTENDED_ARG -def islambda(func): - return getattr(func, '__name__') == '' - - _BUILTIN_TYPE_NAMES = {} for k, v in types.__dict__.items(): if type(v) is type: @@ -241,70 +393,23 @@ def islambda(func): def _builtin_type(name): + if name == "ClassType": # pragma: no cover + # Backward compat to load pickle files generated with cloudpickle + # < 1.3 even if loading pickle files from older versions is not + # officially supported. + return type return getattr(types, name) -def _make__new__factory(type_): - def _factory(): - return type_.__new__ - return _factory - - -# NOTE: These need to be module globals so that they're pickleable as globals. -_get_dict_new = _make__new__factory(dict) -_get_frozenset_new = _make__new__factory(frozenset) -_get_list_new = _make__new__factory(list) -_get_set_new = _make__new__factory(set) -_get_tuple_new = _make__new__factory(tuple) -_get_object_new = _make__new__factory(object) - -# Pre-defined set of builtin_function_or_method instances that can be -# serialized. -_BUILTIN_TYPE_CONSTRUCTORS = { - dict.__new__: _get_dict_new, - frozenset.__new__: _get_frozenset_new, - set.__new__: _get_set_new, - list.__new__: _get_list_new, - tuple.__new__: _get_tuple_new, - object.__new__: _get_object_new, -} - - -if sys.version_info < (3, 4): # pragma: no branch - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - code = getattr(code, 'co_code', b'') - if PY2: # pragma: no branch - code = map(ord, code) - - n = len(code) - i = 0 - extended_arg = 0 - while i < n: - op = code[i] - i += 1 - if op >= HAVE_ARGUMENT: - oparg = code[i] + code[i + 1] * 256 + extended_arg - extended_arg = 0 - i += 2 - if op == EXTENDED_ARG: - extended_arg = oparg * 65536 - if op in GLOBAL_OPS: - yield op, oparg - -else: - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - for instr in dis.get_instructions(code): - op = instr.opcode - if op in GLOBAL_OPS: - yield op, instr.arg +def _walk_global_ops(code): + """ + Yield (opcode, argument number) tuples for all + global-referencing instructions in *code*. 
+ """ + for instr in dis.get_instructions(code): + op = instr.opcode + if op in GLOBAL_OPS: + yield op, instr.arg def _extract_class_dict(cls): @@ -322,17 +427,6 @@ def _extract_class_dict(cls): base_value = inherited_dict[name] if value is base_value: to_remove.append(name) - elif PY2: - # backward compat for Python 2 - if hasattr(value, "im_func"): - if value.im_func is getattr(base_value, "im_func", None): - to_remove.append(name) - elif isinstance(value, PY2_CLASS_DICT_BLACKLIST): - # On Python 2 we have no way to pickle those specific - # methods types nor to check that they are actually - # inherited. So we assume that they are always inherited - # from builtin types. - to_remove.append(name) except KeyError: pass for name in to_remove: @@ -340,6 +434,32 @@ def _extract_class_dict(cls): return clsdict +if sys.version_info[:2] < (3, 7): # pragma: no branch + def _is_parametrized_type_hint(obj): + # This is very cheap but might generate false positives. + # general typing Constructs + is_typing = getattr(obj, '__origin__', None) is not None + + # typing_extensions.Literal + is_litteral = getattr(obj, '__values__', None) is not None + + # typing_extensions.Final + is_final = getattr(obj, '__type__', None) is not None + + # typing.Union/Tuple for old Python 3.5 + is_union = getattr(obj, '__union_params__', None) is not None + is_tuple = getattr(obj, '__tuple_params__', None) is not None + is_callable = ( + getattr(obj, '__result__', None) is not None and + getattr(obj, '__args__', None) is not None + ) + return any((is_typing, is_litteral, is_final, is_union, is_tuple, + is_callable)) + + def _create_parametrized_type_hint(origin, args): + return origin[args] + + class CloudPickler(Pickler): dispatch = Pickler.dispatch.copy() @@ -362,22 +482,22 @@ def dump(self, obj): else: raise + def save_typevar(self, obj): + self.save_reduce(*_typevar_reduce(obj), obj=obj) + + dispatch[typing.TypeVar] = save_typevar + def save_memoryview(self, obj): self.save(obj.tobytes()) dispatch[memoryview] = save_memoryview - if PY2: # pragma: no branch - def save_buffer(self, obj): - self.save(str(obj)) - - dispatch[buffer] = save_buffer # noqa: F821 'buffer' was removed in Python 3 - def save_module(self, obj): """ Save a module as an import """ if _is_dynamic(obj): + obj.__dict__.pop('__builtins__', None) self.save_reduce(dynamic_subimport, (obj.__name__, vars(obj)), obj=obj) else: @@ -389,29 +509,22 @@ def save_codeobject(self, obj): """ Save a code object """ - if PY3: # pragma: no branch - if hasattr(obj, "co_posonlyargcount"): # pragma: no branch - args = ( - obj.co_argcount, obj.co_posonlyargcount, - obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, - obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, - obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, - obj.co_cellvars - ) - else: - args = ( - obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, - obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, - obj.co_names, obj.co_varnames, obj.co_filename, - obj.co_name, obj.co_firstlineno, obj.co_lnotab, - obj.co_freevars, obj.co_cellvars - ) + if hasattr(obj, "co_posonlyargcount"): # pragma: no branch + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, + obj.co_cellvars + ) else: args = ( - obj.co_argcount, 
obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, - obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars + obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, + obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, + obj.co_names, obj.co_varnames, obj.co_filename, + obj.co_name, obj.co_firstlineno, obj.co_lnotab, + obj.co_freevars, obj.co_cellvars ) self.save_reduce(types.CodeType, args, obj=obj) @@ -423,143 +536,38 @@ def save_function(self, obj, name=None): Determines what kind of function obj is (e.g. lambda, defined at interactive prompt, etc) and handles the pickling appropriately. """ - try: - should_special_case = obj in _BUILTIN_TYPE_CONSTRUCTORS - except TypeError: - # Methods of builtin types aren't hashable in python 2. - should_special_case = False - - if should_special_case: - # We keep a special-cased cache of built-in type constructors at - # global scope, because these functions are structured very - # differently in different python versions and implementations (for - # example, they're instances of types.BuiltinFunctionType in - # CPython, but they're ordinary types.FunctionType instances in - # PyPy). - # - # If the function we've received is in that cache, we just - # serialize it as a lookup into the cache. - return self.save_reduce(_BUILTIN_TYPE_CONSTRUCTORS[obj], (), obj=obj) - - write = self.write - - if name is None: - name = obj.__name__ - try: - # whichmodule() could fail, see - # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling - modname = pickle.whichmodule(obj, name) - except Exception: - modname = None - # print('which gives %s %s %s' % (modname, obj, name)) - try: - themodule = sys.modules[modname] - except KeyError: - # eval'd items such as namedtuple give invalid items for their function __module__ - modname = '__main__' - - if modname == '__main__': - themodule = None - - try: - lookedup_by_name = getattr(themodule, name, None) - except Exception: - lookedup_by_name = None - - if themodule: - if lookedup_by_name is obj: - return self.save_global(obj, name) - - # a builtin_function_or_method which comes in as an attribute of some - # object (e.g., itertools.chain.from_iterable) will end - # up with modname "__main__" and so end up here. But these functions - # have no __code__ attribute in CPython, so the handling for - # user-defined functions below will fail. - # So we pickle them here using save_reduce; have to do it differently - # for different python versions. - if not hasattr(obj, '__code__'): - if PY3: # pragma: no branch - rv = obj.__reduce_ex__(self.proto) - else: - if hasattr(obj, '__self__'): - rv = (getattr, (obj.__self__, name)) - else: - raise pickle.PicklingError("Can't pickle %r" % obj) - return self.save_reduce(obj=obj, *rv) - - # if func is lambda, def'ed at prompt, is in main, or is nested, then - # we'll pickle the actual function object rather than simply saving a - # reference (as is done in default pickler), via save_function_tuple. 
-        if (islambda(obj)
-                or getattr(obj.__code__, 'co_filename', None) == '<stdin>'
-                or themodule is None):
-            self.save_function_tuple(obj)
-            return
-        else:
-            # func is nested
-            if lookedup_by_name is None or lookedup_by_name is not obj:
-                self.save_function_tuple(obj)
-                return
-
-        if obj.__dict__:
-            # essentially save_reduce, but workaround needed to avoid recursion
-            self.save(_restore_attr)
-            write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n')
-            self.memoize(obj)
-            self.save(obj.__dict__)
-            write(pickle.TUPLE + pickle.REDUCE)
+        if _is_importable_by_name(obj, name=name):
+            return Pickler.save_global(self, obj, name=name)
+        elif PYPY and isinstance(obj.__code__, builtin_code_type):
+            return self.save_pypy_builtin_func(obj)
         else:
-            write(pickle.GLOBAL + modname + '\n' + name + '\n')
-            self.memoize(obj)
+            return self.save_function_tuple(obj)

     dispatch[types.FunctionType] = save_function

-    def _save_subimports(self, code, top_level_dependencies):
+    def save_pypy_builtin_func(self, obj):
+        """Save PyPy equivalent of builtin functions.
+
+        PyPy does not have the concept of builtin-functions. Instead,
+        builtin-functions are simple function instances, but with a
+        builtin-code attribute.
+        Most of the time, builtin functions should be pickled by attribute.
+        But PyPy has flaky support for __qualname__, so some builtin functions
+        such as float.__new__ will be classified as dynamic. For this reason
+        only, we created this special routine. Because builtin-functions are
+        not expected to have closure or globals, there is no additional hack
+        (compared to the one already implemented in pickle) to protect
+        ourselves from reference cycles. A simple (reconstructor, newargs,
+        obj.__dict__) tuple is passed to save_reduce.
+
+        Note also that PyPy improved its support for __qualname__ in v3.6, so
+        this routine should be removed when cloudpickle supports only PyPy 3.6
+        and later.
         """
-        Save submodules used by a function but not listed in its globals.
-
-        In the example below:
-
-        ```
-        import concurrent.futures
-        import cloudpickle
-
-
-        def func():
-            x = concurrent.futures.ThreadPoolExecutor
-
-
-        if __name__ == '__main__':
-            cloudpickle.dumps(func)
-        ```
-
-        the globals extracted by cloudpickle in the function's state include
-        the concurrent module, but not its submodule (here,
-        concurrent.futures), which is the module used by func.
-
-        To ensure that calling the depickled function does not raise an
-        AttributeError, this function looks for any currently loaded submodule
-        that the function uses and whose parent is present in the function
-        globals, and saves it before saving the function.
-        """
-
-        # check if any known dependency is an imported package
-        for x in top_level_dependencies:
-            if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__:
-                # check if the package has any currently loaded sub-imports
-                prefix = x.__name__ + '.'
-                # A concurrent thread could mutate sys.modules,
-                # make sure we iterate over a copy to avoid exceptions
-                for name in list(sys.modules):
-                    # Older versions of pytest will add a "None" module to sys.modules.
- if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) - if not tokens - set(code.co_names): - # ensure unpickler executes this import - self.save(sys.modules[name]) - # then discards the reference to it - self.write(pickle.POP) + rv = (types.FunctionType, (obj.__code__, {}, obj.__name__, + obj.__defaults__, obj.__closure__), + obj.__dict__) + self.save_reduce(*rv, obj=obj) def _save_dynamic_enum(self, obj, clsdict): """Special handling for dynamic Enum subclasses @@ -570,13 +578,12 @@ def _save_dynamic_enum(self, obj, clsdict): """ members = dict((e.name, e.value) for e in obj) - # Python 2.7 with enum34 can have no qualname: - qualname = getattr(obj, "__qualname__", None) - - self.save_reduce(_make_skeleton_enum, - (obj.__bases__, obj.__name__, qualname, members, - obj.__module__, _ensure_tracking(obj), None), - obj=obj) + self.save_reduce( + _make_skeleton_enum, + (obj.__bases__, obj.__name__, obj.__qualname__, + members, obj.__module__, _get_or_create_tracker_id(obj), None), + obj=obj + ) # Cleanup the clsdict that will be passed to _rehydrate_skeleton_class: # Those attributes are already handled by the metaclass. @@ -597,26 +604,38 @@ def save_dynamic_class(self, obj): clsdict = _extract_class_dict(obj) clsdict.pop('__weakref__', None) - # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable. - # This is a fix which breaks the cache but this only makes the first - # calls to issubclass slower. - if "_abc_impl" in clsdict: - import abc - (registry, _, _, _) = abc._get_dump(obj) - clsdict["_abc_impl"] = [subclass_weakref() - for subclass_weakref in registry] + if issubclass(type(obj), abc.ABCMeta): + # If obj is an instance of an ABCMeta subclass, dont pickle the + # cache/negative caches populated during isinstance/issubclass + # checks, but pickle the list of registered subclasses of obj. + clsdict.pop('_abc_cache', None) + clsdict.pop('_abc_negative_cache', None) + clsdict.pop('_abc_negative_cache_version', None) + registry = clsdict.pop('_abc_registry', None) + if registry is None: + # in Python3.7+, the abc caches and registered subclasses of a + # class are bundled into the single _abc_impl attribute + clsdict.pop('_abc_impl', None) + (registry, _, _, _) = abc._get_dump(obj) + + clsdict["_abc_impl"] = [subclass_weakref() + for subclass_weakref in registry] + else: + # In the above if clause, registry is a set of weakrefs -- in + # this case, registry is a WeakSet + clsdict["_abc_impl"] = [type_ for type_ in registry] # On PyPy, __doc__ is a readonly attribute, so we need to include it in # the initial skeleton class. This is safe because we know that the # doc can't participate in a cycle with the original class. type_kwargs = {'__doc__': clsdict.pop('__doc__', None)} - if hasattr(obj, "__slots__"): + if "__slots__" in clsdict: type_kwargs['__slots__'] = obj.__slots__ # pickle string length optimization: member descriptors of obj are # created automatically from obj's __slots__ attribute, no need to # save them in obj's state - if isinstance(obj.__slots__, string_types): + if isinstance(obj.__slots__, str): clsdict.pop(obj.__slots__) else: for k in obj.__slots__: @@ -624,6 +643,7 @@ def save_dynamic_class(self, obj): # If type overrides __dict__ as a property, include it in the type # kwargs. In Python 2, we can't set this attribute after construction. + # XXX: can this ever happen in Python 3? If so add a test. 
__dict__ = clsdict.pop('__dict__', None) if isinstance(__dict__, property): type_kwargs['__dict__'] = __dict__ @@ -657,8 +677,8 @@ def save_dynamic_class(self, obj): # "Regular" class definition: tp = type(obj) self.save_reduce(_make_skeleton_class, - (tp, obj.__name__, obj.__bases__, type_kwargs, - _ensure_tracking(obj), None), + (tp, obj.__name__, _get_bases(obj), type_kwargs, + _get_or_create_tracker_id(obj), None), obj=obj) # Now save the rest of obj's __dict__. Any references to obj @@ -696,7 +716,12 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects - self._save_subimports( + # Extract currently-imported submodules used by func. Storing these + # modules in a smoke _cloudpickle_subimports attribute of the object's + # state will trigger the side effect of importing these modules at + # unpickling time (which is necessary for func to work correctly once + # depickled) + submodules = _find_imported_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) @@ -720,8 +745,9 @@ def save_function_tuple(self, func): 'module': func.__module__, 'name': func.__name__, 'doc': func.__doc__, + '_cloudpickle_submodules': submodules } - if hasattr(func, '__annotations__') and sys.version_info >= (3, 7): + if hasattr(func, '__annotations__'): state['annotations'] = func.__annotations__ if hasattr(func, '__qualname__'): state['qualname'] = func.__qualname__ @@ -731,36 +757,6 @@ def save_function_tuple(self, func): write(pickle.TUPLE) write(pickle.REDUCE) # applies _fill_function on the tuple - _extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {}) - - @classmethod - def extract_code_globals(cls, co): - """ - Find all globals names read or written to by codeblock co - """ - out_names = cls._extract_code_globals_cache.get(co) - if out_names is None: - try: - names = co.co_names - except AttributeError: - # PyPy "builtin-code" object - out_names = set() - else: - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} - - # see if nested function have any global refs - if co.co_consts: - for const in co.co_consts: - if type(const) is types.CodeType: - out_names |= cls.extract_code_globals(const) - - cls._extract_code_globals_cache[co] = out_names - - return out_names - def extract_func_data(self, func): """ Turn the function into a tuple of data necessary to recreate it: @@ -769,7 +765,7 @@ def extract_func_data(self, func): code = func.__code__ # extract all global ref's - func_global_refs = self.extract_code_globals(code) + func_global_refs = _extract_code_globals(code) # process all variables referenced by global environment f_globals = {} @@ -813,12 +809,10 @@ def extract_func_data(self, func): return (code, f_globals, defaults, closure, dct, base_globals) - def save_builtin_function(self, obj): - if obj.__module__ == "__builtin__": - return self.save_global(obj) - return self.save_function(obj) + def save_getset_descriptor(self, obj): + return self.save_reduce(getattr, (obj.__objclass__, obj.__name__)) - dispatch[types.BuiltinFunctionType] = save_builtin_function + dispatch[types.GetSetDescriptorType] = save_getset_descriptor def save_global(self, obj, name=None, pack=struct.pack): """ @@ -833,91 +827,39 @@ def save_global(self, obj, name=None, pack=struct.pack): return self.save_reduce(type, (Ellipsis,), obj=obj) elif obj is type(NotImplemented): return self.save_reduce(type, (NotImplemented,), obj=obj) - 
- if obj.__module__ == "__main__": - return self.save_dynamic_class(obj) - - try: - return Pickler.save_global(self, obj, name=name) - except Exception: - if obj.__module__ == "__builtin__" or obj.__module__ == "builtins": - if obj in _BUILTIN_TYPE_NAMES: - return self.save_reduce( - _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) - - typ = type(obj) - if typ is not obj and isinstance(obj, (type, types.ClassType)): - return self.save_dynamic_class(obj) - - raise + elif obj in _BUILTIN_TYPE_NAMES: + return self.save_reduce( + _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) + + if sys.version_info[:2] < (3, 7) and _is_parametrized_type_hint(obj): # noqa # pragma: no branch + # Parametrized typing constructs in Python < 3.7 are not compatible + # with type checks and ``isinstance`` semantics. For this reason, + # it is easier to detect them using a duck-typing-based check + # (``_is_parametrized_type_hint``) than to populate the Pickler's + # dispatch with type-specific savers. + self._save_parametrized_type_hint(obj) + elif name is not None: + Pickler.save_global(self, obj, name=name) + elif not _is_importable_by_name(obj, name=name): + self.save_dynamic_class(obj) + else: + Pickler.save_global(self, obj, name=name) dispatch[type] = save_global - dispatch[types.ClassType] = save_global def save_instancemethod(self, obj): # Memoization rarely is ever useful due to python bounding if obj.__self__ is None: self.save_reduce(getattr, (obj.im_class, obj.__name__)) else: - if PY3: # pragma: no branch - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) - else: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) + self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) dispatch[types.MethodType] = save_instancemethod - def save_inst(self, obj): - """Inner logic to save instance. 
Based off pickle.save_inst""" - cls = obj.__class__ - - # Try the dispatch table (pickle module doesn't do it) - f = self.dispatch.get(cls) - if f: - f(self, obj) # Call unbound method with explicit self - return - - memo = self.memo - write = self.write - save = self.save - - if hasattr(obj, '__getinitargs__'): - args = obj.__getinitargs__() - len(args) # XXX Assert it's a sequence - pickle._keep_alive(args, memo) - else: - args = () - - write(pickle.MARK) - - if self.bin: - save(cls) - for arg in args: - save(arg) - write(pickle.OBJ) - else: - for arg in args: - save(arg) - write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n') - - self.memoize(obj) - - try: - getstate = obj.__getstate__ - except AttributeError: - stuff = obj.__dict__ - else: - stuff = getstate() - pickle._keep_alive(stuff, memo) - save(stuff) - write(pickle.BUILD) - - if PY2: # pragma: no branch - dispatch[types.InstanceType] = save_inst - def save_property(self, obj): # properties not correctly saved in python - self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj) + self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), + obj=obj) dispatch[property] = save_property @@ -965,10 +907,6 @@ def __getattribute__(self, item): def save_file(self, obj): """Save a file""" - try: - import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute - except ImportError: - import io as pystringIO if not hasattr(obj, 'name') or not hasattr(obj, 'mode'): raise pickle.PicklingError("Cannot pickle files that do not map to an actual file") @@ -987,7 +925,8 @@ def save_file(self, obj): name = obj.name - retval = pystringIO.StringIO() + # TODO: also support binary mode files with io.BytesIO + retval = io.StringIO() try: # Read the whole file @@ -1010,11 +949,7 @@ def save_ellipsis(self, obj): def save_not_implemented(self, obj): self.save_reduce(_gen_not_implemented, ()) - try: # Python 2 - dispatch[file] = save_file - except NameError: # Python 3 # pragma: no branch - dispatch[io.TextIOWrapper] = save_file - + dispatch[io.TextIOWrapper] = save_file dispatch[type(Ellipsis)] = save_ellipsis dispatch[type(NotImplemented)] = save_not_implemented @@ -1044,6 +979,57 @@ def inject_addons(self): """Plug in system. 
Register additional pickling functions if modules already loaded"""
         pass

+    if sys.version_info < (3, 7):  # pragma: no branch
+        def _save_parametrized_type_hint(self, obj):
+            # The distorted type check semantic for typing constructs becomes:
+            # ``type(obj) is type(TypeHint)``, which means "obj is a
+            # parametrized TypeHint"
+            if type(obj) is type(Literal):  # pragma: no branch
+                initargs = (Literal, obj.__values__)
+            elif type(obj) is type(Final):  # pragma: no branch
+                initargs = (Final, obj.__type__)
+            elif type(obj) is type(ClassVar):
+                initargs = (ClassVar, obj.__type__)
+            elif type(obj) is type(Generic):
+                parameters = obj.__parameters__
+                if len(obj.__parameters__) > 0:
+                    # in early Python 3.5, __parameters__ was sometimes
+                    # preferred to __args__
+                    initargs = (obj.__origin__, parameters)
+                else:
+                    initargs = (obj.__origin__, obj.__args__)
+            elif type(obj) is type(Union):
+                if sys.version_info < (3, 5, 3):  # pragma: no cover
+                    initargs = (Union, obj.__union_params__)
+                else:
+                    initargs = (Union, obj.__args__)
+            elif type(obj) is type(Tuple):
+                if sys.version_info < (3, 5, 3):  # pragma: no cover
+                    initargs = (Tuple, obj.__tuple_params__)
+                else:
+                    initargs = (Tuple, obj.__args__)
+            elif type(obj) is type(Callable):
+                if sys.version_info < (3, 5, 3):  # pragma: no cover
+                    args = obj.__args__
+                    result = obj.__result__
+                    if args != Ellipsis:
+                        if isinstance(args, tuple):
+                            args = list(args)
+                        else:
+                            args = [args]
+                else:
+                    (*args, result) = obj.__args__
+                    if len(args) == 1 and args[0] is Ellipsis:
+                        args = Ellipsis
+                    else:
+                        args = list(args)
+                initargs = (Callable, (args, result))
+            else:  # pragma: no cover
+                raise pickle.PicklingError(
+                    "Cloudpickle Error: Unknown type {}".format(type(obj))
+                )
+            self.save_reduce(_create_parametrized_type_hint, initargs, obj=obj)


 # Tornado support

@@ -1091,7 +1077,7 @@ def dumps(obj, protocol=None):
     Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure
     compatibility with older versions of Python.
     """
-    file = StringIO()
+    file = BytesIO()
     try:
         cp = CloudPickler(file, protocol=protocol)
         cp.dump(obj)

@@ -1114,16 +1100,10 @@ def subimport(name):

 def dynamic_subimport(name, vars):
     mod = types.ModuleType(name)
     mod.__dict__.update(vars)
+    mod.__dict__['__builtins__'] = builtins.__dict__
     return mod


-# restores function attributes
-def _restore_attr(obj, attr):
-    for key, val in attr.items():
-        setattr(obj, key, val)
-    return obj
-
-
 def _gen_ellipsis():
     return Ellipsis

@@ -1213,6 +1193,13 @@ def _fill_function(*args):
         func.__qualname__ = state['qualname']
     if 'kwdefaults' in state:
         func.__kwdefaults__ = state['kwdefaults']
+    # _cloudpickle_submodules is a set of submodules that must be loaded for
+    # the pickled function to work correctly at unpickling time. Now that
+    # these submodules are depickled (hence imported), they can be removed
+    # from the object's state (the object state only served as a reference
+    # holder to these submodules)
+    if '_cloudpickle_submodules' in state:
+        state.pop('_cloudpickle_submodules')

     cells = func.__closure__
     if cells is not None:

@@ -1265,7 +1252,10 @@ class id will also reuse this class definition.

     The "extra" variable is meant to be a dict (or None) that can be used for
     forward compatibility should the need arise.
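+
+    Example (illustrative sketch; a minimal call with class tracking and
+    "extra" both disabled):
+
+    >>> Skel = _make_skeleton_class(type, 'Skel', (object,), {}, None, None)
+    >>> Skel.__name__
+    'Skel'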
""" - skeleton_class = type_constructor(name, bases, type_kwargs) + skeleton_class = types.new_class( + name, bases, {'metaclass': type_constructor}, + lambda ns: ns.update(type_kwargs) + ) return _lookup_class_or_track(class_tracker_id, skeleton_class) @@ -1312,10 +1302,7 @@ class id will also reuse this enum definition. classdict[member_name] = member_value enum_class = metacls.__new__(metacls, name, bases, classdict) enum_class.__module__ = module - - # Python 2.7 compat - if qualname is not None: - enum_class.__qualname__ = qualname + enum_class.__qualname__ = qualname return _lookup_class_or_track(class_tracker_id, enum_class) @@ -1329,34 +1316,69 @@ def _is_dynamic(module): if hasattr(module, '__file__'): return False - if hasattr(module, '__spec__'): - return module.__spec__ is None - else: - # Backward compat for Python 2 - import imp - try: - path = None - for part in module.__name__.split('.'): - if path is not None: - path = [path] - f, path, description = imp.find_module(part, path) - if f is not None: - f.close() - except ImportError: - return True + if module.__spec__ is not None: return False + # In PyPy, Some built-in modules such as _codecs can have their + # __spec__ attribute set to None despite being imported. For such + # modules, the ``_find_spec`` utility of the standard library is used. + parent_name = module.__name__.rpartition('.')[0] + if parent_name: # pragma: no cover + # This code handles the case where an imported package (and not + # module) remains with __spec__ set to None. It is however untested + # as no package in the PyPy stdlib has __spec__ set to None after + # it is imported. + try: + parent = sys.modules[parent_name] + except KeyError: + msg = "parent {!r} not in sys.modules" + raise ImportError(msg.format(parent_name)) + else: + pkgpath = parent.__path__ + else: + pkgpath = None + return _find_spec(module.__name__, pkgpath, module) is None -""" Use copy_reg to extend global pickle definitions """ -if sys.version_info < (3, 4): # pragma: no branch - method_descriptor = type(str.upper) +def _make_typevar(name, bound, constraints, covariant, contravariant, + class_tracker_id): + tv = typing.TypeVar( + name, *constraints, bound=bound, + covariant=covariant, contravariant=contravariant + ) + if class_tracker_id is not None: + return _lookup_class_or_track(class_tracker_id, tv) + else: # pragma: nocover + # Only for Python 3.5.3 compat. + return tv - def _reduce_method_descriptor(obj): - return (getattr, (obj.__objclass__, obj.__name__)) +def _decompose_typevar(obj): try: - import copy_reg as copyreg - except ImportError: - import copyreg - copyreg.pickle(method_descriptor, _reduce_method_descriptor) \ No newline at end of file + class_tracker_id = _get_or_create_tracker_id(obj) + except TypeError: # pragma: nocover + # TypeVar instances are not weakref-able in Python 3.5.3 + class_tracker_id = None + return ( + obj.__name__, obj.__bound__, obj.__constraints__, + obj.__covariant__, obj.__contravariant__, + class_tracker_id, + ) + + +def _typevar_reduce(obj): + # TypeVar instances have no __qualname__ hence we pass the name explicitly. 
+    module_and_name = _lookup_module_and_qualname(obj, name=obj.__name__)
+    if module_and_name is None:
+        return (_make_typevar, _decompose_typevar(obj))
+    return (getattr, module_and_name)
+
+
+def _get_bases(typ):
+    if hasattr(typ, '__orig_bases__'):
+        # For generic types (see PEP 560)
+        bases_attr = '__orig_bases__'
+    else:
+        # For regular class objects
+        bases_attr = '__bases__'
+    return getattr(typ, bases_attr)
\ No newline at end of file

From 192fa93d018dd25f15da4643a7c2a906276fd677 Mon Sep 17 00:00:00 2001
From: Suzen Fylke
Date: Fri, 26 Jun 2020 10:53:28 -0400
Subject: [PATCH 2/2] Update cloudpickle to v1.4.1

---
 python/pyspark/cloudpickle.py      | 1076 ++++++++++++++--------------
 python/pyspark/cloudpickle_fast.py |  557 ++++++++++++++
 2 files changed, 1106 insertions(+), 527 deletions(-)
 create mode 100644 python/pyspark/cloudpickle_fast.py
obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, - obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars + obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, + obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts, + obj.co_names, obj.co_varnames, obj.co_filename, + obj.co_name, obj.co_firstlineno, obj.co_lnotab, + obj.co_freevars, obj.co_cellvars ) self.save_reduce(types.CodeType, args, obj=obj) @@ -423,143 +536,38 @@ def save_function(self, obj, name=None): Determines what kind of function obj is (e.g. lambda, defined at interactive prompt, etc) and handles the pickling appropriately. """ - try: - should_special_case = obj in _BUILTIN_TYPE_CONSTRUCTORS - except TypeError: - # Methods of builtin types aren't hashable in python 2. - should_special_case = False - - if should_special_case: - # We keep a special-cased cache of built-in type constructors at - # global scope, because these functions are structured very - # differently in different python versions and implementations (for - # example, they're instances of types.BuiltinFunctionType in - # CPython, but they're ordinary types.FunctionType instances in - # PyPy). - # - # If the function we've received is in that cache, we just - # serialize it as a lookup into the cache. - return self.save_reduce(_BUILTIN_TYPE_CONSTRUCTORS[obj], (), obj=obj) - - write = self.write - - if name is None: - name = obj.__name__ - try: - # whichmodule() could fail, see - # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling - modname = pickle.whichmodule(obj, name) - except Exception: - modname = None - # print('which gives %s %s %s' % (modname, obj, name)) - try: - themodule = sys.modules[modname] - except KeyError: - # eval'd items such as namedtuple give invalid items for their function __module__ - modname = '__main__' - - if modname == '__main__': - themodule = None - - try: - lookedup_by_name = getattr(themodule, name, None) - except Exception: - lookedup_by_name = None - - if themodule: - if lookedup_by_name is obj: - return self.save_global(obj, name) - - # a builtin_function_or_method which comes in as an attribute of some - # object (e.g., itertools.chain.from_iterable) will end - # up with modname "__main__" and so end up here. But these functions - # have no __code__ attribute in CPython, so the handling for - # user-defined functions below will fail. - # So we pickle them here using save_reduce; have to do it differently - # for different python versions. - if not hasattr(obj, '__code__'): - if PY3: # pragma: no branch - rv = obj.__reduce_ex__(self.proto) - else: - if hasattr(obj, '__self__'): - rv = (getattr, (obj.__self__, name)) - else: - raise pickle.PicklingError("Can't pickle %r" % obj) - return self.save_reduce(obj=obj, *rv) - - # if func is lambda, def'ed at prompt, is in main, or is nested, then - # we'll pickle the actual function object rather than simply saving a - # reference (as is done in default pickler), via save_function_tuple. 
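The argument tuple built by save_codeobject above mirrors the ``types.CodeType``
constructor, so a code object can be rebuilt attribute by attribute. A minimal
sketch of the same round trip, assuming CPython earlier than 3.8 (where
``co_posonlyargcount`` does not yet exist):

```
import types

def add(a, b):
    return a + b

co = add.__code__
# Rebuild the code object from the same attributes save_codeobject captures.
clone = types.CodeType(
    co.co_argcount, co.co_kwonlyargcount, co.co_nlocals, co.co_stacksize,
    co.co_flags, co.co_code, co.co_consts, co.co_names, co.co_varnames,
    co.co_filename, co.co_name, co.co_firstlineno, co.co_lnotab,
    co.co_freevars, co.co_cellvars,
)
# A function built around the clone behaves like the original.
assert types.FunctionType(clone, globals())(1, 2) == 3
```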
-        if (islambda(obj)
-                or getattr(obj.__code__, 'co_filename', None) == '<stdin>'
-                or themodule is None):
-            self.save_function_tuple(obj)
-            return
-        else:
-            # func is nested
-            if lookedup_by_name is None or lookedup_by_name is not obj:
-                self.save_function_tuple(obj)
-                return
-
-        if obj.__dict__:
-            # essentially save_reduce, but workaround needed to avoid recursion
-            self.save(_restore_attr)
-            write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n')
-            self.memoize(obj)
-            self.save(obj.__dict__)
-            write(pickle.TUPLE + pickle.REDUCE)
+        if _is_importable_by_name(obj, name=name):
+            return Pickler.save_global(self, obj, name=name)
+        elif PYPY and isinstance(obj.__code__, builtin_code_type):
+            return self.save_pypy_builtin_func(obj)
         else:
-            write(pickle.GLOBAL + modname + '\n' + name + '\n')
-            self.memoize(obj)
+            return self.save_function_tuple(obj)
 
     dispatch[types.FunctionType] = save_function
 
-    def _save_subimports(self, code, top_level_dependencies):
+    def save_pypy_builtin_func(self, obj):
+        """Save PyPy equivalent of builtin functions.
+
+        PyPy does not have the concept of builtin-functions. Instead,
+        builtin-functions are simple function instances, but with a
+        builtin-code attribute.
+        Most of the time, builtin functions should be pickled by attribute. But
+        PyPy has flaky support for __qualname__, so some builtin functions such
+        as float.__new__ will be classified as dynamic. For this reason only,
+        we created this special routine. Because builtin-functions are not
+        expected to have closure or globals, there is no additional hack
+        (compared to the one already implemented in pickle) to protect
+        ourselves from reference cycles. A simple (reconstructor, newargs,
+        obj.__dict__) tuple is passed to save_reduce.
+
+        Note also that PyPy improved their support for __qualname__ in v3.6, so
+        this routine should be removed when cloudpickle supports only PyPy 3.6
+        and later.
         """
-        Save submodules used by a function but not listed in its globals.
-
-        In the example below:
-
-        ```
-        import concurrent.futures
-        import cloudpickle
-
-
-        def func():
-            x = concurrent.futures.ThreadPoolExecutor
-
-
-        if __name__ == '__main__':
-            cloudpickle.dumps(func)
-        ```
-
-        the globals extracted by cloudpickle in the function's state include
-        the concurrent module, but not its submodule (here,
-        concurrent.futures), which is the module used by func.
-
-        To ensure that calling the depickled function does not raise an
-        AttributeError, this function looks for any currently loaded submodule
-        that the function uses and whose parent is present in the function
-        globals, and saves it before saving the function.
-        """
-
-        # check if any known dependency is an imported package
-        for x in top_level_dependencies:
-            if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__:
-                # check if the package has any currently loaded sub-imports
-                prefix = x.__name__ + '.'
-                # A concurrent thread could mutate sys.modules,
-                # make sure we iterate over a copy to avoid exceptions
-                for name in list(sys.modules):
-                    # Older versions of pytest will add a "None" module to sys.modules.
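The rewritten save_function above reduces to a three-way dispatch: functions
importable by name are delegated to the stock pickler, PyPy builtin-code
functions get the dedicated routine, and everything else is serialized by
value. A minimal sketch of the first and last cases, assuming the
``pyspark.cloudpickle`` module path used by this patch:

```
import os.path
import pickle

from pyspark import cloudpickle  # module path assumed from this patch

# Importable by name: delegated to the stock Pickler.save_global, so the
# pickle only stores a module/qualname reference.
assert pickle.loads(cloudpickle.dumps(os.path.join)) is os.path.join

# A lambda has no importable name, so save_function_tuple serializes its
# code object, globals and closure by value.
double = lambda x: 2 * x
assert pickle.loads(cloudpickle.dumps(double))(21) == 42
```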
- if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) - if not tokens - set(code.co_names): - # ensure unpickler executes this import - self.save(sys.modules[name]) - # then discards the reference to it - self.write(pickle.POP) + rv = (types.FunctionType, (obj.__code__, {}, obj.__name__, + obj.__defaults__, obj.__closure__), + obj.__dict__) + self.save_reduce(*rv, obj=obj) def _save_dynamic_enum(self, obj, clsdict): """Special handling for dynamic Enum subclasses @@ -570,13 +578,12 @@ def _save_dynamic_enum(self, obj, clsdict): """ members = dict((e.name, e.value) for e in obj) - # Python 2.7 with enum34 can have no qualname: - qualname = getattr(obj, "__qualname__", None) - - self.save_reduce(_make_skeleton_enum, - (obj.__bases__, obj.__name__, qualname, members, - obj.__module__, _ensure_tracking(obj), None), - obj=obj) + self.save_reduce( + _make_skeleton_enum, + (obj.__bases__, obj.__name__, obj.__qualname__, + members, obj.__module__, _get_or_create_tracker_id(obj), None), + obj=obj + ) # Cleanup the clsdict that will be passed to _rehydrate_skeleton_class: # Those attributes are already handled by the metaclass. @@ -597,26 +604,38 @@ def save_dynamic_class(self, obj): clsdict = _extract_class_dict(obj) clsdict.pop('__weakref__', None) - # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable. - # This is a fix which breaks the cache but this only makes the first - # calls to issubclass slower. - if "_abc_impl" in clsdict: - import abc - (registry, _, _, _) = abc._get_dump(obj) - clsdict["_abc_impl"] = [subclass_weakref() - for subclass_weakref in registry] + if issubclass(type(obj), abc.ABCMeta): + # If obj is an instance of an ABCMeta subclass, dont pickle the + # cache/negative caches populated during isinstance/issubclass + # checks, but pickle the list of registered subclasses of obj. + clsdict.pop('_abc_cache', None) + clsdict.pop('_abc_negative_cache', None) + clsdict.pop('_abc_negative_cache_version', None) + registry = clsdict.pop('_abc_registry', None) + if registry is None: + # in Python3.7+, the abc caches and registered subclasses of a + # class are bundled into the single _abc_impl attribute + clsdict.pop('_abc_impl', None) + (registry, _, _, _) = abc._get_dump(obj) + + clsdict["_abc_impl"] = [subclass_weakref() + for subclass_weakref in registry] + else: + # In the above if clause, registry is a set of weakrefs -- in + # this case, registry is a WeakSet + clsdict["_abc_impl"] = [type_ for type_ in registry] # On PyPy, __doc__ is a readonly attribute, so we need to include it in # the initial skeleton class. This is safe because we know that the # doc can't participate in a cycle with the original class. type_kwargs = {'__doc__': clsdict.pop('__doc__', None)} - if hasattr(obj, "__slots__"): + if "__slots__" in clsdict: type_kwargs['__slots__'] = obj.__slots__ # pickle string length optimization: member descriptors of obj are # created automatically from obj's __slots__ attribute, no need to # save them in obj's state - if isinstance(obj.__slots__, string_types): + if isinstance(obj.__slots__, str): clsdict.pop(obj.__slots__) else: for k in obj.__slots__: @@ -624,6 +643,7 @@ def save_dynamic_class(self, obj): # If type overrides __dict__ as a property, include it in the type # kwargs. In Python 2, we can't set this attribute after construction. + # XXX: can this ever happen in Python 3? If so add a test. 
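Classes and Enum subclasses that cannot be found by attribute lookup are
rebuilt from a skeleton plus their members, and the tracker id keeps class
identity stable within a process. A minimal sketch of ``_save_dynamic_enum``
in action, assuming the ``pyspark.cloudpickle`` module path used by this
patch:

```
import pickle
from enum import Enum

from pyspark import cloudpickle  # module path assumed from this patch

def make_color():
    # Defined inside a function: the class cannot be found by module
    # attribute lookup, so it is pickled through _save_dynamic_enum.
    class Color(Enum):
        RED = 1
        GREEN = 2
    return Color

Color = make_color()
clone = pickle.loads(cloudpickle.dumps(Color))
assert clone.GREEN.value == 2
# Within one process, the tracker id resolves the skeleton back to the
# original class object itself.
assert clone is Color
```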
__dict__ = clsdict.pop('__dict__', None) if isinstance(__dict__, property): type_kwargs['__dict__'] = __dict__ @@ -657,8 +677,8 @@ def save_dynamic_class(self, obj): # "Regular" class definition: tp = type(obj) self.save_reduce(_make_skeleton_class, - (tp, obj.__name__, obj.__bases__, type_kwargs, - _ensure_tracking(obj), None), + (tp, obj.__name__, _get_bases(obj), type_kwargs, + _get_or_create_tracker_id(obj), None), obj=obj) # Now save the rest of obj's __dict__. Any references to obj @@ -696,7 +716,12 @@ def save_function_tuple(self, func): save(_fill_function) # skeleton function updater write(pickle.MARK) # beginning of tuple that _fill_function expects - self._save_subimports( + # Extract currently-imported submodules used by func. Storing these + # modules in a smoke _cloudpickle_subimports attribute of the object's + # state will trigger the side effect of importing these modules at + # unpickling time (which is necessary for func to work correctly once + # depickled) + submodules = _find_imported_submodules( code, itertools.chain(f_globals.values(), closure_values or ()), ) @@ -720,8 +745,9 @@ def save_function_tuple(self, func): 'module': func.__module__, 'name': func.__name__, 'doc': func.__doc__, + '_cloudpickle_submodules': submodules } - if hasattr(func, '__annotations__') and sys.version_info >= (3, 7): + if hasattr(func, '__annotations__'): state['annotations'] = func.__annotations__ if hasattr(func, '__qualname__'): state['qualname'] = func.__qualname__ @@ -731,36 +757,6 @@ def save_function_tuple(self, func): write(pickle.TUPLE) write(pickle.REDUCE) # applies _fill_function on the tuple - _extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {}) - - @classmethod - def extract_code_globals(cls, co): - """ - Find all globals names read or written to by codeblock co - """ - out_names = cls._extract_code_globals_cache.get(co) - if out_names is None: - try: - names = co.co_names - except AttributeError: - # PyPy "builtin-code" object - out_names = set() - else: - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} - - # see if nested function have any global refs - if co.co_consts: - for const in co.co_consts: - if type(const) is types.CodeType: - out_names |= cls.extract_code_globals(const) - - cls._extract_code_globals_cache[co] = out_names - - return out_names - def extract_func_data(self, func): """ Turn the function into a tuple of data necessary to recreate it: @@ -769,7 +765,7 @@ def extract_func_data(self, func): code = func.__code__ # extract all global ref's - func_global_refs = self.extract_code_globals(code) + func_global_refs = _extract_code_globals(code) # process all variables referenced by global environment f_globals = {} @@ -813,12 +809,10 @@ def extract_func_data(self, func): return (code, f_globals, defaults, closure, dct, base_globals) - def save_builtin_function(self, obj): - if obj.__module__ == "__builtin__": - return self.save_global(obj) - return self.save_function(obj) + def save_getset_descriptor(self, obj): + return self.save_reduce(getattr, (obj.__objclass__, obj.__name__)) - dispatch[types.BuiltinFunctionType] = save_builtin_function + dispatch[types.GetSetDescriptorType] = save_getset_descriptor def save_global(self, obj, name=None, pack=struct.pack): """ @@ -833,91 +827,39 @@ def save_global(self, obj, name=None, pack=struct.pack): return self.save_reduce(type, (Ellipsis,), obj=obj) elif obj is type(NotImplemented): return self.save_reduce(type, (NotImplemented,), obj=obj) - 
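save_function_tuple performs this reconstruction in two phases: a skeleton
function is created first, and its closure cells are only filled by
``_fill_function`` afterwards, which is what makes self-referencing and
mutable closures safe. A minimal sketch, assuming the ``pyspark.cloudpickle``
module path used by this patch:

```
import pickle

from pyspark import cloudpickle  # module path assumed from this patch

def make_counter():
    count = 0
    def counter():
        nonlocal count
        count += 1
        return count
    return counter

# The closure cell travels in the function state and is re-bound after the
# skeleton function exists, so the nonlocal cell survives the round trip.
c = pickle.loads(cloudpickle.dumps(make_counter()))
assert (c(), c()) == (1, 2)
```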
- if obj.__module__ == "__main__": - return self.save_dynamic_class(obj) - - try: - return Pickler.save_global(self, obj, name=name) - except Exception: - if obj.__module__ == "__builtin__" or obj.__module__ == "builtins": - if obj in _BUILTIN_TYPE_NAMES: - return self.save_reduce( - _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) - - typ = type(obj) - if typ is not obj and isinstance(obj, (type, types.ClassType)): - return self.save_dynamic_class(obj) - - raise + elif obj in _BUILTIN_TYPE_NAMES: + return self.save_reduce( + _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) + + if sys.version_info[:2] < (3, 7) and _is_parametrized_type_hint(obj): # noqa # pragma: no branch + # Parametrized typing constructs in Python < 3.7 are not compatible + # with type checks and ``isinstance`` semantics. For this reason, + # it is easier to detect them using a duck-typing-based check + # (``_is_parametrized_type_hint``) than to populate the Pickler's + # dispatch with type-specific savers. + self._save_parametrized_type_hint(obj) + elif name is not None: + Pickler.save_global(self, obj, name=name) + elif not _is_importable_by_name(obj, name=name): + self.save_dynamic_class(obj) + else: + Pickler.save_global(self, obj, name=name) dispatch[type] = save_global - dispatch[types.ClassType] = save_global def save_instancemethod(self, obj): # Memoization rarely is ever useful due to python bounding if obj.__self__ is None: self.save_reduce(getattr, (obj.im_class, obj.__name__)) else: - if PY3: # pragma: no branch - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) - else: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) + self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) dispatch[types.MethodType] = save_instancemethod - def save_inst(self, obj): - """Inner logic to save instance. 
Based off pickle.save_inst""" - cls = obj.__class__ - - # Try the dispatch table (pickle module doesn't do it) - f = self.dispatch.get(cls) - if f: - f(self, obj) # Call unbound method with explicit self - return - - memo = self.memo - write = self.write - save = self.save - - if hasattr(obj, '__getinitargs__'): - args = obj.__getinitargs__() - len(args) # XXX Assert it's a sequence - pickle._keep_alive(args, memo) - else: - args = () - - write(pickle.MARK) - - if self.bin: - save(cls) - for arg in args: - save(arg) - write(pickle.OBJ) - else: - for arg in args: - save(arg) - write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n') - - self.memoize(obj) - - try: - getstate = obj.__getstate__ - except AttributeError: - stuff = obj.__dict__ - else: - stuff = getstate() - pickle._keep_alive(stuff, memo) - save(stuff) - write(pickle.BUILD) - - if PY2: # pragma: no branch - dispatch[types.InstanceType] = save_inst - def save_property(self, obj): # properties not correctly saved in python - self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj) + self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), + obj=obj) dispatch[property] = save_property @@ -965,10 +907,6 @@ def __getattribute__(self, item): def save_file(self, obj): """Save a file""" - try: - import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute - except ImportError: - import io as pystringIO if not hasattr(obj, 'name') or not hasattr(obj, 'mode'): raise pickle.PicklingError("Cannot pickle files that do not map to an actual file") @@ -987,7 +925,8 @@ def save_file(self, obj): name = obj.name - retval = pystringIO.StringIO() + # TODO: also support binary mode files with io.BytesIO + retval = io.StringIO() try: # Read the whole file @@ -1010,11 +949,7 @@ def save_ellipsis(self, obj): def save_not_implemented(self, obj): self.save_reduce(_gen_not_implemented, ()) - try: # Python 2 - dispatch[file] = save_file - except NameError: # Python 3 # pragma: no branch - dispatch[io.TextIOWrapper] = save_file - + dispatch[io.TextIOWrapper] = save_file dispatch[type(Ellipsis)] = save_ellipsis dispatch[type(NotImplemented)] = save_not_implemented @@ -1044,6 +979,57 @@ def inject_addons(self): """Plug in system. 
Register additional pickling functions if modules already loaded""" pass + if sys.version_info < (3, 7): # pragma: no branch + def _save_parametrized_type_hint(self, obj): + # The distorted type check sematic for typing construct becomes: + # ``type(obj) is type(TypeHint)``, which means "obj is a + # parametrized TypeHint" + if type(obj) is type(Literal): # pragma: no branch + initargs = (Literal, obj.__values__) + elif type(obj) is type(Final): # pragma: no branch + initargs = (Final, obj.__type__) + elif type(obj) is type(ClassVar): + initargs = (ClassVar, obj.__type__) + elif type(obj) is type(Generic): + parameters = obj.__parameters__ + if len(obj.__parameters__) > 0: + # in early Python 3.5, __parameters__ was sometimes + # preferred to __args__ + initargs = (obj.__origin__, parameters) + else: + initargs = (obj.__origin__, obj.__args__) + elif type(obj) is type(Union): + if sys.version_info < (3, 5, 3): # pragma: no cover + initargs = (Union, obj.__union_params__) + else: + initargs = (Union, obj.__args__) + elif type(obj) is type(Tuple): + if sys.version_info < (3, 5, 3): # pragma: no cover + initargs = (Tuple, obj.__tuple_params__) + else: + initargs = (Tuple, obj.__args__) + elif type(obj) is type(Callable): + if sys.version_info < (3, 5, 3): # pragma: no cover + args = obj.__args__ + result = obj.__result__ + if args != Ellipsis: + if isinstance(args, tuple): + args = list(args) + else: + args = [args] + else: + (*args, result) = obj.__args__ + if len(args) == 1 and args[0] is Ellipsis: + args = Ellipsis + else: + args = list(args) + initargs = (Callable, (args, result)) + else: # pragma: no cover + raise pickle.PicklingError( + "Cloudpickle Error: Unknown type {}".format(type(obj)) + ) + self.save_reduce(_create_parametrized_type_hint, initargs, obj=obj) + # Tornado support @@ -1091,7 +1077,7 @@ def dumps(obj, protocol=None): Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure compatibility with older versions of Python. """ - file = StringIO() + file = BytesIO() try: cp = CloudPickler(file, protocol=protocol) cp.dump(obj) @@ -1114,16 +1100,10 @@ def subimport(name): def dynamic_subimport(name, vars): mod = types.ModuleType(name) mod.__dict__.update(vars) + mod.__dict__['__builtins__'] = builtins.__dict__ return mod -# restores function attributes -def _restore_attr(obj, attr): - for key, val in attr.items(): - setattr(obj, key, val) - return obj - - def _gen_ellipsis(): return Ellipsis @@ -1213,6 +1193,13 @@ def _fill_function(*args): func.__qualname__ = state['qualname'] if 'kwdefaults' in state: func.__kwdefaults__ = state['kwdefaults'] + # _cloudpickle_subimports is a set of submodules that must be loaded for + # the pickled function to work correctly at unpickling time. Now that these + # submodules are depickled (hence imported), they can be removed from the + # object's state (the object state only served as a reference holder to + # these submodules) + if '_cloudpickle_submodules' in state: + state.pop('_cloudpickle_submodules') cells = func.__closure__ if cells is not None: @@ -1265,7 +1252,10 @@ class id will also reuse this class definition. The "extra" variable is meant to be a dict (or None) that can be used for forward compatibility shall the need arise. 
""" - skeleton_class = type_constructor(name, bases, type_kwargs) + skeleton_class = types.new_class( + name, bases, {'metaclass': type_constructor}, + lambda ns: ns.update(type_kwargs) + ) return _lookup_class_or_track(class_tracker_id, skeleton_class) @@ -1312,10 +1302,7 @@ class id will also reuse this enum definition. classdict[member_name] = member_value enum_class = metacls.__new__(metacls, name, bases, classdict) enum_class.__module__ = module - - # Python 2.7 compat - if qualname is not None: - enum_class.__qualname__ = qualname + enum_class.__qualname__ = qualname return _lookup_class_or_track(class_tracker_id, enum_class) @@ -1329,34 +1316,69 @@ def _is_dynamic(module): if hasattr(module, '__file__'): return False - if hasattr(module, '__spec__'): - return module.__spec__ is None - else: - # Backward compat for Python 2 - import imp - try: - path = None - for part in module.__name__.split('.'): - if path is not None: - path = [path] - f, path, description = imp.find_module(part, path) - if f is not None: - f.close() - except ImportError: - return True + if module.__spec__ is not None: return False + # In PyPy, Some built-in modules such as _codecs can have their + # __spec__ attribute set to None despite being imported. For such + # modules, the ``_find_spec`` utility of the standard library is used. + parent_name = module.__name__.rpartition('.')[0] + if parent_name: # pragma: no cover + # This code handles the case where an imported package (and not + # module) remains with __spec__ set to None. It is however untested + # as no package in the PyPy stdlib has __spec__ set to None after + # it is imported. + try: + parent = sys.modules[parent_name] + except KeyError: + msg = "parent {!r} not in sys.modules" + raise ImportError(msg.format(parent_name)) + else: + pkgpath = parent.__path__ + else: + pkgpath = None + return _find_spec(module.__name__, pkgpath, module) is None -""" Use copy_reg to extend global pickle definitions """ -if sys.version_info < (3, 4): # pragma: no branch - method_descriptor = type(str.upper) +def _make_typevar(name, bound, constraints, covariant, contravariant, + class_tracker_id): + tv = typing.TypeVar( + name, *constraints, bound=bound, + covariant=covariant, contravariant=contravariant + ) + if class_tracker_id is not None: + return _lookup_class_or_track(class_tracker_id, tv) + else: # pragma: nocover + # Only for Python 3.5.3 compat. + return tv - def _reduce_method_descriptor(obj): - return (getattr, (obj.__objclass__, obj.__name__)) +def _decompose_typevar(obj): try: - import copy_reg as copyreg - except ImportError: - import copyreg - copyreg.pickle(method_descriptor, _reduce_method_descriptor) \ No newline at end of file + class_tracker_id = _get_or_create_tracker_id(obj) + except TypeError: # pragma: nocover + # TypeVar instances are not weakref-able in Python 3.5.3 + class_tracker_id = None + return ( + obj.__name__, obj.__bound__, obj.__constraints__, + obj.__covariant__, obj.__contravariant__, + class_tracker_id, + ) + + +def _typevar_reduce(obj): + # TypeVar instances have no __qualname__ hence we pass the name explicitly. 
+ module_and_name = _lookup_module_and_qualname(obj, name=obj.__name__) + if module_and_name is None: + return (_make_typevar, _decompose_typevar(obj)) + return (getattr, module_and_name) + + +def _get_bases(typ): + if hasattr(typ, '__orig_bases__'): + # For generic types (see PEP 560) + bases_attr = '__orig_bases__' + else: + # For regular class objects + bases_attr = '__bases__' + return getattr(typ, bases_attr) \ No newline at end of file diff --git a/python/pyspark/cloudpickle_fast.py b/python/pyspark/cloudpickle_fast.py new file mode 100644 index 0000000000000..0413ea57b2438 --- /dev/null +++ b/python/pyspark/cloudpickle_fast.py @@ -0,0 +1,557 @@ +""" +New, fast version of the CloudPickler. + +This new CloudPickler class can now extend the fast C Pickler instead of the +previous Python implementation of the Pickler class. Because this functionality +is only available for Python versions 3.8+, a lot of backward-compatibility +code is also removed. + +Note that the C Pickler sublassing API is CPython-specific. Therefore, some +guards present in cloudpickle.py that were written to handle PyPy specificities +are not present in cloudpickle_fast.py +""" +import abc +import copyreg +import io +import itertools +import logging +import _pickle +import pickle +import sys +import types +import weakref +import typing + +from _pickle import Pickler + +from .cloudpickle import ( + _is_dynamic, _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, + _find_imported_submodules, _get_cell_contents, _is_importable_by_name, _builtin_type, + Enum, _get_or_create_tracker_id, _make_skeleton_class, _make_skeleton_enum, + _extract_class_dict, dynamic_subimport, subimport, _typevar_reduce, _get_bases, +) + +load, loads = _pickle.load, _pickle.loads + + +# Shorthands similar to pickle.dump/pickle.dumps +def dump(obj, file, protocol=None, buffer_callback=None): + """Serialize obj as bytes streamed into file + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed + between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback).dump(obj) + + +def dumps(obj, protocol=None, buffer_callback=None): + """Serialize obj as a string of bytes allocated in memory + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed + between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. 
+ """ + with io.BytesIO() as file: + cp = CloudPickler(file, protocol=protocol, buffer_callback=buffer_callback) + cp.dump(obj) + return file.getvalue() + + +# COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS +# ------------------------------------------------- + +def _class_getnewargs(obj): + type_kwargs = {} + if "__slots__" in obj.__dict__: + type_kwargs["__slots__"] = obj.__slots__ + + __dict__ = obj.__dict__.get('__dict__', None) + if isinstance(__dict__, property): + type_kwargs['__dict__'] = __dict__ + + return (type(obj), obj.__name__, _get_bases(obj), type_kwargs, + _get_or_create_tracker_id(obj), None) + + +def _enum_getnewargs(obj): + members = dict((e.name, e.value) for e in obj) + return (obj.__bases__, obj.__name__, obj.__qualname__, members, + obj.__module__, _get_or_create_tracker_id(obj), None) + + +# COLLECTION OF OBJECTS RECONSTRUCTORS +# ------------------------------------ +def _file_reconstructor(retval): + return retval + + +# COLLECTION OF OBJECTS STATE GETTERS +# ----------------------------------- +def _function_getstate(func): + # - Put func's dynamic attributes (stored in func.__dict__) in state. These + # attributes will be restored at unpickling time using + # f.__dict__.update(state) + # - Put func's members into slotstate. Such attributes will be restored at + # unpickling time by iterating over slotstate and calling setattr(func, + # slotname, slotvalue) + slotstate = { + "__name__": func.__name__, + "__qualname__": func.__qualname__, + "__annotations__": func.__annotations__, + "__kwdefaults__": func.__kwdefaults__, + "__defaults__": func.__defaults__, + "__module__": func.__module__, + "__doc__": func.__doc__, + "__closure__": func.__closure__, + } + + f_globals_ref = _extract_code_globals(func.__code__) + f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in + func.__globals__} + + closure_values = ( + list(map(_get_cell_contents, func.__closure__)) + if func.__closure__ is not None else () + ) + + # Extract currently-imported submodules used by func. Storing these modules + # in a smoke _cloudpickle_subimports attribute of the object's state will + # trigger the side effect of importing these modules at unpickling time + # (which is necessary for func to work correctly once depickled) + slotstate["_cloudpickle_submodules"] = _find_imported_submodules( + func.__code__, itertools.chain(f_globals.values(), closure_values)) + slotstate["__globals__"] = f_globals + + state = func.__dict__ + return state, slotstate + + +def _class_getstate(obj): + clsdict = _extract_class_dict(obj) + clsdict.pop('__weakref__', None) + + if issubclass(type(obj), abc.ABCMeta): + # If obj is an instance of an ABCMeta subclass, dont pickle the + # cache/negative caches populated during isinstance/issubclass + # checks, but pickle the list of registered subclasses of obj. 
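``_function_getstate`` splits a function's state in two: ``func.__dict__``
travels as ``state``, while managed attributes such as ``__doc__`` travel as
``slotstate``. Both end up restored on the unpickled function, as in this
minimal sketch (module path assumed from this patch):

```
import pickle

from pyspark import cloudpickle  # module path assumed from this patch

def answer():
    return 42

answer.version = "1.0"          # lands in func.__dict__, i.e. in `state`
answer.__doc__ = "The answer."  # captured through `slotstate`

clone = pickle.loads(cloudpickle.dumps(answer))
assert clone() == 42
assert clone.version == "1.0" and clone.__doc__ == "The answer."
```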
+ clsdict.pop('_abc_impl', None) + (registry, _, _, _) = abc._get_dump(obj) + clsdict["_abc_impl"] = [subclass_weakref() + for subclass_weakref in registry] + + if "__slots__" in clsdict: + # pickle string length optimization: member descriptors of obj are + # created automatically from obj's __slots__ attribute, no need to + # save them in obj's state + if isinstance(obj.__slots__, str): + clsdict.pop(obj.__slots__) + else: + for k in obj.__slots__: + clsdict.pop(k, None) + + clsdict.pop('__dict__', None) # unpicklable property object + + return (clsdict, {}) + + +def _enum_getstate(obj): + clsdict, slotstate = _class_getstate(obj) + + members = dict((e.name, e.value) for e in obj) + # Cleanup the clsdict that will be passed to _rehydrate_skeleton_class: + # Those attributes are already handled by the metaclass. + for attrname in ["_generate_next_value_", "_member_names_", + "_member_map_", "_member_type_", + "_value2member_map_"]: + clsdict.pop(attrname, None) + for member in members: + clsdict.pop(member) + # Special handling of Enum subclasses + return clsdict, slotstate + + +# COLLECTIONS OF OBJECTS REDUCERS +# ------------------------------- +# A reducer is a function taking a single argument (obj), and that returns a +# tuple with all the necessary data to re-construct obj. Apart from a few +# exceptions (list, dict, bytes, int, etc.), a reducer is necessary to +# correctly pickle an object. +# While many built-in objects (Exceptions objects, instances of the "object" +# class, etc), are shipped with their own built-in reducer (invoked using +# obj.__reduce__), some do not. The following methods were created to "fill +# these holes". + +def _code_reduce(obj): + """codeobject reducer""" + args = ( + obj.co_argcount, obj.co_posonlyargcount, + obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, + obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, + obj.co_varnames, obj.co_filename, obj.co_name, + obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, + obj.co_cellvars + ) + return types.CodeType, args + + +def _cell_reduce(obj): + """Cell (containing values of a function's free variables) reducer""" + try: + obj.cell_contents + except ValueError: # cell is empty + return types.CellType, () + else: + return types.CellType, (obj.cell_contents,) + + +def _classmethod_reduce(obj): + orig_func = obj.__func__ + return type(obj), (orig_func,) + + +def _file_reduce(obj): + """Save a file""" + import io + + if not hasattr(obj, "name") or not hasattr(obj, "mode"): + raise pickle.PicklingError( + "Cannot pickle files that do not map to an actual file" + ) + if obj is sys.stdout: + return getattr, (sys, "stdout") + if obj is sys.stderr: + return getattr, (sys, "stderr") + if obj is sys.stdin: + raise pickle.PicklingError("Cannot pickle standard input") + if obj.closed: + raise pickle.PicklingError("Cannot pickle closed files") + if hasattr(obj, "isatty") and obj.isatty(): + raise pickle.PicklingError( + "Cannot pickle files that map to tty objects" + ) + if "r" not in obj.mode and "+" not in obj.mode: + raise pickle.PicklingError( + "Cannot pickle files that are not opened for reading: %s" + % obj.mode + ) + + name = obj.name + + retval = io.StringIO() + + try: + # Read the whole file + curloc = obj.tell() + obj.seek(0) + contents = obj.read() + obj.seek(curloc) + except IOError: + raise pickle.PicklingError( + "Cannot pickle file %s as it cannot be read" % name + ) + retval.write(contents) + retval.seek(curloc) + + retval.name = name + return _file_reconstructor, (retval,) + + +def 
_getset_descriptor_reduce(obj): + return getattr, (obj.__objclass__, obj.__name__) + + +def _mappingproxy_reduce(obj): + return types.MappingProxyType, (dict(obj),) + + +def _memoryview_reduce(obj): + return bytes, (obj.tobytes(),) + + +def _module_reduce(obj): + if _is_dynamic(obj): + obj.__dict__.pop('__builtins__', None) + return dynamic_subimport, (obj.__name__, vars(obj)) + else: + return subimport, (obj.__name__,) + + +def _method_reduce(obj): + return (types.MethodType, (obj.__func__, obj.__self__)) + + +def _logger_reduce(obj): + return logging.getLogger, (obj.name,) + + +def _root_logger_reduce(obj): + return logging.getLogger, () + + +def _property_reduce(obj): + return property, (obj.fget, obj.fset, obj.fdel, obj.__doc__) + + +def _weakset_reduce(obj): + return weakref.WeakSet, (list(obj),) + + +def _dynamic_class_reduce(obj): + """ + Save a class that can't be stored as module global. + + This method is used to serialize classes that are defined inside + functions, or that otherwise can't be serialized as attribute lookups + from global modules. + """ + if Enum is not None and issubclass(obj, Enum): + return ( + _make_skeleton_enum, _enum_getnewargs(obj), _enum_getstate(obj), + None, None, _class_setstate + ) + else: + return ( + _make_skeleton_class, _class_getnewargs(obj), _class_getstate(obj), + None, None, _class_setstate + ) + + +def _class_reduce(obj): + """Select the reducer depending on the dynamic nature of the class obj""" + if obj is type(None): # noqa + return type, (None,) + elif obj is type(Ellipsis): + return type, (Ellipsis,) + elif obj is type(NotImplemented): + return type, (NotImplemented,) + elif obj in _BUILTIN_TYPE_NAMES: + return _builtin_type, (_BUILTIN_TYPE_NAMES[obj],) + elif not _is_importable_by_name(obj): + return _dynamic_class_reduce(obj) + return NotImplemented + + +# COLLECTIONS OF OBJECTS STATE SETTERS +# ------------------------------------ +# state setters are called at unpickling time, once the object is created and +# it has to be updated to how it was at unpickling time. + + +def _function_setstate(obj, state): + """Update the state of a dynaamic function. + + As __closure__ and __globals__ are readonly attributes of a function, we + cannot rely on the native setstate routine of pickle.load_build, that calls + setattr on items of the slotstate. Instead, we have to modify them inplace. + """ + state, slotstate = state + obj.__dict__.update(state) + + obj_globals = slotstate.pop("__globals__") + obj_closure = slotstate.pop("__closure__") + # _cloudpickle_subimports is a set of submodules that must be loaded for + # the pickled function to work correctly at unpickling time. 
Now that these
+    # submodules are depickled (hence imported), they can be removed from the
+    # object's state (the object state only served as a reference holder to
+    # these submodules)
+    slotstate.pop("_cloudpickle_submodules")
+
+    obj.__globals__.update(obj_globals)
+    obj.__globals__["__builtins__"] = __builtins__
+
+    if obj_closure is not None:
+        for i, cell in enumerate(obj_closure):
+            try:
+                value = cell.cell_contents
+            except ValueError:  # cell is empty
+                continue
+            obj.__closure__[i].cell_contents = value
+
+    for k, v in slotstate.items():
+        setattr(obj, k, v)
+
+
+def _class_setstate(obj, state):
+    state, slotstate = state
+    registry = None
+    for attrname, attr in state.items():
+        if attrname == "_abc_impl":
+            registry = attr
+        else:
+            setattr(obj, attrname, attr)
+    if registry is not None:
+        for subclass in registry:
+            obj.register(subclass)
+
+    return obj
+
+
+class CloudPickler(Pickler):
+    """Fast C Pickler extension with additional reducing routines.
+
+    CloudPickler's extensions exist in two places:
+
+    * its dispatch_table, containing reducers that are called only if ALL
+      built-in saving functions were previously discarded.
+    * a special callback named "reducer_override", invoked before the standard
+      function/class builtin-saving method (save_global), to serialize dynamic
+      functions.
+    """
+
+    # cloudpickle's own dispatch_table, containing the additional set of
+    # objects (compared to the standard library pickle) that cloudpickle can
+    # serialize.
+    dispatch = {}
+    dispatch[classmethod] = _classmethod_reduce
+    dispatch[io.TextIOWrapper] = _file_reduce
+    dispatch[logging.Logger] = _logger_reduce
+    dispatch[logging.RootLogger] = _root_logger_reduce
+    dispatch[memoryview] = _memoryview_reduce
+    dispatch[property] = _property_reduce
+    dispatch[staticmethod] = _classmethod_reduce
+    dispatch[types.CellType] = _cell_reduce
+    dispatch[types.CodeType] = _code_reduce
+    dispatch[types.GetSetDescriptorType] = _getset_descriptor_reduce
+    dispatch[types.ModuleType] = _module_reduce
+    dispatch[types.MethodType] = _method_reduce
+    dispatch[types.MappingProxyType] = _mappingproxy_reduce
+    dispatch[weakref.WeakSet] = _weakset_reduce
+    dispatch[typing.TypeVar] = _typevar_reduce
+
+    def __init__(self, file, protocol=None, buffer_callback=None):
+        if protocol is None:
+            protocol = DEFAULT_PROTOCOL
+        Pickler.__init__(self, file, protocol=protocol, buffer_callback=buffer_callback)
+        # map functions __globals__ attribute ids, to ensure that functions
+        # sharing the same global namespace at pickling time also share their
+        # global namespace at unpickling time.
+        self.globals_ref = {}
+
+        # Take into account potential custom reducers registered by external
+        # modules
+        self.dispatch_table = copyreg.dispatch_table.copy()
+        self.dispatch_table.update(self.dispatch)
+        self.proto = int(protocol)
+
+    def reducer_override(self, obj):
+        """Type-agnostic reducing callback for functions and classes.
+
+        For performance reasons, subclasses of the C _pickle.Pickler class
+        cannot register custom reducers for functions and classes in the
+        dispatch_table. Reducers for such types must instead be implemented in
+        the special reducer_override method.
+
+        Note that this method will be called for any object except a few
+        builtin-types (int, lists, dicts etc.), which differs from reducers in
+        the Pickler's dispatch_table, each of them being invoked for objects of
+        a specific type only.
+
+        This property comes in handy for classes: although most classes are
+        instances of the ``type`` metaclass, some of them can be instances of
+        other custom metaclasses (such as enum.EnumMeta for example). In
+        particular, the metaclass will likely not be known in advance, and thus
+        cannot be special-cased using an entry in the dispatch_table.
+        reducer_override, among other things, allows us to register a reducer
+        that will be called for any class, independently of its type.
+
+
+        Notes:
+
+        * reducer_override has priority over dispatch_table-registered
+        reducers.
+        * reducer_override can be used to fix other limitations of cloudpickle
+        for other types that suffered from type-specific reducers, such as
+        Exceptions. See https://github.com/cloudpipe/cloudpickle/issues/248
+        """
+        t = type(obj)
+        try:
+            is_anyclass = issubclass(t, type)
+        except TypeError:  # t is not a class (old Boost; see SF #502085)
+            is_anyclass = False
+
+        if is_anyclass:
+            return _class_reduce(obj)
+        elif isinstance(obj, types.FunctionType):
+            return self._function_reduce(obj)
+        else:
+            # fallback to save_global, including the Pickler's dispatch_table
+            return NotImplemented
+
+    # function reducers are defined as instance methods of CloudPickler
+    # objects, as they rely on a CloudPickler attribute (globals_ref)
+    def _dynamic_function_reduce(self, func):
+        """Reduce a function that is not pickleable via attribute lookup."""
+        newargs = self._function_getnewargs(func)
+        state = _function_getstate(func)
+        return (types.FunctionType, newargs, state, None, None,
+                _function_setstate)
+
+    def _function_reduce(self, obj):
+        """Reducer for function objects.
+
+        If obj is a top-level attribute of a file-backed module, this reducer
+        returns NotImplemented, making the CloudPickler fall back to
+        traditional _pickle.Pickler routines to save obj. Otherwise, it reduces
+        obj using a custom cloudpickle reducer designed specifically to handle
+        dynamic functions.
+
+        As opposed to cloudpickle.py, there is no special handling for builtin
+        PyPy functions because cloudpickle_fast is CPython-specific.
+        """
+        if _is_importable_by_name(obj):
+            return NotImplemented
+        else:
+            return self._dynamic_function_reduce(obj)
+
+    def _function_getnewargs(self, func):
+        code = func.__code__
+
+        # base_globals represents the future global namespace of func at
+        # unpickling time. Looking it up and storing it in
+        # CloudPickler.globals_ref allows functions sharing the same globals
+        # at pickling time to also share them once unpickled, on one condition:
+        # since globals_ref is an attribute of a CloudPickler instance, and
+        # a new CloudPickler is created each time pickle.dump or
+        # pickle.dumps is called, functions also need to be saved within the
+        # same invocation of cloudpickle.dump/cloudpickle.dumps (for example:
+        # cloudpickle.dumps([f1, f2])). There is no such limitation when using
+        # CloudPickler.dump, as long as the multiple invocations are bound to
+        # the same CloudPickler.
+        base_globals = self.globals_ref.setdefault(id(func.__globals__), {})
+
+        if base_globals == {}:
+            # Add module attributes used to resolve relative imports
+            # instructions inside func.
+            for k in ["__package__", "__name__", "__path__", "__file__"]:
+                if k in func.__globals__:
+                    base_globals[k] = func.__globals__[k]
+
+        # Do not bind the free variables before the function is created to
+        # avoid infinite recursion.
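+        # (For example, for a function closing over one variable ``v``, a
+        # single empty cell is created here; _function_setstate later copies
+        # the pickled value of ``v`` into it, so even a closure that refers
+        # back to the function being rebuilt can be restored.)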
+ if func.__closure__ is None: + closure = None + else: + closure = tuple( + types.CellType() for _ in range(len(code.co_freevars))) + + return code, base_globals, None, None, closure + + def dump(self, obj): + try: + return Pickler.dump(self, obj) + except RuntimeError as e: + if "recursion" in e.args[0]: + msg = ( + "Could not pickle object as excessively deep recursion " + "required." + ) + raise pickle.PicklingError(msg) + else: + raise \ No newline at end of file
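A sketch of the globals sharing described in ``_function_getnewargs``,
assuming CPython 3.8+ (required for subclassing the C ``Pickler``) and the
``pyspark.cloudpickle_fast`` module path introduced by this patch:

```
import pickle

from pyspark.cloudpickle_fast import dumps  # module path assumed from patch

counter = 0

def bump():
    global counter
    counter += 1
    return counter

def peek():
    return counter

# Both functions are pickled in one dumps() call, so globals_ref maps them
# to the same base_globals dict and they keep sharing state once unpickled.
bump2, peek2 = pickle.loads(dumps([bump, peek]))
assert bump2() == 1
assert peek2() == 1
```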