ENH: derive from C-pickler for fast serialization (#253)
pierreglaser authored and ogrisel committed Jun 7, 2019
1 parent f3c3aea commit 167e163
Showing 8 changed files with 675 additions and 80 deletions.
11 changes: 8 additions & 3 deletions .travis.yml
@@ -13,8 +13,8 @@ matrix:
dist: trusty
python: "pypy3"
- os: linux
  if: commit_message =~ /(\[ci python-nightly\])/
  env: PYTHON_NIGHTLY=1
  python: 3.7
- os: linux
  python: 3.7
- os: linux
@@ -91,8 +91,12 @@ install:
- $PYTHON_EXE -m pip install .
- $PYTHON_EXE -m pip install --upgrade -r dev-requirements.txt
- $PYTHON_EXE -m pip install tornado
- if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* && "$PYTHON_NIGHTLY" != 1 ]]; then
    $PYTHON_EXE -m pip install numpy scipy;
- if [[ $TRAVIS_PYTHON_VERSION != 'pypy'* ]]; then
    if [[ "$PYTHON_NIGHTLY" == "1" ]]; then
      $PYTHON_EXE -m pip install git+https://github.com/cython/cython git+https://github.com/numpy/numpy;
    else
      $PYTHON_EXE -m pip install numpy scipy;
    fi
  fi
- if [[ $PROJECT != "" ]]; then
    $PYTHON_EXE -m pip install $TEST_REQUIREMENTS;
@@ -126,5 +130,6 @@ script:
fi
fi
after_success:
- pip install coverage codecov
- coverage combine --append
- codecov
5 changes: 5 additions & 0 deletions CHANGES.md
@@ -1,6 +1,11 @@
1.2.0
=====

- Leverage the C-accelerated Pickler's new subclassing API (available in
  Python 3.8) in cloudpickle. This allows cloudpickle to pickle Python
  objects up to 30 times faster.
  ([issue #253](https://github.com/cloudpipe/cloudpickle/pull/253))

- Support pickling of classmethod and staticmethod objects in Python 2.
  ([issue #262](https://github.com/cloudpipe/cloudpickle/pull/262))

6 changes: 6 additions & 0 deletions cloudpickle/__init__.py
@@ -1,5 +1,11 @@
from __future__ import absolute_import

import sys
import pickle


from cloudpickle.cloudpickle import *
if sys.version_info[:2] >= (3, 8):
    from cloudpickle.cloudpickle_fast import CloudPickler, dumps, dump

__version__ = '1.2.0.dev0'
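The speedup described in the changelog comes from the `reducer_override` hook that CPython 3.8 exposes to subclasses of the C-accelerated `pickle.Pickler`, which is what `cloudpickle_fast.CloudPickler` builds on. The snippet below is a minimal, hypothetical sketch of that hook (the `Greeting` class and the `dumps` helper are invented for illustration), not the actual CloudPickler implementation:

```
import io
import pickle


class Greeting:
    def __init__(self, text):
        self.text = text


class FastPickler(pickle.Pickler):
    # reducer_override (Python 3.8+) is consulted before the normal reduction
    # machinery for most objects (simple built-ins are skipped for speed),
    # while the serialization loop itself stays in C.
    def reducer_override(self, obj):
        if isinstance(obj, Greeting):
            # Return a (callable, args) pair, exactly like __reduce__ would.
            return Greeting, (obj.text,)
        # NotImplemented falls back to the default C pickling behaviour.
        return NotImplemented


def dumps(obj, protocol=pickle.HIGHEST_PROTOCOL):
    buf = io.BytesIO()
    FastPickler(buf, protocol=protocol).dump(obj)
    return buf.getvalue()


assert pickle.loads(dumps(Greeting("hi"))).text == "hi"
```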
161 changes: 89 additions & 72 deletions cloudpickle/cloudpickle.py
@@ -102,6 +102,8 @@
PY2 = False
from importlib._bootstrap import _find_spec

_extract_code_globals_cache = weakref.WeakKeyDictionary()


def _ensure_tracking(class_def):
with _DYNAMIC_CLASS_TRACKER_LOCK:
@@ -195,6 +197,78 @@ def _is_global(obj, name=None):
return obj2 is obj


def _extract_code_globals(co):
    """
    Find all global names read or written to by codeblock co
    """
    out_names = _extract_code_globals_cache.get(co)
    if out_names is None:
        names = co.co_names
        out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}

        # Declaring a function inside another one using the "def ..."
        # syntax generates a constant code object corresponding to the
        # nested function. As the nested function may itself need global
        # variables, we need to introspect its code, extract its globals
        # (looking for code objects in its co_consts attribute) and add
        # the result to out_names.
        if co.co_consts:
            for const in co.co_consts:
                if isinstance(const, types.CodeType):
                    out_names |= _extract_code_globals(const)

        _extract_code_globals_cache[co] = out_names

    return out_names
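
A brief usage sketch (not part of the diff, and assuming it runs in a module where `_extract_code_globals` above is defined): globals referenced only inside a nested function are still reported, because the helper recurses through `co_consts`:

```
import math


def outer():
    def inner():
        return math.pi  # 'math' is read via LOAD_GLOBAL in a nested code object
    return inner


# 'math' never appears in outer's own bytecode, but it is reported for outer's
# code object because _extract_code_globals recurses into nested code objects.
assert "math" in _extract_code_globals(outer.__code__)
```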


def _find_imported_submodules(code, top_level_dependencies):
    """
    Find currently imported submodules used by a function.

    Submodules used by a function need to be detected and referenced for the
    function to work correctly at depickling time. Because submodules can be
    referenced as attributes of their parent package (``package.submodule``),
    we need a special introspection technique that does not rely on
    GLOBAL-related opcodes to find references to them in a code object.

    Example:
    ```
    import concurrent.futures
    import cloudpickle
    def func():
        x = concurrent.futures.ThreadPoolExecutor
    if __name__ == '__main__':
        cloudpickle.dumps(func)
    ```
    The globals extracted by cloudpickle in the function's state include the
    concurrent package, but not its submodule (here, concurrent.futures),
    which is the module actually used by func. _find_imported_submodules will
    detect the usage of concurrent.futures. Saving this module alongside func
    ensures that calling the depickled func does not fail because
    concurrent.futures was never imported.
    """

    subimports = []
    # check if any known dependency is an imported package
    for x in top_level_dependencies:
        if (isinstance(x, types.ModuleType) and
                hasattr(x, '__package__') and x.__package__):
            # check if the package has any currently loaded sub-imports
            prefix = x.__name__ + '.'
            # A concurrent thread could mutate sys.modules, so make sure we
            # iterate over a copy to avoid exceptions
            for name in list(sys.modules):
                # Older versions of pytest will add a "None" module to
                # sys.modules.
                if name is not None and name.startswith(prefix):
                    # check whether the function can address the sub-module
                    tokens = set(name[len(prefix):].split('.'))
                    if not tokens - set(code.co_names):
                        subimports.append(sys.modules[name])
    return subimports
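
Similarly, a small hypothetical usage sketch for `_find_imported_submodules` (again assuming it runs where the helper is defined): `import concurrent.futures` only binds the top-level name `concurrent`, and the helper recovers the submodule from `sys.modules` because `futures` appears among the code object's names:

```
import concurrent.futures  # binds only the name 'concurrent' in this namespace


def func():
    return concurrent.futures.ThreadPoolExecutor


# top_level_dependencies would normally be the values of func's globals; here
# it is just the 'concurrent' package. The loaded submodule is matched by the
# 'concurrent.' prefix and by 'futures' being present in func's co_names.
found = _find_imported_submodules(func.__code__, [concurrent])
assert concurrent.futures in found
```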


def _make_cell_set_template_code():
"""Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF
@@ -493,54 +567,6 @@ def save_pypy_builtin_func(self, obj):
obj.__dict__)
self.save_reduce(*rv, obj=obj)


    def _save_subimports(self, code, top_level_dependencies):
        """
        Save submodules used by a function but not listed in its globals.
        In the example below:
        ```
        import concurrent.futures
        import cloudpickle
        def func():
            x = concurrent.futures.ThreadPoolExecutor
        if __name__ == '__main__':
            cloudpickle.dumps(func)
        ```
        the globals extracted by cloudpickle in the function's state include
        the concurrent module, but not its submodule (here,
        concurrent.futures), which is the module used by func.
        To ensure that calling the depickled function does not raise an
        AttributeError, this function looks for any currently loaded submodule
        that the function uses and whose parent is present in the function
        globals, and saves it before saving the function.
        """
        # check if any known dependency is an imported package
        for x in top_level_dependencies:
            if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__:
                # check if the package has any currently loaded sub-imports
                prefix = x.__name__ + '.'
                # A concurrent thread could mutate sys.modules,
                # make sure we iterate over a copy to avoid exceptions
                for name in list(sys.modules):
                    # Older versions of pytest will add a "None" module to sys.modules.
                    if name is not None and name.startswith(prefix):
                        # check whether the function can address the sub-module
                        tokens = set(name[len(prefix):].split('.'))
                        if not tokens - set(code.co_names):
                            # ensure unpickler executes this import
                            self.save(sys.modules[name])
                            # then discards the reference to it
                            self.write(pickle.POP)

def _save_dynamic_enum(self, obj, clsdict):
"""Special handling for dynamic Enum subclasses
@@ -676,7 +702,12 @@ def save_function_tuple(self, func):
save(_fill_function) # skeleton function updater
write(pickle.MARK) # beginning of tuple that _fill_function expects

self._save_subimports(
# Extract currently-imported submodules used by func. Storing these
# modules in a '_cloudpickle_submodules' entry of the object's state will
# trigger the side effect of importing these modules at unpickling time
# (which is necessary for func to work correctly once depickled)
submodules = _find_imported_submodules(
code,
itertools.chain(f_globals.values(), closure_values or ()),
)
@@ -700,6 +731,7 @@ def save_function_tuple(self, func):
'module': func.__module__,
'name': func.__name__,
'doc': func.__doc__,
'_cloudpickle_submodules': submodules
}
if hasattr(func, '__annotations__') and sys.version_info >= (3, 4):
state['annotations'] = func.__annotations__
@@ -711,28 +743,6 @@ def save_function_tuple(self, func):
write(pickle.TUPLE)
write(pickle.REDUCE) # applies _fill_function on the tuple
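
To see the end-to-end effect of the `_cloudpickle_submodules` state entry, here is a hedged round-trip sketch (not part of the diff): modules are pickled by reference, so embedding them in the function state forces the standard unpickler to import them before the reconstructed function can be called:

```
import pickle

import concurrent.futures
import cloudpickle


def func():
    return concurrent.futures.ThreadPoolExecutor


payload = cloudpickle.dumps(func)
# In a fresh process where concurrent.futures has never been imported,
# unpickling the payload re-imports it while restoring the function state,
# so the call below cannot fail with an AttributeError on 'futures'.
restored = pickle.loads(payload)
assert restored() is concurrent.futures.ThreadPoolExecutor
```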

    _extract_code_globals_cache = weakref.WeakKeyDictionary()

    @classmethod
    def extract_code_globals(cls, co):
        """
        Find all globals names read or written to by codeblock co
        """
        out_names = cls._extract_code_globals_cache.get(co)
        if out_names is None:
            names = co.co_names
            out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}

            # see if nested function have any global refs
            if co.co_consts:
                for const in co.co_consts:
                    if isinstance(const, types.CodeType):
                        out_names |= cls.extract_code_globals(const)

            cls._extract_code_globals_cache[co] = out_names

        return out_names

def extract_func_data(self, func):
"""
Turn the function into a tuple of data necessary to recreate it:
@@ -741,7 +751,7 @@ def extract_func_data(self, func):
code = func.__code__

# extract all global ref's
func_global_refs = self.extract_code_globals(code)
func_global_refs = _extract_code_globals(code)

# process all variables referenced by global environment
f_globals = {}
@@ -1202,6 +1212,13 @@ def _fill_function(*args):
func.__qualname__ = state['qualname']
if 'kwdefaults' in state:
func.__kwdefaults__ = state['kwdefaults']
# _cloudpickle_submodules is a list of submodules that must be loaded for
# the pickled function to work correctly at unpickling time. Now that these
# submodules have been unpickled (and hence imported), they can be removed
# from the object's state (the state only served as a reference holder for
# these submodules)
if '_cloudpickle_submodules' in state:
    state.pop('_cloudpickle_submodules')

cells = func.__closure__
if cells is not None: