import os
import sys
from array import array

import pytest

from dask.multiprocessing import get_context
from dask.sizeof import sizeof
from dask.utils import funcname

try:
    import pandas as pd

    from dask.dataframe._compat import PANDAS_GT_130
except ImportError:
    pd = None
    PANDAS_GT_130 = False

requires_pandas = pytest.mark.skipif(pd is None, reason="requires pandas")
requires_pandas_130 = pytest.mark.skipif(
    not PANDAS_GT_130, reason="requires pandas >= 1.3.0"
)


def test_base():
    assert sizeof(1) == sys.getsizeof(1)


def test_name():
    assert funcname(sizeof) == "sizeof"


def test_containers():
    assert sizeof([1, 2, [3]]) > (sys.getsizeof(3) * 3 + sys.getsizeof([]))


def test_bytes_like():
    assert 1000 <= sizeof(bytes(1000)) <= 2000
    assert 1000 <= sizeof(bytearray(1000)) <= 2000
    assert 1000 <= sizeof(memoryview(bytes(1000))) <= 2000
    assert 8000 <= sizeof(array("d", range(1000))) <= 9000


def test_numpy():
    np = pytest.importorskip("numpy")
    assert 8000 <= sizeof(np.empty(1000, dtype="f8")) <= 9000
    dt = np.dtype("f8")
    assert sizeof(dt) == sys.getsizeof(dt)


def test_numpy_0_strided():
    np = pytest.importorskip("numpy")
    x = np.broadcast_to(1, (100, 100, 100))
    assert sizeof(x) <= 8


@requires_pandas
def test_pandas():
    df = pd.DataFrame(
        {"x": [1, 2, 3], "y": ["a" * 100, "b" * 100, "c" * 100]}, index=[10, 20, 30]
    )
    assert sizeof(df) >= sizeof(df.x) + sizeof(df.y) - sizeof(df.index)
    assert sizeof(df.x) >= sizeof(df.index)
    assert sizeof(df.y) >= 100 * 3
    assert sizeof(df.index) >= 20

    assert isinstance(sizeof(df), int)
    assert isinstance(sizeof(df.x), int)
    assert isinstance(sizeof(df.index), int)


@requires_pandas
def test_pandas_contiguous_dtypes():
    """Two or more contiguous columns of the same dtype in the same DataFrame
    share the same underlying memory buffer, and thus have lower overhead.
    """
    df1 = pd.DataFrame([[1, 2.2], [3, 4.4]])
    df2 = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]])
    assert sizeof(df2) < sizeof(df1)


@requires_pandas
def test_pandas_multiindex():
    index = pd.MultiIndex.from_product([range(5), ["a", "b", "c", "d", "e"]])
    actual_size = sys.getsizeof(index)

    assert 0.5 * actual_size < sizeof(index) < 2 * actual_size
    assert isinstance(sizeof(index), int)


@requires_pandas
def test_pandas_repeated_column():
    df = pd.DataFrame({"x": list(range(10_000))})
    df2 = df[["x", "x", "x"]]
    df3 = pd.DataFrame({"x": list(range(10_000)), "y": list(range(10_000))})
    # Repeated references to the same column must not be counted repeatedly
    assert 80_000 < sizeof(df) < 85_000
    assert 80_000 < sizeof(df2) < 85_000
    assert 160_000 < sizeof(df3) < 165_000
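

# The entry-point test at the bottom of this module registers a sizeof handler
# from a plugin in a subprocess. As a minimal in-process sketch of the same
# mechanism: handlers can also be registered directly with `@sizeof.register`.
# `_Measured` and this test are illustrative additions, not part of the
# original suite.
def test_register_custom_sizeof():
    class _Measured:
        def __init__(self, nbytes):
            self.nbytes = nbytes

    @sizeof.register(_Measured)
    def sizeof_measured(obj):
        # Report the object's self-declared payload size
        return obj.nbytes

    assert sizeof(_Measured(123)) == 123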


def test_sparse_matrix():
    sparse = pytest.importorskip("scipy.sparse")
    sp = sparse.eye(10)
    # These are the 32-bit Python 2.7 values.
    assert sizeof(sp.todia()) >= 152
    assert sizeof(sp.tobsr()) >= 232
    assert sizeof(sp.tocoo()) >= 240
    assert sizeof(sp.tocsc()) >= 232
    assert sizeof(sp.tocsr()) >= 232
    assert sizeof(sp.todok()) >= 188
    assert sizeof(sp.tolil()) >= 204


@requires_pandas
@pytest.mark.parametrize("cls_name", ["Series", "DataFrame", "Index"])
@pytest.mark.parametrize(
    "dtype", [object, pytest.param("string[python]", marks=requires_pandas_130)]
)
def test_pandas_object_dtype(dtype, cls_name):
    cls = getattr(pd, cls_name)
    s1 = cls([f"x{i:3d}" for i in range(1000)], dtype=dtype)
    assert sizeof("x000") * 1000 < sizeof(s1) < 2 * sizeof("x000") * 1000

    x = "x" * 100_000
    y = "y" * 100_000
    z = "z" * 100_000
    w = "w" * 100_000

    # High duplication of references to the same object
    s2 = cls([x, y, z, w] * 1000, dtype=dtype)
    assert 400_000 < sizeof(s2) < 500_000

    # Low duplication of references to the same object
    s3 = cls([x, y, z, w], dtype=dtype)
    s4 = cls([x, y, z, x], dtype=dtype)
    s5 = cls([x, x, x, x], dtype=dtype)
    assert sizeof(s5) < sizeof(s4) < sizeof(s3)


@requires_pandas
@pytest.mark.parametrize(
    "dtype", [object, pytest.param("string[python]", marks=requires_pandas_130)]
)
def test_dataframe_object_dtype(dtype):
    x = "x" * 100_000
    y = "y" * 100_000
    z = "z" * 100_000
    w = "w" * 100_000

    # High duplication of references to the same object, across different columns
    objs = [x, y, z, w]
    df1 = pd.DataFrame([objs * 3] * 1000, dtype=dtype)
    assert 400_000 < sizeof(df1) < 550_000

    # Low duplication of references to the same object, across different columns
    df2 = pd.DataFrame([[x, y], [z, w]], dtype=dtype)
    df3 = pd.DataFrame([[x, y], [z, x]], dtype=dtype)
    df4 = pd.DataFrame([[x, x], [x, x]], dtype=dtype)
    assert sizeof(df4) < sizeof(df3) < sizeof(df2)


@requires_pandas_130
@pytest.mark.parametrize("cls_name", ["Series", "DataFrame", "Index"])
def test_pandas_string_arrow_dtype(cls_name):
    pytest.importorskip("pyarrow")
    cls = getattr(pd, cls_name)
    s = cls(["x" * 100_000, "y" * 50_000], dtype="string[pyarrow]")
    assert 150_000 < sizeof(s) < 155_000


@requires_pandas
def test_pandas_empty():
    df = pd.DataFrame(
        {"x": [1, 2, 3], "y": ["a" * 100, "b" * 100, "c" * 100]}, index=[10, 20, 30]
    )
    empty = df.head(0)

    assert sizeof(empty) > 0
    assert sizeof(empty.x) > 0
    assert sizeof(empty.y) > 0
    assert sizeof(empty.index) > 0


@requires_pandas
def test_pyarrow_table():
    pa = pytest.importorskip("pyarrow")

    df = pd.DataFrame(
        {"x": [1, 2, 3], "y": ["a" * 100, "b" * 100, "c" * 100]}, index=[10, 20, 30]
    )
    table = pa.Table.from_pandas(df)

    assert sizeof(table) > sizeof(table.schema.metadata)
    assert isinstance(sizeof(table), int)
    assert isinstance(sizeof(table.columns[0]), int)
    assert isinstance(sizeof(table.columns[1]), int)
    assert isinstance(sizeof(table.columns[2]), int)

    empty = pa.Table.from_pandas(df.head(0))

    assert sizeof(empty) > sizeof(empty.schema.metadata)
    assert sizeof(empty.columns[0]) > 0
    assert sizeof(empty.columns[1]) > 0
    assert sizeof(empty.columns[2]) > 0


def test_dict():
    np = pytest.importorskip("numpy")
    x = np.ones(10000)
    assert sizeof({"x": x}) > x.nbytes
    assert sizeof({"x": [x]}) > x.nbytes
    assert sizeof({"x": [{"y": x}]}) > x.nbytes

    d = {i: x for i in range(100)}
    assert sizeof(d) > x.nbytes * 100
    assert isinstance(sizeof(d), int)
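

# A small hedged sketch, not part of the original suite: for a type with no
# registered handler, `sizeof` is assumed to fall back to `sys.getsizeof`,
# consistent with what `test_base` shows for plain ints.
def test_sizeof_default_fallback():
    class _Plain:
        pass

    obj = _Plain()
    # Assumption: the default object handler simply delegates to sys.getsizeof
    assert sizeof(obj) == sys.getsizeof(obj)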


def _get_sizeof_on_path(path, size):
    sys.path.append(os.fsdecode(path))

    # Dask will have already called _register_entry_point_plugins
    # before we can modify sys.path, so we re-register here.
    import dask.sizeof

    dask.sizeof._register_entry_point_plugins()

    import class_impl

    cls = class_impl.Impl(size)
    return sizeof(cls)


def test_register_backend_entrypoint(tmp_path):
    # Create a special sizeof implementation for a dummy class
    (tmp_path / "impl_sizeof.py").write_bytes(
        b"def sizeof_plugin(sizeof):\n"
        b'    print("REG")\n'
        b'    @sizeof.register_lazy("class_impl")\n'
        b"    def register_impl():\n"
        b"        import class_impl\n"
        b"        @sizeof.register(class_impl.Impl)\n"
        b"        def sizeof_impl(obj):\n"
        b"            return obj.size\n"
    )

    # Define a dummy class that possesses a size attribute
    (tmp_path / "class_impl.py").write_bytes(
        b"class Impl:\n    def __init__(self, size):\n        self.size = size"
    )
    dist_info = tmp_path / "impl_sizeof-0.0.0.dist-info"
    dist_info.mkdir()
    (dist_info / "entry_points.txt").write_bytes(
        b"[dask.sizeof]\nimpl = impl_sizeof:sizeof_plugin\n"
    )

    # Run in a fresh subprocess so the entry point is discovered on import
    with get_context().Pool(1) as pool:
        assert (
            pool.apply(_get_sizeof_on_path, args=(tmp_path, 3_14159265)) == 3_14159265
        )
    pool.join()
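

# For reference, a real plugin package would declare the same entry point in
# its packaging metadata rather than via a hand-written dist-info directory.
# A sketch of the assumed pyproject.toml equivalent of the entry_points.txt
# generated above (the "dask.sizeof" group name comes from that file):
#
#     [project.entry-points."dask.sizeof"]
#     impl = "impl_sizeof:sizeof_plugin"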