Commit 584ad739 authored by fima

Merge branch 'memory_warning' into 'main'

File size warning

See merge request !79
parents 8632c599 83e03a11
@@ -27,7 +27,7 @@ from PIL import Image, UnidentifiedImageError
import qim3d
from qim3d.io.logger import log
-from qim3d.utils.internal_tools import sizeof, stringify_path
+from qim3d.utils.internal_tools import sizeof, stringify_path, get_file_size
from qim3d.utils.system import Memory
@@ -67,12 +67,15 @@ class DataLoader:
            return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5, TXRM/TXM/XRM and NIfTI files)
            contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
                Default is None.
+            force_load (bool, optional): If False and the file to be loaded exceeds available memory, a MemoryError is raised.
+                If True, the error is downgraded to a warning and the loader attempts to load the file anyway. Default is False.
            dim_order (tuple, optional): The order of the dimensions in the volume. Default is (2,1,0) which corresponds to (z,y,x)
        """
        self.virtual_stack = kwargs.get("virtual_stack", False)
        self.dataset_name = kwargs.get("dataset_name", None)
        self.return_metadata = kwargs.get("return_metadata", False)
        self.contains = kwargs.get("contains", None)
+        self.force_load = kwargs.get("force_load", False)
        self.dim_order = kwargs.get("dim_order", (2, 1, 0))

    def load_tiff(self, path):
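The new `force_load` flag flows through the public `load` wrapper further down in this diff; a minimal usage sketch (the file path is hypothetical):

```python
import qim3d

# With force_load=True, a file larger than the available memory only
# triggers a warning and the loader still attempts to read it.
vol = qim3d.io.load("data/large_scan.tif", force_load=True)
```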
@@ -258,8 +261,10 @@ class DataLoader:
            )

        if self.virtual_stack:
-            if not path.endswith('.txm'):
-                log.warning("Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly")
+            if not path.endswith(".txm"):
+                log.warning(
+                    "Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly"
+                )

            # Get metadata
            ole = olefile.OleFileIO(path)
@@ -268,23 +273,29 @@ class DataLoader:
            # Compute data offsets in bytes for each slice
            offsets = _get_ole_offsets(ole)

-            if len(offsets)!=metadata['number_of_images']:
-                raise ValueError(f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}')
+            if len(offsets) != metadata["number_of_images"]:
+                raise ValueError(
+                    f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}'
+                )

            slices = []
            for _, offset in offsets.items():
                slices.append(
                    np.memmap(
                        path,
-                        dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder('<'),
-                        mode='r',
+                        dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder(
+                            "<"
+                        ),
+                        mode="r",
                        offset=offset,
-                        shape = (1,metadata['image_height'],metadata['image_width'])
+                        shape=(1, metadata["image_height"], metadata["image_width"]),
                    )
                )

            vol = da.concatenate(slices, axis=0)
-            log.warning('Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()')
+            log.warning(
+                "Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()"
+            )

        else:
            vol, metadata = dxchange.read_txrm(path)
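As the warning above says, the virtual-stack path returns a lazy dask array; a short sketch of pulling a single slice into memory (file name hypothetical):

```python
import qim3d

# Load a reconstructed TXM volume as a virtual (lazy) stack
vol = qim3d.io.load("scan.txm", virtual_stack=True)

# Indexing stays lazy; compute() reads only the requested slab from disk
mid_slice = vol[vol.shape[0] // 2, :, :].compute()
```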
@@ -352,11 +363,11 @@ class DataLoader:
        should_indent = True

-        with open(path, 'r') as f:
+        with open(path, "r") as f:
            for line in f:
                line = line.strip()
                # {NAME} is start of a new object, so should indent
-                if line.startswith('{') and line.endswith('}'):
+                if line.startswith("{") and line.endswith("}"):
                    section_name = line[1:-1]
                    current_section[section_name] = {}
                    section_stack.append(current_section)
@@ -364,7 +375,7 @@ class DataLoader:
                    should_indent = True

                # [NAME] is start of a section, so should not indent
-                elif line.startswith('[') and line.endswith(']'):
+                elif line.startswith("[") and line.endswith("]"):
                    section_name = line[1:-1]

                    if not should_indent:
@@ -377,10 +388,10 @@ class DataLoader:
                    should_indent = False

                # = is a key value pair
-                elif '=' in line:
-                    key, value = line.split('=', 1)
+                elif "=" in line:
+                    key, value = line.split("=", 1)
                    current_section[key.strip()] = value.strip()
-                elif line == '':
+                elif line == "":
                    if len(section_stack) > 1:
                        current_section = section_stack.pop()
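To make the parser's behaviour concrete, here is a sketch of a toy .vgi fragment and the nested dict it would produce (all names and values hypothetical):

```python
# Toy .vgi input for the parser above:
#
#   {volume1}
#   [file1]
#   Name = scan.vol
#   Size = 512 512 1024
#   Datatype = float
#
# Expected parse result, roughly:
meta_data = {
    "volume1": {
        "file1": {
            "Name": "scan.vol",
            "Size": "512 512 1024",
            "Datatype": "float",
        }
    }
}
```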
@@ -404,35 +415,45 @@ class DataLoader:
            path = path.replace(".vol", ".vgi")
            log.warning("Corrected path to .vgi metadata file from .vol file")
        elif path.endswith(".vol") and not os.path.isfile(path.replace(".vol", ".vgi")):
-            raise ValueError(f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}")
+            raise ValueError(
+                f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}"
+            )

        meta_data = self._load_vgi_metadata(path)

        # Extracts relevant information from the metadata
-        file_name = meta_data['volume1']["file1"]["Name"]
-        path = path.rsplit('/', 1)[0]  # Remove characters after the last "/" to be replaced with .vol filename
-        vol_path = os.path.join(path, file_name)  # .vol and .vgi files are assumed to be in the same directory
-        dims = meta_data['volume1']['file1']['Size']
+        file_name = meta_data["volume1"]["file1"]["Name"]
+        path = path.rsplit("/", 1)[
+            0
+        ]  # Remove characters after the last "/" to be replaced with .vol filename
+        vol_path = os.path.join(
+            path, file_name
+        )  # .vol and .vgi files are assumed to be in the same directory
+        dims = meta_data["volume1"]["file1"]["Size"]
        dims = [int(n) for n in dims.split() if n.isdigit()]

-        dt = meta_data['volume1']['file1']['Datatype']
+        dt = meta_data["volume1"]["file1"]["Datatype"]
        match dt:
-            case 'float':
+            case "float":
                dt = np.float32
-            case 'float32':
+            case "float32":
                dt = np.float32
-            case 'uint8':
+            case "uint8":
                dt = np.uint8
-            case 'unsigned integer':
+            case "unsigned integer":
                dt = np.uint16
-            case 'uint16':
+            case "uint16":
                dt = np.uint16
            case _:
                raise ValueError(f"Unsupported data type: {dt}")

-        dims_order = (dims[self.dim_order[0]], dims[self.dim_order[1]], dims[self.dim_order[2]])
+        dims_order = (
+            dims[self.dim_order[0]],
+            dims[self.dim_order[1]],
+            dims[self.dim_order[2]],
+        )
        if self.virtual_stack:
-            vol = np.memmap(vol_path, dtype=dt, mode='r', shape=dims_order)
+            vol = np.memmap(vol_path, dtype=dt, mode="r", shape=dims_order)
        else:
            vol = np.fromfile(vol_path, dtype=dt, count=np.prod(dims))
            vol = np.reshape(vol, dims_order)
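A small sketch of how `dim_order` remaps the sizes parsed from the .vgi `Size` field (values hypothetical):

```python
# "Size" parsed as [x, y, z]; the default dim_order (2, 1, 0) yields (z, y, x)
dims = [512, 512, 1024]  # hypothetical .vgi "Size" values
dim_order = (2, 1, 0)
shape = tuple(dims[i] for i in dim_order)
print(shape)  # (1024, 512, 512), the memmap/reshape target above
```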
@@ -466,11 +487,7 @@ class DataLoader:
                "Please specify a part of the name that is common for the DICOM file stack with the argument 'contains'"
            )

-        dicom_stack = [
-            file
-            for file in os.listdir(path)
-            if self.contains in file
-        ]
+        dicom_stack = [file for file in os.listdir(path) if self.contains in file]
        dicom_stack.sort()  # Ensure proper ordering

        # Check that only one DICOM stack in the directory contains the provided string in its name
@@ -498,6 +515,34 @@ class DataLoader:
        else:
            return vol

+    def check_file_size(self, filename: str):
+        """
+        Checks whether there is enough available memory to load the file.
+
+        Args:
+            filename (str): Specifies path to file
+
+        Raises:
+            MemoryError: If the file size is greater than available memory and
+                self.force_load is False. If self.force_load is True, the error
+                is downgraded to a warning instead and loading is attempted anyway.
+        """
+        if (
+            self.virtual_stack
+        ):  # If virtual_stack is True, data is read from disk, no need to fit it in memory
+            return
+
+        file_size = get_file_size(filename)
+        available_memory = Memory().free
+        if file_size > available_memory:
+            message = f"The file {filename} is {sizeof(file_size)} but only {sizeof(available_memory)} of memory is available."
+            if self.force_load:
+                log.warning(message)
+            else:
+                raise MemoryError(
+                    message + " Set 'force_load=True' to ignore this error."
+                )
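A quick sketch of the two behaviours of the new check (paths hypothetical):

```python
import qim3d

loader = qim3d.io.DataLoader()  # force_load defaults to False
try:
    vol = loader.load("huge_scan.vol")  # hypothetical file larger than free memory
except MemoryError as err:
    print(err)  # message suggests setting force_load=True

# With force_load=True only a warning is logged and loading is attempted
loader = qim3d.io.DataLoader(force_load=True)
vol = loader.load("huge_scan.vol")
```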
    def load(self, path):
        """
@@ -515,6 +560,7 @@ class DataLoader:
        Raises:
            ValueError: If the format is not supported
            ValueError: If the file or directory does not exist.
+            MemoryError: If the file size exceeds available memory and force_load is not set to True. Raised in check_file_size.

        Example:
            loader = qim3d.io.DataLoader()
@@ -527,6 +573,7 @@ class DataLoader:
        # Load a file
        if os.path.isfile(path):
            # Choose the loader based on the file extension
+            self.check_file_size(path)
            if path.endswith(".tif") or path.endswith(".tiff"):
                return self.load_tiff(path)
            elif path.endswith(".h5"):
@@ -548,7 +595,9 @@ class DataLoader:
        # Load a directory
        elif os.path.isdir(path):
            # load tiff stack if folder contains tiff files else load dicom directory
-            if any([f.endswith('.tif') or f.endswith('.tiff') for f in os.listdir(path)]):
+            if any(
+                [f.endswith(".tif") or f.endswith(".tiff") for f in os.listdir(path)]
+            ):
                return self.load_tiff_stack(path)
            else:
                return self.load_dicom_dir(path)
@@ -556,8 +605,8 @@ class DataLoader:
        # Fails
        else:
            # Find the closest matching path to warn the user
-            parent_dir = os.path.dirname(path) or '.'
-            parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ''
+            parent_dir = os.path.dirname(path) or "."
+            parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ""
            valid_paths = [os.path.join(parent_dir, file) for file in parent_files]
            similar_paths = difflib.get_close_matches(path, valid_paths)
            if similar_paths:
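For reference, the suggestion above comes from `difflib.get_close_matches`; a minimal illustration with made-up paths:

```python
import difflib

candidates = ["/data/scan.tif", "/data/notes.txt"]
print(difflib.get_close_matches("/data/scan.tiff", candidates))
# ['/data/scan.tif'], the near-miss the loader would suggest to the user
```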
@@ -573,18 +622,24 @@ def _get_h5_dataset_keys(f):
    f.visit(lambda key: keys.append(key) if isinstance(f[key], h5py.Dataset) else None)
    return keys


def _get_ole_offsets(ole):
    slice_offset = {}
    for stream in ole.listdir():
-        if stream[0].startswith('ImageData'):
+        if stream[0].startswith("ImageData"):
            sid = ole._find(stream)
            direntry = ole.direntries[sid]
            sect_start = direntry.isectStart
            offset = ole.sectorsize * (sect_start + 1)
-            slice_offset[f'{stream[0]}/{stream[1]}']=offset
+            slice_offset[f"{stream[0]}/{stream[1]}"] = offset

    # sort dictionary after natural sorting (https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/)
-    sorted_keys = sorted(slice_offset.keys(),key=lambda string_: [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)])
+    sorted_keys = sorted(
+        slice_offset.keys(),
+        key=lambda string_: [
+            int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)
+        ],
+    )
    slice_offset_sorted = {key: slice_offset[key] for key in sorted_keys}

    return slice_offset_sorted
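The natural-sort key splits each stream name into text and digit runs so numeric parts compare as integers; a quick self-contained demonstration with made-up stream names:

```python
import re

def natural_key(s):
    # Digit runs become ints, so "Image2" sorts before "Image10"
    return [int(p) if p.isdigit() else p for p in re.split(r"(\d+)", s)]

names = ["ImageData1/Image10", "ImageData1/Image2", "ImageData1/Image1"]
print(sorted(names, key=natural_key))
# ['ImageData1/Image1', 'ImageData1/Image2', 'ImageData1/Image10']
```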
@@ -596,6 +651,7 @@ def load(
    dataset_name=None,
    return_metadata=False,
    contains=None,
+    force_load: bool = False,
    dim_order=(2, 1, 0),
    **kwargs,
):
@@ -611,6 +667,8 @@ def load(
        return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5 and TXRM/TXM/XRM files)
        contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
            Default is None.
+        force_load (bool, optional): If the file size exceeds available memory, a MemoryError is raised.
+            If force_load is True, the error is downgraded to a warning and the loader tries to load the file anyway. Default is False.
        dim_order (tuple, optional): The order of the dimensions in the volume for .vol files. Default is (2,1,0) which corresponds to (z,y,x)
        **kwargs: Additional keyword arguments to be passed
            to the DataLoader constructor.
@@ -621,6 +679,9 @@ def load(
        If `virtual_stack=True`, returns `numpy.memmap`, `h5py._hl.dataset.Dataset` or `nibabel.arrayproxy.ArrayProxy` depending on file format
        If `return_metadata=True` and file format is either HDF5, NIfTI or TXRM/TXM/XRM, returns `tuple` (volume, metadata).

+    Raises:
+        MemoryError: If the file size exceeds available memory
+
    Example:
        ```python
        import qim3d
@@ -634,13 +695,13 @@ def load(
        dataset_name=dataset_name,
        return_metadata=return_metadata,
        contains=contains,
+        force_load=force_load,
        dim_order=dim_order,
        **kwargs,
    )

    data = loader.load(path)

    def log_memory_info(data):
        mem = Memory()
        log.info(
@@ -650,16 +711,18 @@ def load(
        mem.report()
    if return_metadata and not isinstance(data, tuple):
-        log.warning('The file format does not contain metadata')
+        log.warning("The file format does not contain metadata")

    if not virtual_stack:
        log_memory_info(data)
    else:
        # Only log if the data type is not exactly np.ndarray, i.e., it is some kind of memmap object
-        if not isinstance( type(data[0]) if isinstance(data,tuple) else type(data), np.ndarray ):
+        if (type(data[0]) if isinstance(data, tuple) else type(data)) is not np.ndarray:
            log.info("Using virtual stack")
        else:
-            log.warning('Virtual stack is not supported for this file format')
+            log.warning("Virtual stack is not supported for this file format")
            log_memory_info(data)

    return data
@@ -695,10 +758,6 @@ class ImgExamples:
        img_examples_path = Path(qim3d.__file__).parents[0] / "img_examples"
        img_paths = list(img_examples_path.glob("*.tif"))

-        img_names = []
-        for path in img_paths:
-            img_names.append(path.stem)
-
-        # Generate loader for each image found
-        for idx, name in enumerate(img_names):
-            exec(f"self.{name} = load(path = img_paths[idx])")
+        # Generate a loader for each image found
+        update_dict = {path.stem: load(path) for path in img_paths}
+        self.__dict__.update(update_dict)
\ No newline at end of file
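The dict-comprehension replacement avoids `exec` and injects each example volume as an attribute named after its file stem; assuming the class stays exposed as `qim3d.io.ImgExamples`, usage would look roughly like:

```python
import qim3d

examples = qim3d.io.ImgExamples()
# Each bundled .tif becomes an attribute named after its file stem, e.g.:
# vol = examples.some_example  # hypothetical stem of a bundled image
```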
@@ -132,5 +132,5 @@ def level(log_level):

# create the logger
log = logging.getLogger("qim3d")
-set_simple_output()
+# set_simple_output()  # TODO: This used to work, but now it gives duplicated messages. Needs to be investigated.
set_level_warning()
@@ -179,6 +179,18 @@ def sizeof(num, suffix="B"):
        num /= 1024.0
    return f"{num:.1f} Y{suffix}"

+
+def get_file_size(filename: str) -> int:
+    """
+    Returns the size of the given file in bytes.
+
+    Args:
+        filename (str): Specifies full path to file
+
+    Returns:
+        size (int): size of file in bytes
+    """
+    return os.path.getsize(filename)
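Combined with `sizeof` above, this gives the human-readable message used by check_file_size; a tiny sketch (path hypothetical):

```python
size_bytes = get_file_size("data/scan.tif")  # hypothetical path
print(sizeof(size_bytes))  # human-readable, e.g. "1.2 GB"
```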
def is_server_running(ip, port):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)