Commit 584ad739 authored by fima

Merge branch 'memory_warning' into 'main'

File size warning

See merge request !79
parents 8632c599 83e03a11
@@ -27,7 +27,7 @@ from PIL import Image, UnidentifiedImageError
import qim3d
from qim3d.io.logger import log
-from qim3d.utils.internal_tools import sizeof, stringify_path
+from qim3d.utils.internal_tools import sizeof, stringify_path, get_file_size
from qim3d.utils.system import Memory
@@ -67,12 +67,15 @@ class DataLoader:
            return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5, TXRM/TXM/XRM and NIfTI files)
            contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
                Default is None.
+            force_load (bool, optional): If False and the file to be loaded exceeds available memory, a MemoryError is raised.
+                If True, the error is downgraded to a warning and the loader attempts to load the file anyway. Default is False.
            dim_order (tuple, optional): The order of the dimensions in the volume. Default is (2,1,0) which corresponds to (z,y,x)
        """
        self.virtual_stack = kwargs.get("virtual_stack", False)
        self.dataset_name = kwargs.get("dataset_name", None)
        self.return_metadata = kwargs.get("return_metadata", False)
        self.contains = kwargs.get("contains", None)
+        self.force_load = kwargs.get("force_load", False)
        self.dim_order = kwargs.get("dim_order", (2, 1, 0))

    def load_tiff(self, path):
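The new `force_load` flag flows through the public `load` wrapper further down in this diff; a minimal usage sketch (the file path is hypothetical):

```python
import qim3d

# With force_load=True, a file larger than the available memory only
# triggers a warning and the loader still attempts to read it.
vol = qim3d.io.load("data/large_scan.tif", force_load=True)
```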
@@ -258,8 +261,10 @@ class DataLoader:
            )

        if self.virtual_stack:
-            if not path.endswith('.txm'):
-                log.warning("Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly")
+            if not path.endswith(".txm"):
+                log.warning(
+                    "Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly"
+                )

            # Get metadata
            ole = olefile.OleFileIO(path)
@@ -268,23 +273,29 @@ class DataLoader:
            # Compute data offsets in bytes for each slice
            offsets = _get_ole_offsets(ole)

-            if len(offsets)!=metadata['number_of_images']:
-                raise ValueError(f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}')
+            if len(offsets) != metadata["number_of_images"]:
+                raise ValueError(
+                    f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}'
+                )

            slices = []
            for _, offset in offsets.items():
                slices.append(
                    np.memmap(
                        path,
-                        dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder('<'),
-                        mode='r',
+                        dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder(
+                            "<"
+                        ),
+                        mode="r",
                        offset=offset,
-                        shape = (1,metadata['image_height'],metadata['image_width'])
+                        shape=(1, metadata["image_height"], metadata["image_width"]),
                    )
                )

            vol = da.concatenate(slices, axis=0)
-            log.warning('Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()')
+            log.warning(
+                "Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()"
+            )

        else:
            vol, metadata = dxchange.read_txrm(path)
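As the warning above says, the virtual-stack path returns a lazy dask array; a short sketch of pulling a single slice into memory (file name hypothetical):

```python
import qim3d

# Load a reconstructed TXM volume as a virtual (lazy) stack
vol = qim3d.io.load("scan.txm", virtual_stack=True)

# Indexing stays lazy; compute() reads only the requested slab from disk
mid_slice = vol[vol.shape[0] // 2, :, :].compute()
```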
@@ -352,11 +363,11 @@ class DataLoader:
        should_indent = True

-        with open(path, 'r') as f:
+        with open(path, "r") as f:
            for line in f:
                line = line.strip()
                # {NAME} is start of a new object, so should indent
-                if line.startswith('{') and line.endswith('}'):
+                if line.startswith("{") and line.endswith("}"):
                    section_name = line[1:-1]
                    current_section[section_name] = {}
                    section_stack.append(current_section)
@@ -364,7 +375,7 @@ class DataLoader:
                    should_indent = True

                # [NAME] is start of a section, so should not indent
-                elif line.startswith('[') and line.endswith(']'):
+                elif line.startswith("[") and line.endswith("]"):
                    section_name = line[1:-1]

                    if not should_indent:
@@ -377,10 +388,10 @@ class DataLoader:
                    should_indent = False

                # = is a key value pair
-                elif '=' in line:
-                    key, value = line.split('=', 1)
+                elif "=" in line:
+                    key, value = line.split("=", 1)
                    current_section[key.strip()] = value.strip()
-                elif line == '':
+                elif line == "":
                    if len(section_stack) > 1:
                        current_section = section_stack.pop()
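To make the parser's behaviour concrete, here is a sketch of a toy .vgi fragment and the nested dict it would produce (all names and values hypothetical):

```python
# Toy .vgi input for the parser above:
#
#   {volume1}
#   [file1]
#   Name = scan.vol
#   Size = 512 512 1024
#   Datatype = float
#
# Expected parse result, roughly:
meta_data = {
    "volume1": {
        "file1": {
            "Name": "scan.vol",
            "Size": "512 512 1024",
            "Datatype": "float",
        }
    }
}
```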
@@ -404,35 +415,45 @@ class DataLoader:
            path = path.replace(".vol", ".vgi")
            log.warning("Corrected path to .vgi metadata file from .vol file")
        elif path.endswith(".vol") and not os.path.isfile(path.replace(".vol", ".vgi")):
-            raise ValueError(f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}")
+            raise ValueError(
+                f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}"
+            )

        meta_data = self._load_vgi_metadata(path)

        # Extracts relevant information from the metadata
-        file_name = meta_data['volume1']["file1"]["Name"]
-        path = path.rsplit('/', 1)[0]  # Remove characters after the last "/" to be replaced with .vol filename
-        vol_path = os.path.join(path, file_name)  # .vol and .vgi files are assumed to be in the same directory
-        dims = meta_data['volume1']['file1']['Size']
+        file_name = meta_data["volume1"]["file1"]["Name"]
+        path = path.rsplit("/", 1)[
+            0
+        ]  # Remove characters after the last "/" to be replaced with .vol filename
+        vol_path = os.path.join(
+            path, file_name
+        )  # .vol and .vgi files are assumed to be in the same directory
+        dims = meta_data["volume1"]["file1"]["Size"]
        dims = [int(n) for n in dims.split() if n.isdigit()]

-        dt = meta_data['volume1']['file1']['Datatype']
+        dt = meta_data["volume1"]["file1"]["Datatype"]
        match dt:
-            case 'float':
+            case "float":
                dt = np.float32
-            case 'float32':
+            case "float32":
                dt = np.float32
-            case 'uint8':
+            case "uint8":
                dt = np.uint8
-            case 'unsigned integer':
+            case "unsigned integer":
                dt = np.uint16
-            case 'uint16':
+            case "uint16":
                dt = np.uint16
            case _:
                raise ValueError(f"Unsupported data type: {dt}")

-        dims_order = (dims[self.dim_order[0]], dims[self.dim_order[1]], dims[self.dim_order[2]])
+        dims_order = (
+            dims[self.dim_order[0]],
+            dims[self.dim_order[1]],
+            dims[self.dim_order[2]],
+        )
        if self.virtual_stack:
-            vol = np.memmap(vol_path, dtype=dt, mode='r', shape=dims_order)
+            vol = np.memmap(vol_path, dtype=dt, mode="r", shape=dims_order)
        else:
            vol = np.fromfile(vol_path, dtype=dt, count=np.prod(dims))
            vol = np.reshape(vol, dims_order)
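A small sketch of how `dim_order` remaps the sizes parsed from the .vgi `Size` field (values hypothetical):

```python
# "Size" parsed as [x, y, z]; the default dim_order (2, 1, 0) yields (z, y, x)
dims = [512, 512, 1024]  # hypothetical .vgi "Size" values
dim_order = (2, 1, 0)
shape = tuple(dims[i] for i in dim_order)
print(shape)  # (1024, 512, 512), the memmap/reshape target above
```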
@@ -466,11 +487,7 @@ class DataLoader:
                "Please specify a part of the name that is common for the DICOM file stack with the argument 'contains'"
            )

-        dicom_stack = [
-            file
-            for file in os.listdir(path)
-            if self.contains in file
-        ]
+        dicom_stack = [file for file in os.listdir(path) if self.contains in file]
        dicom_stack.sort()  # Ensure proper ordering

        # Check that only one DICOM stack in the directory contains the provided string in its name
@@ -498,6 +515,34 @@ class DataLoader:
        else:
            return vol

+    def check_file_size(self, filename: str):
+        """
+        Checks whether there is enough available memory to load the file.
+
+        Args:
+            filename (str): Specifies path to file
+
+        Raises:
+            MemoryError: If the file size is greater than available memory and
+                self.force_load is False. If self.force_load is True, the error
+                is downgraded to a warning instead and loading is attempted anyway.
+        """
+        if (
+            self.virtual_stack
+        ):  # If virtual_stack is True, data is read from disk, no need to fit it in memory
+            return
+
+        file_size = get_file_size(filename)
+        available_memory = Memory().free
+        if file_size > available_memory:
+            message = f"The file {filename} is {sizeof(file_size)} but only {sizeof(available_memory)} of memory is available."
+            if self.force_load:
+                log.warning(message)
+            else:
+                raise MemoryError(
+                    message + " Set 'force_load=True' to ignore this error."
+                )
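A quick sketch of the two behaviours of the new check (paths hypothetical):

```python
import qim3d

loader = qim3d.io.DataLoader()  # force_load defaults to False
try:
    vol = loader.load("huge_scan.vol")  # hypothetical file larger than free memory
except MemoryError as err:
    print(err)  # message suggests setting force_load=True

# With force_load=True only a warning is logged and loading is attempted
loader = qim3d.io.DataLoader(force_load=True)
vol = loader.load("huge_scan.vol")
```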
    def load(self, path):
        """
@@ -515,6 +560,7 @@ class DataLoader:
        Raises:
            ValueError: If the format is not supported
            ValueError: If the file or directory does not exist.
+            MemoryError: If the file size exceeds available memory and force_load is not set to True. Raised in check_file_size.

        Example:
            loader = qim3d.io.DataLoader()
@@ -527,6 +573,7 @@ class DataLoader:
        # Load a file
        if os.path.isfile(path):
            # Choose the loader based on the file extension
+            self.check_file_size(path)
            if path.endswith(".tif") or path.endswith(".tiff"):
                return self.load_tiff(path)
            elif path.endswith(".h5"):
@@ -548,7 +595,9 @@ class DataLoader:
        # Load a directory
        elif os.path.isdir(path):
            # load tiff stack if folder contains tiff files else load dicom directory
-            if any([f.endswith('.tif') or f.endswith('.tiff') for f in os.listdir(path)]):
+            if any(
+                [f.endswith(".tif") or f.endswith(".tiff") for f in os.listdir(path)]
+            ):
                return self.load_tiff_stack(path)
            else:
                return self.load_dicom_dir(path)
@@ -556,8 +605,8 @@ class DataLoader:
        # Fails
        else:
            # Find the closest matching path to warn the user
-            parent_dir = os.path.dirname(path) or '.'
-            parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ''
+            parent_dir = os.path.dirname(path) or "."
+            parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ""
            valid_paths = [os.path.join(parent_dir, file) for file in parent_files]
            similar_paths = difflib.get_close_matches(path, valid_paths)
            if similar_paths:
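For reference, the suggestion above comes from `difflib.get_close_matches`; a minimal illustration with made-up paths:

```python
import difflib

candidates = ["/data/scan.tif", "/data/notes.txt"]
print(difflib.get_close_matches("/data/scan.tiff", candidates))
# ['/data/scan.tif'], the near-miss the loader would suggest to the user
```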
@@ -573,18 +622,24 @@ def _get_h5_dataset_keys(f):
    f.visit(lambda key: keys.append(key) if isinstance(f[key], h5py.Dataset) else None)
    return keys


def _get_ole_offsets(ole):
    slice_offset = {}
    for stream in ole.listdir():
-        if stream[0].startswith('ImageData'):
+        if stream[0].startswith("ImageData"):
            sid = ole._find(stream)
            direntry = ole.direntries[sid]
            sect_start = direntry.isectStart
            offset = ole.sectorsize * (sect_start + 1)
-            slice_offset[f'{stream[0]}/{stream[1]}']=offset
+            slice_offset[f"{stream[0]}/{stream[1]}"] = offset

    # sort dictionary after natural sorting (https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/)
-    sorted_keys = sorted(slice_offset.keys(),key=lambda string_: [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)])
+    sorted_keys = sorted(
+        slice_offset.keys(),
+        key=lambda string_: [
+            int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)
+        ],
+    )
    slice_offset_sorted = {key: slice_offset[key] for key in sorted_keys}

    return slice_offset_sorted
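The natural-sort key splits each stream name into text and digit runs so numeric parts compare as integers; a quick self-contained demonstration with made-up stream names:

```python
import re

def natural_key(s):
    # Digit runs become ints, so "Image2" sorts before "Image10"
    return [int(p) if p.isdigit() else p for p in re.split(r"(\d+)", s)]

names = ["ImageData1/Image10", "ImageData1/Image2", "ImageData1/Image1"]
print(sorted(names, key=natural_key))
# ['ImageData1/Image1', 'ImageData1/Image2', 'ImageData1/Image10']
```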
@@ -596,6 +651,7 @@ def load(
    dataset_name=None,
    return_metadata=False,
    contains=None,
+    force_load: bool = False,
    dim_order=(2, 1, 0),
    **kwargs,
):
@@ -611,6 +667,8 @@ def load(
        return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5 and TXRM/TXM/XRM files)
        contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
            Default is None.
+        force_load (bool, optional): If the file size exceeds available memory, a MemoryError is raised.
+            If force_load is True, the error is downgraded to a warning and the loader tries to load the file anyway. Default is False.
        dim_order (tuple, optional): The order of the dimensions in the volume for .vol files. Default is (2,1,0) which corresponds to (z,y,x)
        **kwargs: Additional keyword arguments to be passed
            to the DataLoader constructor.
@@ -621,6 +679,9 @@ def load(
        If `virtual_stack=True`, returns `numpy.memmap`, `h5py._hl.dataset.Dataset` or `nibabel.arrayproxy.ArrayProxy` depending on file format
        If `return_metadata=True` and file format is either HDF5, NIfTI or TXRM/TXM/XRM, returns `tuple` (volume, metadata).

+    Raises:
+        MemoryError: If the file size exceeds available memory
+
    Example:
        ```python
        import qim3d
@@ -634,13 +695,13 @@ def load(
        dataset_name=dataset_name,
        return_metadata=return_metadata,
        contains=contains,
+        force_load=force_load,
        dim_order=dim_order,
        **kwargs,
    )

    data = loader.load(path)

    def log_memory_info(data):
        mem = Memory()
        log.info(
@@ -650,16 +711,18 @@ def load(
        mem.report()
    if return_metadata and not isinstance(data, tuple):
-        log.warning('The file format does not contain metadata')
+        log.warning("The file format does not contain metadata")

    if not virtual_stack:
        log_memory_info(data)
    else:
        # Only log if the data type is not exactly np.ndarray, i.e., it is some kind of memmap object
-        if not isinstance( type(data[0]) if isinstance(data,tuple) else type(data), np.ndarray ):
+        if (type(data[0]) if isinstance(data, tuple) else type(data)) is not np.ndarray:
            log.info("Using virtual stack")
        else:
-            log.warning('Virtual stack is not supported for this file format')
+            log.warning("Virtual stack is not supported for this file format")
            log_memory_info(data)

    return data
@@ -695,10 +758,6 @@ class ImgExamples:
        img_examples_path = Path(qim3d.__file__).parents[0] / "img_examples"
        img_paths = list(img_examples_path.glob("*.tif"))

-        img_names = []
-        for path in img_paths:
-            img_names.append(path.stem)
-
-        # Generate loader for each image found
-        for idx, name in enumerate(img_names):
-            exec(f"self.{name} = load(path = img_paths[idx])")
+        # Generate a loader for each image found
+        update_dict = {path.stem: load(path) for path in img_paths}
+        self.__dict__.update(update_dict)
\ No newline at end of file
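The dict-comprehension replacement avoids `exec` and injects each example volume as an attribute named after its file stem; assuming the class stays exposed as `qim3d.io.ImgExamples`, usage would look roughly like:

```python
import qim3d

examples = qim3d.io.ImgExamples()
# Each bundled .tif becomes an attribute named after its file stem, e.g.:
# vol = examples.some_example  # hypothetical stem of a bundled image
```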
@@ -132,5 +132,5 @@ def level(log_level):

# create the logger
log = logging.getLogger("qim3d")
-set_simple_output()
+# set_simple_output()  # TODO: This used to work, but now it gives duplicated messages. Needs to be investigated.
set_level_warning()
@@ -179,6 +179,18 @@ def sizeof(num, suffix="B"):
        num /= 1024.0
    return f"{num:.1f} Y{suffix}"

+
+def get_file_size(filename: str) -> int:
+    """
+    Returns the size of the given file in bytes.
+
+    Args:
+        filename (str): Specifies full path to file
+
+    Returns:
+        size (int): size of file in bytes
+    """
+    return os.path.getsize(filename)
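Combined with `sizeof` above, this gives the human-readable message used by check_file_size; a tiny sketch (path hypothetical):

```python
size_bytes = get_file_size("data/scan.tif")  # hypothetical path
print(sizeof(size_bytes))  # human-readable, e.g. "1.2 GB"
```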
def is_server_running(ip, port):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)