Commit 584ad739 authored by fima :beers:

Merge branch 'memory_warning' into 'main'

File size warning

See merge request !79
parents 8632c599 83e03a11
@@ -27,7 +27,7 @@ from PIL import Image, UnidentifiedImageError
import qim3d
from qim3d.io.logger import log
from qim3d.utils.internal_tools import sizeof, stringify_path
from qim3d.utils.internal_tools import sizeof, stringify_path, get_file_size
from qim3d.utils.system import Memory
@@ -67,12 +67,15 @@ class DataLoader:
return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5, TXRM/TXM/XRM and NIfTI files)
contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
Default is None.
force_load (bool, optional): If False and the file to be loaded exceeds available memory, a MemoryError is raised. If True, the error is
downgraded to a warning and the loader attempts to load the file anyway. Default is False.
dim_order (tuple, optional): The order of the dimensions in the volume. Default is (2,1,0) which corresponds to (z,y,x)
"""
self.virtual_stack = kwargs.get("virtual_stack", False)
self.dataset_name = kwargs.get("dataset_name", None)
self.return_metadata = kwargs.get("return_metadata", False)
self.contains = kwargs.get("contains", None)
self.force_load = kwargs.get("force_load", False)
self.dim_order = kwargs.get("dim_order", (2, 1, 0))
def load_tiff(self, path):
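A minimal usage sketch of these keyword arguments; the file name is hypothetical, and omitted kwargs fall back to the defaults in the `kwargs.get(...)` calls above:

```python
import qim3d

# force_load=True downgrades the out-of-memory error to a warning;
# dim_order controls the axis order used for .vol volumes.
loader = qim3d.io.DataLoader(force_load=True, dim_order=(2, 1, 0))
vol = loader.load("sample_volume.tif")  # hypothetical file
```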
@@ -258,8 +261,10 @@ class DataLoader:
)
if self.virtual_stack:
if not path.endswith('.txm'):
log.warning("Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly")
if not path.endswith(".txm"):
log.warning(
"Virtual stack is only thoroughly tested for reconstructed volumes in TXM format and is thus not guaranteed to load TXRM and XRM files correctly"
)
# Get metadata
ole = olefile.OleFileIO(path)
@@ -268,23 +273,29 @@ class DataLoader:
# Compute data offsets in bytes for each slice
offsets = _get_ole_offsets(ole)
if len(offsets)!=metadata['number_of_images']:
raise ValueError(f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}')
if len(offsets) != metadata["number_of_images"]:
raise ValueError(
f'Metadata is erroneous: number of images {metadata["number_of_images"]} is different from number of data offsets {len(offsets)}'
)
slices = []
for _, offset in offsets.items():
slices.append(
np.memmap(
path,
dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder('<'),
mode='r',
dtype=dxchange.reader._get_ole_data_type(metadata).newbyteorder(
"<"
),
mode="r",
offset=offset,
shape = (1,metadata['image_height'],metadata['image_width'])
shape=(1, metadata["image_height"], metadata["image_width"]),
)
)
vol = da.concatenate(slices, axis=0)
log.warning('Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()')
log.warning(
"Virtual stack volume will be returned as a dask array. To load certain slices into memory, use normal indexing followed by the compute() method, e.g. vol[:,0,:].compute()"
)
else:
vol, metadata = dxchange.read_txrm(path)
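The warning above hints at the intended access pattern; a short sketch of the caller's side, assuming a hypothetical TXM file:

```python
import qim3d

# With virtual_stack=True the TXM volume comes back as a dask array; data is
# only read from disk when .compute() is called on an indexed view.
vol = qim3d.io.load("reconstruction.txm", virtual_stack=True)  # hypothetical file
middle_slice = vol[vol.shape[0] // 2].compute()  # loads a single slice into memory
```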
@@ -352,11 +363,11 @@ class DataLoader:
should_indent = True
with open(path, 'r') as f:
with open(path, "r") as f:
for line in f:
line = line.strip()
# {NAME} is start of a new object, so should indent
if line.startswith('{') and line.endswith('}'):
if line.startswith("{") and line.endswith("}"):
section_name = line[1:-1]
current_section[section_name] = {}
section_stack.append(current_section)
@@ -364,7 +375,7 @@ class DataLoader:
should_indent = True
# [NAME] is start of a section, so should not indent
elif line.startswith('[') and line.endswith(']'):
elif line.startswith("[") and line.endswith("]"):
section_name = line[1:-1]
if not should_indent:
@@ -377,10 +388,10 @@ class DataLoader:
should_indent = False
# = is a key value pair
elif '=' in line:
key, value = line.split('=', 1)
elif "=" in line:
key, value = line.split("=", 1)
current_section[key.strip()] = value.strip()
elif line == '':
elif line == "":
if len(section_stack) > 1:
current_section = section_stack.pop()
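To make the parser's three cases concrete, here is a hypothetical .vgi fragment and the nested dict it would roughly produce:

```python
# {NAME} -> new object (indent), [NAME] -> new section, key = value -> entry
vgi_fragment = """{volume1}
[file1]
Name = scan.vol
Size = 400 400 400
Datatype = float
"""
# _load_vgi_metadata would yield roughly:
# {"volume1": {"file1": {"Name": "scan.vol", "Size": "400 400 400", "Datatype": "float"}}}
```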
@@ -404,35 +415,45 @@ class DataLoader:
path = path.replace(".vol", ".vgi")
log.warning("Corrected path to .vgi metadata file from .vol file")
elif path.endswith(".vol") and not os.path.isfile(path.replace(".vol", ".vgi")):
raise ValueError(f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}")
raise ValueError(
f"Unsupported file format, should point to .vgi metadata file assumed to be in same folder as .vol file: {path}"
)
meta_data = self._load_vgi_metadata(path)
# Extracts relevant information from the metadata
file_name = meta_data['volume1']["file1"]["Name"]
path = path.rsplit('/', 1)[0] # Remove characters after the last "/" to be replaced with .vol filename
vol_path = os.path.join(path, file_name) # .vol and .vgi files are assumed to be in the same directory
dims = meta_data['volume1']['file1']['Size']
file_name = meta_data["volume1"]["file1"]["Name"]
path = path.rsplit("/", 1)[
0
] # Keep only the directory; the .vol filename from the metadata is joined below
vol_path = os.path.join(
path, file_name
) # .vol and .vgi files are assumed to be in the same directory
dims = meta_data["volume1"]["file1"]["Size"]
dims = [int(n) for n in dims.split() if n.isdigit()]
dt = meta_data['volume1']['file1']['Datatype']
dt = meta_data["volume1"]["file1"]["Datatype"]
match dt:
case 'float':
case "float":
dt = np.float32
case 'float32':
case "float32":
dt = np.float32
case 'uint8':
case "uint8":
dt = np.uint8
case 'unsigned integer':
case "unsigned integer":
dt = np.uint16
case 'uint16':
case "uint16":
dt = np.uint16
case _:
raise ValueError(f"Unsupported data type: {dt}")
dims_order = (dims[self.dim_order[0]], dims[self.dim_order[1]], dims[self.dim_order[2]])
dims_order = (
dims[self.dim_order[0]],
dims[self.dim_order[1]],
dims[self.dim_order[2]],
)
if self.virtual_stack:
vol = np.memmap(vol_path, dtype=dt, mode='r', shape=dims_order)
vol = np.memmap(vol_path, dtype=dt, mode="r", shape=dims_order)
else:
vol = np.fromfile(vol_path, dtype=dt, count=np.prod(dims))
vol = np.reshape(vol, dims_order)
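A sketch of the virtual-stack branch above, assuming a hypothetical 400x400x400 uint16 raw volume:

```python
import numpy as np

# virtual_stack=True: nothing is read until a slice is indexed
vol = np.memmap("scan.vol", dtype=np.uint16, mode="r", shape=(400, 400, 400))
print(vol[200].mean())  # touches only one z-slice on disk
```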
@@ -466,11 +487,7 @@ class DataLoader:
"Please specify a part of the name that is common for the DICOM file stack with the argument 'contains'"
)
dicom_stack = [
file
for file in os.listdir(path)
if self.contains in file
]
dicom_stack = [file for file in os.listdir(path) if self.contains in file]
dicom_stack.sort() # Ensure proper ordering
# Check that only one DICOM stack in the directory contains the provided string in its name
@@ -498,6 +515,34 @@ class DataLoader:
else:
return vol
def check_file_size(self, filename: str):
"""
Checks if there is enough memory where the file can be loaded.
Args:
------------
filename: (str) Specifies path to file
force_load: (bool, optional) If true, the memory error will not be raised. Warning will be printed insted and
the loader will attempt to load the file.
Raises:
-----------
MemoryError: If filesize is greater then available memory
"""
if (
self.virtual_stack
): # If virtual_stack is True, data is read lazily from disk, so no memory check is needed
return
file_size = get_file_size(filename)
available_memory = Memory().free
if file_size > available_memory:
message = f"The file {filename} is {sizeof(file_size)} but only {sizeof(available_memory)} of memory is available."
if self.force_load:
log.warning(message)
else:
raise MemoryError(
message + " Set 'force_load=True' to ignore this error."
)
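Caller-side behavior of the new check, sketched under the assumption that the module-level loader is exposed as qim3d.io.load; the path is hypothetical:

```python
import qim3d

try:
    vol = qim3d.io.load("very_large_scan.txm")  # hypothetical oversized file
except MemoryError as err:
    print(err)  # message ends with: "Set 'force_load=True' to ignore this error."
    vol = qim3d.io.load("very_large_scan.txm", force_load=True)  # warns instead
```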
def load(self, path):
"""
@@ -515,6 +560,7 @@ class DataLoader:
Raises:
ValueError: If the format is not supported
ValueError: If the file or directory does not exist.
MemoryError: If the file size exceeds available memory and force_load is not set to True. Raised in check_file_size.
Example:
loader = qim3d.io.DataLoader()
@@ -527,6 +573,7 @@ class DataLoader:
# Load a file
if os.path.isfile(path):
# Choose the loader based on the file extension
self.check_file_size(path)
if path.endswith(".tif") or path.endswith(".tiff"):
return self.load_tiff(path)
elif path.endswith(".h5"):
@@ -548,7 +595,9 @@ class DataLoader:
# Load a directory
elif os.path.isdir(path):
# load tiff stack if folder contains tiff files else load dicom directory
if any([f.endswith('.tif') or f.endswith('.tiff') for f in os.listdir(path)]):
if any(
[f.endswith(".tif") or f.endswith(".tiff") for f in os.listdir(path)]
):
return self.load_tiff_stack(path)
else:
return self.load_dicom_dir(path)
@@ -556,8 +605,8 @@ class DataLoader:
# Fails
else:
# Find the closest matching path to warn the user
parent_dir = os.path.dirname(path) or '.'
parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ''
parent_dir = os.path.dirname(path) or "."
parent_files = os.listdir(parent_dir) if os.path.isdir(parent_dir) else ""
valid_paths = [os.path.join(parent_dir, file) for file in parent_files]
similar_paths = difflib.get_close_matches(path, valid_paths)
if similar_paths:
@@ -573,18 +622,24 @@ def _get_h5_dataset_keys(f):
f.visit(lambda key: keys.append(key) if isinstance(f[key], h5py.Dataset) else None)
return keys
def _get_ole_offsets(ole):
slice_offset = {}
for stream in ole.listdir():
if stream[0].startswith('ImageData'):
if stream[0].startswith("ImageData"):
sid = ole._find(stream)
direntry = ole.direntries[sid]
sect_start = direntry.isectStart
offset = ole.sectorsize * (sect_start + 1)
slice_offset[f'{stream[0]}/{stream[1]}']=offset
slice_offset[f"{stream[0]}/{stream[1]}"] = offset
# sort dictionary after natural sorting (https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/)
sorted_keys = sorted(slice_offset.keys(),key=lambda string_: [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)])
sorted_keys = sorted(
slice_offset.keys(),
key=lambda string_: [
int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)
],
)
slice_offset_sorted = {key: slice_offset[key] for key in sorted_keys}
return slice_offset_sorted
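The natural sort matters because plain lexicographic ordering would interleave slice numbers; a minimal illustration:

```python
import re

keys = ["Image10", "Image2", "Image1"]
print(sorted(keys))  # ['Image1', 'Image10', 'Image2'] - wrong slice order
natural = sorted(
    keys,
    key=lambda s: [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", s)],
)
print(natural)  # ['Image1', 'Image2', 'Image10'] - numeric runs compared as ints
```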
@@ -596,6 +651,7 @@ def load(
dataset_name=None,
return_metadata=False,
contains=None,
force_load: bool = False,
dim_order=(2, 1, 0),
**kwargs,
):
@@ -611,6 +667,8 @@ def load(
return_metadata (bool, optional): Specifies whether to return metadata or not. Default is False (only for HDF5 and TXRM/TXM/XRM files)
contains (str, optional): Specifies a part of the name that is common for the TIFF file stack to be loaded (only for TIFF stacks).
Default is None.
force_load (bool, optional): If False and the file size exceeds available memory, a MemoryError is raised.
If True, the error is downgraded to a warning and the loader attempts to load the file anyway. Default is False.
dim_order (tuple, optional): The order of the dimensions in the volume for .vol files. Default is (2,1,0) which corresponds to (z,y,x)
**kwargs: Additional keyword arguments to be passed
to the DataLoader constructor.
@@ -621,6 +679,9 @@ def load(
If `virtual_stack=True`, returns `numpy.memmap`, `h5py._hl.dataset.Dataset` or `nibabel.arrayproxy.ArrayProxy` depending on file format
If `return_metadata=True` and file format is either HDF5, NIfTI or TXRM/TXM/XRM, returns `tuple` (volume, metadata).
Raises:
MemoryError: If the file size exceeds available memory and force_load is False
Example:
```python
import qim3d
@@ -634,13 +695,13 @@ def load(
dataset_name=dataset_name,
return_metadata=return_metadata,
contains=contains,
force_load=force_load,
dim_order=dim_order,
**kwargs,
)
data = loader.load(path)
def log_memory_info(data):
mem = Memory()
log.info(
@@ -650,16 +711,18 @@ def load(
mem.report()
if return_metadata and not isinstance(data, tuple):
log.warning('The file format does not contain metadata')
log.warning("The file format does not contain metadata")
if not virtual_stack:
log_memory_info(data)
else:
# Only log if file type is not a np.ndarray, i.e., it is some kind of memmap object
if not isinstance( type(data[0]) if isinstance(data,tuple) else type(data), np.ndarray ):
if (
type(data[0]) if isinstance(data, tuple) else type(data)
) is not np.ndarray:
log.info("Using virtual stack")
else:
log.warning('Virtual stack is not supported for this file format')
log.warning("Virtual stack is not supported for this file format")
log_memory_info(data)
return data
@@ -695,10 +758,6 @@ class ImgExamples:
img_examples_path = Path(qim3d.__file__).parents[0] / "img_examples"
img_paths = list(img_examples_path.glob("*.tif"))
img_names = []
for path in img_paths:
img_names.append(path.stem)
# Generate loader for each image found
for idx, name in enumerate(img_names):
exec(f"self.{name} = load(path = img_paths[idx])")
\ No newline at end of file
update_dict = {path.stem : load(path) for path in img_paths}
self.__dict__.update(update_dict)
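With the dict-comprehension rewrite, each bundled .tif becomes an attribute named after its file stem; a usage sketch, where the attribute name below is hypothetical:

```python
import qim3d

examples = qim3d.io.ImgExamples()
vol = examples.bone_128x128x128  # hypothetical stem of a bundled .tif file
```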
@@ -132,5 +132,5 @@ def level(log_level):
# create the logger
log = logging.getLogger("qim3d")
set_simple_output()
# set_simple_output() #TODO: This used to work, but now it gives duplicated messages. Needs to be investigated.
set_level_warning()
@@ -179,6 +191,18 @@ def sizeof(num, suffix="B"):
num /= 1024.0
return f"{num:.1f} Y{suffix}"
def get_file_size(filename: str) -> int:
"""
Returns the size of a file in bytes.
Args:
filename (str): Specifies the full path to the file.
Returns:
int: Size of the file in bytes.
"""
return os.path.getsize(filename)
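A small sketch pairing the new helper with sizeof from above; the path is hypothetical:

```python
size_bytes = get_file_size("/data/scan.tif")  # hypothetical path
print(f"File uses {sizeof(size_bytes)}")      # e.g. "File uses 1.5 GB"
```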
def is_server_running(ip, port):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)