# coding: utf-8
# /*##########################################################################
#
# Copyright (c) 2016-2017 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ###########################################################################*/


__authors__ = ["J. Garriga"]
__license__ = "MIT"
__date__ = "09/07/2021"

import copy
import glob
import logging
import multiprocessing
import os
import threading
from enum import IntEnum
from functools import partial
from multiprocessing import Pool

import h5py
import numpy

import fabio
from silx.io import utils
from silx.io import fabioh5
from silx.io.url import DataUrl

from darfix.core.dimension import AcquisitionDims, Dimension, POSITIONER_METADATA
from darfix.core.imageOperations import img2img_mean, chunk_image
from darfix.core.imageOperations import background_subtraction, background_subtraction_2D
from darfix.core.imageOperations import hot_pixel_removal_3D, hot_pixel_removal_2D
from darfix.core.imageOperations import threshold_removal
from darfix.core.imageOperations import Method
from darfix.core.imageRegistration import shift_detection, apply_shift
from darfix.core.mapping import fit_data, compute_moments, compute_rsm, compute_magnification
from darfix.core.roi import apply_2D_ROI, apply_3D_ROI
from darfix.core.utils import wrapTo2pi
from darfix.decomposition.ipca import IPCA
from darfix.decomposition.nmf import NMF
from darfix.decomposition.nica import NICA
from darfix.io import utils as io_utils


_logger = logging.getLogger(__file__)


class Operation(IntEnum):
    """
    Flags for different operations in Dataset.
    """
    PARTITION = 0
    BS = 1
    HP = 2
    THRESHOLD = 3
    SHIFT = 4
    ROI = 5
    MOMENTS = 6
    FIT = 7


class Dataset():
    """Class to define a dataset from a series of data files.

    :param _dir: Global directory to use and save all the data in the different
        operations.
    :type _dir: str
    :param data: If not None, sets the Data array with the data of the dataset
        to use, default None
    :type data: :class:`Data`, optional
    :param first_filename: First file of the series, used to open the series
        with fabio, defaults to None.
    :type first_filename: Union[None, str], optional
    :param filenames: Ordered list of filenames, defaults to None.
    :type filenames: Union[Generator, Iterator, List], optional
    :param dims: Dimensions of the dataset, defaults to None.
    :type dims: Union[None, AcquisitionDims], optional
    :param transformation: Axes transformation associated to the data
        (one array per axis), defaults to None.
    :type transformation: Union[None, array_like], optional
    :param in_memory: If True, data is loaded into memory, else, data is read in
        chunks depending on the algorithm to apply, defaults to True.
    :type in_memory: bool, optional
    :param copy_files: If True, the results of the operations are saved into a
        "treated" subdirectory of `_dir`, defaults to False.
    :type copy_files: bool, optional
    """

    def __init__(self, _dir, data=None, first_filename=None, filenames=None,
                 dims=None, transformation=None, in_memory=True, copy_files=False):

        self._data = None
        self._frames_intensity = []
        self._lock = threading.Lock()
        self.running_data = None
        self.moments_dims = {}
        self.operations_state = numpy.zeros(len(Operation))
        self._dir = _dir
        self._transformation = transformation

        if copy_files:
            self._dir += "/treated"
            if not os.path.isdir(self._dir):
                try:
                    os.mkdir(self._dir)
                except PermissionError:
                    raise PermissionError("Add a directory in the Treated Data tab with WRITE permission")

        if dims is None:
            self.__dims = AcquisitionDims()
        else:
            assert isinstance(dims, AcquisitionDims), "Attribute dims has to be of class AcquisitionDims"
            self.__dims = dims
        # Keys: dimensions names, values: dimensions values
        self._dimensions_values = {}
        self._in_memory = in_memory

        if data is not None:
            self._data = data
        else:
            if filenames is None and first_filename is None:
                filenames = sorted([x for x in glob.glob(_dir + "/*") if os.path.isfile(x)])
            self.filenames = filenames
            self.first_filename = first_filename

            metadata = []
            data_urls = []

            with fabio.open_series(filenames=filenames, first_filename=first_filename) as series:
                for frame in series.frames():
                    filename = frame.file_container.filename
                    data_urls.append(DataUrl(file_path=filename,
                                             scheme='fabio'))
                    fabio_reader = fabioh5.EdfFabioReader(file_name=filename)
                    metadata.append(fabio_reader)
                    fabio_reader.close()

            self._data = Data(numpy.array(data_urls), metadata=metadata, in_memory=self._in_memory)
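
    # Hypothetical construction sketch (path and filename are illustrative only):
    #     dataset = Dataset("/path/to/scan", first_filename="scan_0000.edf",
    #                       in_memory=False)
    #     print(dataset.nframes)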

    def stop_operation(self, operation):
        """
        Method used for cases where threads are created to apply functions to the dataset.
        If this method is called, the flag concerning the stop is set to 0 so that if the
        concerned operation is running in another thread it knows to stop.

        :param operation: operation to stop
        :type operation: Union[int, `Operation`]
        """
        if self.operations_state[operation]:
            self._lock.acquire()
            self.operations_state[operation] = 0
            self._lock.release()
        if self.running_data is not None:
            self.running_data.stop_operation(operation)

    @property
    def transformation(self):
        return self._transformation

    @transformation.setter
    def transformation(self, value):
        self._transformation = value

    @property
    def dir(self):
        return self._dir

    @property
    def treated(self):
        return self._treated

    def compute_frames_intensity(self, kernel=(3, 3), sigma=20):
        """
        Returns for every image a number representing its intensity. This number
        is obtained by first blurring the image and then computing its variance.
        """
        import cv2

        _logger.info("Computing intensity per frame")
        io_utils.advancement_display(0, self.nframes, "Computing intensity")
        frames_intensity = []
        self._lock.acquire()
        self.operations_state[Operation.PARTITION] = 1
        self._lock.release()
        for i in range(self.nframes):
            if not self.operations_state[Operation.PARTITION]:
                return
            frames_intensity += \
                [cv2.GaussianBlur(self.get_data(i), kernel, sigma).var()]
            io_utils.advancement_display(i + 1, self.nframes, "Computing intensity")
        self._frames_intensity = frames_intensity
        self._lock.acquire()
        self.operations_state[Operation.PARTITION] = 0
        self._lock.release()
        return self._frames_intensity
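
    # Sketch of the intensity measure used above (cv2 is imported in the method):
    #     blurred = cv2.GaussianBlur(frame, (3, 3), 20)
    #     intensity = blurred.var()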

    def partition_by_intensity(self, bins=None, num_bins=1):
        """
        Partitions the frames of the dataset according to their intensity.
        The partition is computed as follows:
        -- First, the intensity of each frame is computed as the variance after
        passing a gaussian filter.
        -- Second, the histogram of the intensities is computed.
        -- Finally, the frames are split at a threshold, set to be the value of
        the `num_bins`-th bin edge of the histogram.

        :param bins: Bins to use for the histogram. If None, `nframes` bins are used.
        :type bins: Union[None, int, array_like]
        :param int num_bins: Number of the bin edge to use as threshold.
        :returns: Tuple with the indices of the frames above and below the threshold.
        """
        frames_intensity = self._frames_intensity if self._frames_intensity else self.compute_frames_intensity()
        if frames_intensity is None:
            return
        values, bins = numpy.histogram(frames_intensity, self.nframes if bins is None else bins)
        threshold = numpy.array(frames_intensity) >= bins[num_bins]
        return numpy.flatnonzero(threshold), numpy.flatnonzero(~threshold)
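
    # Hypothetical usage sketch (the bin index is illustrative):
    #     high_idx, low_idx = dataset.partition_by_intensity(num_bins=1)
    #     bright_frames = dataset.get_data(high_idx)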

    @property
    def in_memory(self):
        return self._in_memory

    @in_memory.setter
    def in_memory(self, in_memory):
        """
        Sets whether the data is kept in memory or read from disk.
        When the value changes, the data is reloaded accordingly.
        """
        if self._in_memory is not in_memory:
            self._in_memory = in_memory
            self._data = Data(self.data.urls, self.data.metadata, self._in_memory)

    @property
    def data(self):
        return self._data

    def get_data(self, indices=None, dimension=None, return_indices=False):
        """
        Returns the data corresponding to certain indices and given some dimension values.
        The data is always flattened to be a stack of images.

        :param array_like indices: If not None, data is filtered using this array.
        :param array_like dimension: If not None, return only the data corresponding to
            the given dimension. Dimension is a 2d vector, where the first component is
            a list of the axes and the second is a list of the indices of the values to extract.
            The dimension and value list length can be up to the number of dimensions - 1.
            The call get_data(dimension=[[1, 2], [2, 3]]) is equivalent to data[:, 2, 3] when data
            is in memory.
            The axes are numbered so that the lower the axis, the faster the dimension
            changes along the data (the higher its changing value).

        :return: Array with the new data.
        """
        if dimension is not None and len(self._data.shape) > 3:
            # Make sure dimension and value are lists
            if type(dimension[0]) is int:
                dimension[0] = [dimension[0]]
                dimension[1] = [dimension[1]]
            data = self.data
            indices = numpy.arange(self.nframes) if indices is None else indices
            # Init list of bool indices
            bool_indices = numpy.zeros(self.data.flatten().shape[:-2], dtype=bool)
            bool_indices[indices] = True
            bool_indices = bool_indices.reshape(self.data.shape[:-2])
            indx = numpy.arange(self.nframes).reshape(self.data.shape[:-2])
            # For every axis, get corresponding elements
            for i, dim in enumerate(sorted(dimension[0])):
                # Flip axis to be consistent with the data shape
                axis = self.dims.ndim - dim - 1
                data = data.take(indices=dimension[1][i], axis=axis)
                bool_indices = bool_indices.take(indices=dimension[1][i], axis=axis)
                indx = indx.take(indices=dimension[1][i], axis=axis)
            data = data[bool_indices]
            indx = indx[bool_indices]
            if return_indices:
                return data.flatten(), indx.flatten()
            return data.flatten()
        else:
            data = self.data.flatten()
            return data if indices is None else data[indices]
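
    # Hypothetical usage sketch: on a reshaped dataset, filter by dimension values.
    #     stack = dataset.get_data(dimension=[[1, 2], [2, 3]])
    #     # equivalent to data[:, 2, 3] when the data is in memory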

    @property
    def nframes(self):
        """
        Return number of frames
        """
        if self.data is None:
            return 0
        else:
            return len(self.data.flatten())

    def to_memory(self, indices):
        """
        Method to load only part of the data into memory.
        Returns a new dataset with the data corresponding to the given indices loaded
        into memory. If all the data has to be loaded, set `in_memory` to True instead:
        this way no new dataset is created.

        :param array_like indices: Indices of the new dataset.
        """
        if not self._in_memory:
            data = self.get_data(indices)
            new_data = Data(data.urls, data.metadata, True)
        else:
            new_data = self.get_data(indices)
        return Dataset(_dir=self._dir, data=new_data, dims=self.__dims, in_memory=True)
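
    # Hypothetical usage sketch (indices are illustrative):
    #     subset = dataset.to_memory(indices=numpy.arange(100))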

    @property
    def dims(self):
        return self.__dims

    @dims.setter
    def dims(self, _dims):
        assert isinstance(_dims, AcquisitionDims), "Dimensions dictionary has " \
            "to be of class `AcquisitionDims`"
        self.__dims = _dims

    def clear_dims(self):
        self.__dims = AcquisitionDims()

    def add_dim(self, axis, dim):
        """
        Adds a dimension to the dimension's dictionary.

        :param int axis: axis of the dimension.
        :param `Dimension` dim: dimension to be added.
        """
        self.__dims.add_dim(axis, dim)

    def remove_dim(self, axis):
        """
        Removes a dimension from the dimension's dictionary.

        :param int axis: axis of the dimension.
        """
        self.__dims.remove_dim(axis)

    def zsum(self, indices=None, dimension=None):
        data = self.get_data(indices, dimension)
        return data.sum(axis=0)

    def reshape_data(self):
        """
        Function that reshapes the data to fit the dimensions.
        """
        if self.__dims.ndim == 1 and self.__dims.get(0).size == self.nframes:
            return self
        elif self.__dims.ndim > 1:
            try:
                shape = list(self.__dims.shape)
                shape.append(self.get_data().shape[-2])
                shape.append(self.get_data().shape[-1])
                data = self.get_data().reshape(shape)
                return Dataset(_dir=self.dir, data=data, dims=self.__dims, in_memory=self._in_memory)
            except Exception:
                raise ValueError("Failed to reshape data into dimensions {}. "
                                 "Try using another tolerance value.".format(' '.join(self.__dims.get_names())))
        else:
            raise ValueError("Not enough dimensions were found")

    def find_dimensions(self, kind, tolerance=1e-9):
        """
        Goes over all the headers from a given kind and finds the dimensions
        that move (have more than one value) along the data.

        Note: before, only the dimensions that could fit were shown; now all the
        dimensions are shown and the user chooses the valid ones.

        :param int kind: Type of metadata to find the dimensions.
        :param float tolerance: Tolerance that will be used to compute the
            unique values.
        """
        self.__dims.clear()
        self._dimensions_values = {}

        keys = numpy.array(list(self.data.metadata[0].get_keys(kind)))
        values = numpy.array([[data.get_value(kind=kind, name=key)[0] for data
                             in self.data.metadata] for key in keys])
        # Unique values for each key.
        unique_values = [numpy.unique(value, return_counts=True) for value in values]
        dimensions = []
        dataset_size = len(self.data.metadata)
        # For every key that has more than one different value, create a new Dimension.
        for i, value in enumerate(unique_values):
            if value[1][0] != dataset_size:
                dimension = Dimension(kind, keys[i], tolerance=tolerance)
                dimension.set_unique_values(numpy.unique(value[0]))
                # Value that tells how fast the dimension changes value. It is used
                # to know the order of the reshaping.
                dimension.changing_value = self.__compute_changing_value(values[i])
                dimensions.append(dimension)

        for dimension in sorted(dimensions, key=lambda x: x.changing_value, reverse=True):
            self.__dims.add_dim(axis=self.__dims.ndim, dim=dimension)
            _logger.info("Dimension {} of size {} has been added for reshaping"
                         .format(dimension.name, dimension.size))
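
    # Hypothetical usage sketch (tolerance is illustrative):
    #     dataset.find_dimensions(POSITIONER_METADATA, tolerance=1e-5)
    #     reshaped = dataset.reshape_data()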

    def get_metadata_values(self, kind, key):
        return numpy.unique([data.get_value(kind=kind, name=key)[0] for data
                             in self.get_data().metadata])

    def __compute_changing_value(self, values, changing_value=1):
        """
        Recursive method used to calculate how fast a dimension changes. The speed
        of a dimension is the number of times the dimension changes value while
        moving through the dataset.
        """
        if len(numpy.unique(values)) > 1:
            return self.__compute_changing_value(values[:int(len(values) / 2)], changing_value + 1)
        else:
            return changing_value

    def get_dimensions_values(self, indices=None):
        """
        Returns all the metadata values of the dimensions.
        The values are assumed to be numbers.

        :returns: array_like
        """
        if not self._dimensions_values:
            data = self.get_data(indices).metadata
            for dimension in self.__dims:
                values = numpy.empty((len(data)))
                for row, metadata_frame in enumerate(data):
                    values[row] = (metadata_frame.get_value(kind=dimension[1].kind,
                                   name=dimension[1].name)[0])
                self._dimensions_values[dimension[1].name] = values
        return self._dimensions_values

    def compute_mosaicity_colorkey(self, scale=100):
        """
        Computes a mosaicity colorkey from the dimensions, and returns also the
        orientation distribution image.
        """
        ori_dist = self.get_data().sum(axis=1)

        if self.__dims:
            new_shape = []
            dims_list = []
            for axis, dim in self.__dims:
                dims_list += [numpy.linspace(-1, 1, (dim.size - 1) * scale)]
                new_shape += [dim.size]

            dims_list.reverse()
            mesh = numpy.meshgrid(*dims_list)
            if len(mesh) == 2:
                data = numpy.arctan2(mesh[0], mesh[1])
                normalized_data = wrapTo2pi(data) / numpy.pi / 2
                sqr = numpy.sqrt(numpy.power(
                    mesh[0], 2) + numpy.power(mesh[1], 2)) / numpy.sqrt(2)
                hsv_key = numpy.stack((normalized_data, sqr, numpy.ones(
                    (len(dims_list[1]), len(dims_list[0])))), axis=2)
                return ori_dist.reshape(numpy.flip(new_shape)).T, hsv_key
        return None, None
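
    # Sketch of the colorkey built above: hue encodes atan2(y, x) wrapped to
    # [0, 2*pi), saturation encodes the normalized radius, value is constant at 1.
    #     ori_dist, hsv_key = dataset.compute_mosaicity_colorkey(scale=100)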

    def apply_background_subtraction(self, background=None, method="median", indices=None,
                                     step=None, chunk_shape=[100, 100], _dir=None):
        """
        Applies background subtraction to the data and saves the new data
        into disk.

        :param background: Data to be used as background. If None, data with indices `indices` is used.
            If Dataset, data of the dataset is used. If array, use data with indices in the array.
        :type background: Union[None, array_like, Dataset]
        :param method: Method to use to compute the background.
        :type method: Method
        :param indices: Indices of the images to apply background subtraction.
            If None, the background subtraction is applied to all the data.
        :type indices: Union[None, array_like]
        :param int step: Distance between images to be used when computing the median.
            Parameter used only when flag in_memory is False and method is `Method.median`.
            If `step` is not None, all images with distance of `step`, starting at 0,
            will be loaded into memory for computing the median. If the data loading
            throws a MemoryError, the median computation is retried with `step += 1`.
        :param chunk_shape: Shape of the chunk image to use per iteration.
            Parameter used only when flag in_memory is False and method is `Method.median`.
        :type chunk_shape: array_like
        :returns: dataset with data of same size as `self.data` but with the
            modified images. The urls of the modified images are replaced with
            the new urls.
        :rtype: Dataset
        """

        _dir = self.dir if _dir is None else _dir

        if not os.path.isdir(_dir):
            os.mkdir(_dir)

        method = Method.from_value(method)

        self.running_data = self.get_data(indices)

        self._lock.acquire()
        self.operations_state[Operation.BS] = 1
        self._lock.release()

        if background is None:
            bg_data = self.running_data
            if indices is None:
                _logger.info("Computing background from " + method.name + " of raw data")
            else:
                _logger.info("Computing background from " + method.name + " of high intensity data")
        elif isinstance(background, Dataset):
            bg_data = background.data
            _logger.info("Computing background from " + method.name + " of `background` set")
        else:
            bg_data = self.get_data(background)
            _logger.info("Computing background from " + method.name + " of low intensity data")

        if self._in_memory:
            new_data = background_subtraction(self.running_data, bg_data, method).view(Data)
            new_data.save(_dir + "/data.hdf5")
            urls = new_data.urls
        else:
            bg = numpy.zeros(self.running_data[0].shape, self.running_data.dtype)
            if method == Method.mean:
                if bg_data.in_memory:
                    numpy.mean(bg_data, out=bg, axis=0)
                else:
                    io_utils.advancement_display(0, len(bg_data), "Computing mean image")
                    for i in range(len(bg_data)):
                        if not self.operations_state[Operation.BS]:
                            return
                        bg = img2img_mean(bg_data[i], bg, i)
                        io_utils.advancement_display(i + 1, len(bg_data), "Computing mean image")
            elif method == Method.median:
                if bg_data.in_memory:
                    numpy.median(bg_data, out=bg, axis=0)
                else:
                    if step is not None:
                        bg_indices = numpy.arange(0, len(bg_data), step)
                        try:
                            numpy.median(
                                Data(bg_data.urls[bg_indices], bg_data.metadata[bg_indices]),
                                out=bg, axis=0)
                        except MemoryError:
                            if not self.operations_state[Operation.BS]:
                                return
                            print("MemoryError, trying with step {}".format(step + 1))
                            return self.apply_background_subtraction(background, method, indices, step + 1)
                    else:
                        start = [0, 0]
                        img = self.running_data[0]
                        bg = numpy.empty(img.shape, img.dtype)
                        chunks_1 = int(numpy.ceil(img.shape[0] / chunk_shape[0]))
                        chunks_2 = int(numpy.ceil(img.shape[1] / chunk_shape[1]))
                        io_utils.advancement_display(0, chunks_1 * chunks_2 * len(bg_data), "Computing median image")
                        for i in range(chunks_1):
                            for j in range(chunks_2):
                                if not self.operations_state[Operation.BS]:
                                    return
                                c_images = []
                                cpus = multiprocessing.cpu_count()
                                with Pool(cpus - 1) as p:
                                    c_images = p.map(partial(chunk_image, start, chunk_shape), bg_data)
                                io_utils.advancement_display(
                                    i * chunks_2 + j + 1,
                                    chunks_1 * chunks_2,
                                    "Computing median image")
                                numpy.median(c_images, out=bg[start[0]: start[0] + chunk_shape[0],
                                             start[1]:start[1] + chunk_shape[1]], axis=0)
                                start[1] = chunk_shape[1] * (j + 1)
                            start[0] = chunk_shape[0] * (i + 1)
                            start[1] = 0

            if not self.operations_state[Operation.BS]:
                return

            urls = self.running_data.apply_funcs([(background_subtraction_2D, [bg])],
                                                 save=_dir + "/data.hdf5", text="Applying background subtraction",
                                                 operation=Operation.BS)
            if urls is None:
                return

        self._lock.acquire()
        self.operations_state[Operation.BS] = 0
        self._lock.release()

        # Set urls to the shape and dimensions of the original urls.
        if indices is not None:
            new_urls = numpy.array(self.get_data().urls, dtype=object)
            new_urls[indices] = urls
            new_data = Data(new_urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)
        else:
            new_data = Data(urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)

        return Dataset(_dir=_dir, data=new_data, dims=self.__dims, transformation=self.transformation,
                       in_memory=self._in_memory)
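
    # Hypothetical usage sketch (`dark_dataset` is an assumed Dataset of dark frames):
    #     clean = dataset.apply_background_subtraction(method="median")
    #     clean = dataset.apply_background_subtraction(background=dark_dataset, method="mean")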

    def apply_hot_pixel_removal(self, kernel=3, indices=None, _dir=None):
        """
        Applies hot pixel removal to Data, and saves the new data
        into disk.

        :param int kernel: size of the kernel used to find the hot
            pixels.
        :param indices: Indices of the images to apply hot pixel removal.
            If None, the hot pixel removal is applied to all the data.
        :type indices: Union[None, array_like]
        :return: dataset with data of same size as `self.data` but with the
            modified images. The urls of the modified images are replaced with
            the new urls.
        :rtype: Dataset
        """

        self.running_data = self.get_data(indices)

        _dir = self.dir if _dir is None else _dir

        if not os.path.isdir(_dir):
            os.mkdir(_dir)

        if self._in_memory:
            new_data = hot_pixel_removal_3D(self.running_data, kernel).view(Data)
            new_data.save(_dir + "/data.hdf5")
            urls = new_data.urls
        else:
            urls = self.running_data.apply_funcs([(hot_pixel_removal_2D, [kernel])],
                                                 save=_dir + "/data.hdf5", text="Applying hot pixel removal",
                                                 operation=Operation.HP)
            if urls is None:
                return

        if indices is not None:
            new_urls = numpy.array(self.get_data().urls, dtype=object)
            new_urls[indices] = urls
            new_data = Data(new_urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)
        else:
            new_data = Data(urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)

        return Dataset(_dir=_dir, data=new_data, dims=self.__dims, transformation=self.transformation,
                       in_memory=self._in_memory)
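
    # Hypothetical usage sketch (kernel size is illustrative):
    #     despeckled = dataset.apply_hot_pixel_removal(kernel=3)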

    def apply_threshold_removal(self, bottom=None, top=None, indices=None, _dir=None):
        """
        Applies bottom and/or top threshold to Data, and saves the new data
        into disk.

        :param int bottom: bottom threshold to apply.
        :param int top: top threshold to apply.
        :param indices: Indices of the images to apply the threshold.
            If None, the threshold is applied to all the data.
        :type indices: Union[None, array_like]
        :return: dataset with data of same size as `self.data` but with the
            modified images. The urls of the modified images are replaced with
            the new urls.
        :rtype: Dataset
        """

        self.running_data = self.get_data(indices)

        _dir = self.dir if _dir is None else _dir

        if not os.path.isdir(_dir):
            os.mkdir(_dir)

        if self._in_memory:
            new_data = threshold_removal(self.running_data, bottom, top).view(Data)
            new_data.save(_dir + "/data.hdf5")
            urls = new_data.urls
        else:
            urls = self.running_data.apply_funcs([(threshold_removal, [bottom, top])],
                                                 save=_dir + "/data.hdf5", text="Applying threshold",
                                                 operation=Operation.THRESHOLD)
            if urls is None:
                return

        if indices is not None:
            new_urls = numpy.array(self.get_data().urls, dtype=object)
            new_urls[indices] = urls
            new_data = Data(new_urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)
        else:
            new_data = Data(urls.reshape(self.data.urls.shape), self.data.metadata,
                            self._in_memory)

        return Dataset(_dir=_dir, data=new_data, dims=self.__dims, transformation=self.transformation,
                       in_memory=self._in_memory)
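
    # Hypothetical usage sketch (threshold values are illustrative):
    #     thresholded = dataset.apply_threshold_removal(bottom=10, top=4000)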

    def apply_roi(self, origin=None, size=None, center=None, indices=None, roi_dir=None):
        """
        Applies a region of interest to the data.

        :param origin: Origin of the roi
        :param size: [Height, Width] of the roi.
        :param center: Center of the roi
        :type origin: Union[2d vector, None]
        :type center: Union[2d vector, None]
        :type size: Union[2d vector, None]
        :param indices: Indices of the images to apply the roi.
            If None, the roi is applied to all the data.
        :type indices: Union[None, array_like]
        :param roi_dir: Directory path for the new dataset
        :type roi_dir: str
        :returns: dataset with data with roi applied.
            Note: To preserve consistency of shape between images, if `indices`
            is not None, only the modified data is returned.
        :rtype: Dataset
        """

        roi_dir = self.dir if roi_dir is None else roi_dir
        if not os.path.isdir(roi_dir):
            os.mkdir(roi_dir)

        self.running_data = self.get_data(indices)
        if self._in_memory:
            new_data = apply_3D_ROI(self.running_data, origin, size, center).view(Data)
            new_data.save(roi_dir + "/data.hdf5", new_shape=new_data.shape)
        else:
            shape = numpy.append([self.running_data.shape[0]], apply_2D_ROI(self.running_data[0], origin, size, center).shape)
            urls = self.running_data.apply_funcs([(apply_2D_ROI, [origin, size, center])],
                                                 save=roi_dir + "/data.hdf5", text="Applying roi",
                                                 operation=Operation.ROI,
                                                 new_shape=shape)
            if urls is None:
                return
            new_data = Data(urls, self.running_data.metadata, self._in_memory)

        transformation = ([apply_2D_ROI(axis, origin, size, center) for axis in self.transformation]
                          if self.transformation else None)

        if indices is None:
            shape = list(self.data.shape)[:-2]
            shape.append(new_data.shape[-2])
            shape.append(new_data.shape[-1])
            new_data = new_data.reshape(shape)
        return Dataset(_dir=roi_dir, data=new_data, dims=self.__dims, transformation=transformation,
                       in_memory=self._in_memory)
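
    # Hypothetical usage sketch (roi geometry is illustrative):
    #     cropped = dataset.apply_roi(center=[1024, 1024], size=[256, 256])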

    def find_shift(self, dimension=None, steps=50, indices=None):
        """
        Find shift of the data or part of it.

        :param dimension: Parameters with the position of the data in the reshaped
            array.
        :type dimension: Union[None, tuple, array_like]
        :param int steps: See `core.imageRegistration.shift_detection`
        :param indices: Boolean index list with True in the images to apply the shift to.
            If None, the shift detection is applied to all the data.
        :type indices: Union[None, array_like]
        :returns: Array with shift per frame.
        """
        return shift_detection(self.get_data(indices, dimension), steps)
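
    # Hypothetical usage sketch: detect the shift once, then apply it.
    #     shift = dataset.find_shift(steps=50)
    #     shifted = dataset.apply_shift(shift)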

    def apply_shift(self, shift, dimension=None, shift_approach="fft", indices=None,
                    callback=None, _dir=None):
        """
        Apply shift of the data or part of it and save new data into disk.

        :param array_like shift: Shift per frame.
        :param dimension: Parameters with the position of the data in the reshaped
            array.
        :type dimension: Union[None, tuple, array_like]
        :param Union['fft', 'linear'] shift_approach: Method to use to apply the shift.
        :param indices: Boolean index list with True in the images to apply the shift to.
            If None, the shift is applied to all the data.
        :type indices: Union[None, array_like]
        :param Union[function, None] callback: Callback
        :returns: dataset with data of same size as `self.data` but with the
            modified images. The urls of the modified images are replaced with
            the new urls.
        """
        assert len(shift) > 0, "Shift list can't be empty"

        if not numpy.any(shift):
            return self

        _dir = self.dir if _dir is None else _dir

        if not os.path.isdir(_dir):
            os.mkdir(_dir)

        data = self.get_data(indices, dimension)
        self._lock.acquire()
        self.operations_state[Operation.SHIFT] = 1
        self._lock.release()

        _file = h5py.File(_dir + '/data.hdf5', 'a')
        dataset_name = "dataset"
        if "dataset" in _file:
            _file.create_dataset("update_dataset", data=_file["dataset"])
            dataset_name = "update_dataset"
        else:
            _file.create_dataset("dataset", data.shape, dtype=data.dtype)

        io_utils.advancement_display(0, len(data), "Applying shift")
        if dimension is not None:

            # Convert dimension and value into list
            if type(dimension[0]) is int:
                dimension[0] = [dimension[0]]
                dimension[1] = [dimension[1]]
            urls = []
            for i in range(len(data)):
                if not self.operations_state[Operation.SHIFT]:
                    del _file["update_dataset"]
                    return
                img = apply_shift(data[i], shift[:, i], shift_approach)
                if (shift[:, i] > 1).all():
                    shift_approach = "linear"
                _file[dataset_name][i] = img
                urls.append(DataUrl(file_path=_dir + '/data.hdf5', data_path="/dataset", data_slice=i, scheme='silx'))
                io_utils.advancement_display(i + 1, len(data), "Applying shift")

            # Replace specific urls that correspond to the modified data
            new_urls = numpy.array(self.data.urls, dtype=object)
            copy_urls = new_urls
            if indices is not None:
                # Create array of booleans to know which indices we have
                bool_indices = numpy.zeros(self.get_data().shape[:-2], dtype=bool)
                bool_indices[indices] = True
                bool_indices = bool_indices.reshape(self.data.shape[:-2])
                for i, dim in enumerate(sorted(dimension[0])):
                    # Flip axis to be consistent with the data shape
                    axis = self.dims.ndim - dim - 1
                    copy_urls = numpy.swapaxes(copy_urls, 0, axis)[dimension[1][i], :]
                    bool_indices = numpy.swapaxes(bool_indices, 0, axis)[dimension[1][i], :]
                copy_urls[bool_indices] = urls
            else:
                for i, dim in enumerate(sorted(dimension[0])):
                    # Flip axis to be consistent with the data shape
                    axis = self.dims.ndim - dim - 1
                    copy_urls = numpy.swapaxes(copy_urls, 0, axis)[dimension[1][i], :]
                copy_urls[:] = urls
        else:
            urls = []
            for i in range(len(data)):
                if not self.operations_state[Operation.SHIFT]:
                    del _file["update_dataset"]
                    return
                if (shift[:, i] > 1).all():
                    shift_approach = "linear"