Skip to content

metadata

Functions to create a metadata dataframe from Yokogawa files.

_create_well_ids(row_series, col_series, plate_type)

Create well_id list from XML metadata

Handles the conversion of Cellvoyager XML metadata into well identifiers. Returns well identifiers like A01, B02 etc. for 96 & 384 well plates. Returns well identifiers like A01.a1, A01.b2 etc. for 1536 well plates. Defaults to the processing used for 96 & 384 well plates, unless the plate_type is 1536. For 1536 well plates, the first 4x4 wells go into A01.a1 - A01.d4 and so on.

PARAMETER DESCRIPTION
row_series

Series with index being the index of the image and the value the row position (starting at 1 for top left).

TYPE: Series

col_series

Series with index being the index of the image and the value the col position (starting at 1 for top left).

TYPE: Series

plate_type

Number of wells in the plate layout. Used to determine whether it's a 1536 well plate or a different layout.

TYPE: int

RETURNS DESCRIPTION
list[str]

list of well_ids

Source code in fractal_tasks_core/cellvoyager/metadata.py
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def _create_well_ids(
    row_series: pd.Series,
    col_series: pd.Series,
    plate_type: int,
) -> list[str]:
    """
    Create well_id list from XML metadata

    Handles the conversion of Cellvoyager XML metadata into well indentifiers.
    Returns well identifiers like A01, B02 etc. for 96 & 384 well plates.
    Returns well identifiers like A01.a1, A01.b2 etc. for 1536 well plates.
    Defaults to the processing used for 96 & 384 well plates, unless the
    plate_type is 1536. For 1536 well plates, the first 4x4 wells go into
    A01.a1 - A01.d4 and so on.

    Args:
        row_series: Series with index being the index of the image and the
            value the row position (starting at 1 for top left).
        col_series: Series with index being the index of the image and the
            value the col position (starting at 1 for top left).
        plate_type: Number of wells in the plate layout. Used to determine
            whether it's a 1536 well plate or a different layout.

    Returns:
        list of well_ids

    """
    if plate_type == 1536:
        # Row are built of a base letter (matching to the 96 well plate layout)
        # and a sub letter (position of the 1536 well within the 4x4 grid,
        # can be a-d) of that well
        row_base = [chr(math.floor((x - 1) / 4) + 65) for x in (row_series)]
        row_sub = [chr((x - 1) % 4 + 97) for x in (row_series)]
        # Columns are built of a base number (matching to the 96 well plate
        # layout) and a sub integer (position of the 1536 well within the
        # 4x4 grid, can be 1-4) of that well
        col_base = [math.floor((x - 1) / 4) + 1 for x in col_series]
        col_sub = [(x - 1) % 4 + 1 for x in col_series]
        well_ids = []
        for i in range(len(row_base)):
            well_ids.append(
                f"{row_base[i]}{col_base[i]:02}.{row_sub[i]}{col_sub[i]}"
            )
    else:
        row_str = [chr(x) for x in (row_series + 64)]
        well_ids = [f"{a}{b:02}" for a, b in zip(row_str, col_series)]

    return well_ids

calculate_steps(site_series)

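Calculate the Z step size of a single site and channel, raising an error if the spacing between planes is not constant.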

PARAMETER DESCRIPTION
site_series

Series of Z positions (z_micrometer) for a given site of a given channel.

TYPE: Series

Source code in fractal_tasks_core/cellvoyager/metadata.py
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
def calculate_steps(site_series: pd.Series):
    """
    Calculate the step size in Z for one series of Z positions.

    Args:
        site_series: The z_micrometer series for a given site of a given
            channel.

    Returns:
        The mean of the differences between consecutive Z positions.

    Raises:
        NotImplementedError: If the differences between consecutive Z
            positions are not all (numerically) equal to the first one.
    """

    # The first element of diff() is always NaN (there is nothing to compare
    # it to), so it is dropped before the comparison.
    z_deltas = site_series.diff().dropna().astype(float)
    first_delta = z_deltas.iloc[0]

    if np.allclose(first_delta, np.array(z_deltas)):
        return z_deltas.mean()

    raise NotImplementedError(
        "When parsing the Yokogawa mlf file, some sites "
        "had varying step size in Z. "
        "That is not supported for the OME-Zarr parsing"
    )

check_group_consistency(grouped_df, message='')

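Check that the values within each group of a grouped dataframe are identical, and raise a ValueError if they vary.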

PARAMETER DESCRIPTION
grouped_df

Grouped dataframe (may be multi-index, multi-column) whose per-group maximum and minimum are compared.

TYPE: DataFrame

message

Description of the check, included in the error message if the check fails.

TYPE: str DEFAULT: ''

Source code in fractal_tasks_core/cellvoyager/metadata.py
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
def check_group_consistency(grouped_df: pd.DataFrame, message: str = ""):
    """
    Verify that a (grouped) dataframe shows no variability.

    The difference between the maximum and the minimum is computed and
    summed up; the check passes if that sum is numerically zero.

    Args:
        grouped_df: Object providing `.max()` and `.min()`, e.g. a grouped
            multi-index, multi-column dataframe.
        message: Text included in the error message when the check fails.

    Raises:
        ValueError: If the summed max-min difference is not close to zero.
    """

    diff_df = grouped_df.max() - grouped_df.min()
    total_spread = np.sum(np.sum(diff_df))

    if np.isclose(total_spread, 0.0):
        return

    raise ValueError(
        "During metadata parsing, a consistency check failed: \n"
        f"{message}\n"
        f"Difference dataframe: \n{diff_df}"
    )

get_earliest_time_per_site(mlf_frame)

Get the earliest acquisition time for each site (well and field).

PARAMETER DESCRIPTION
mlf_frame

Table of per-image mlf metadata containing the well_id, FieldIndex and Time columns.

TYPE: DataFrame

Source code in fractal_tasks_core/cellvoyager/metadata.py
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
def get_earliest_time_per_site(mlf_frame: pd.DataFrame) -> pd.Series:
    """
    Get the earliest time information for each site.

    A site contains time information for each plane of each channel, so
    only the minimum "Time" value per (well_id, FieldIndex) is returned.

    Args:
        mlf_frame: Table with (at least) the columns "well_id",
            "FieldIndex" and "Time".

    Returns:
        Series named "Time", indexed by (well_id, FieldIndex), with the
        per-site minimum converted to UTC datetimes.
    """

    # Select the "Time" column before aggregating, so that the minimum is
    # not computed for every other column of the table (which is wasted work
    # and can fail for columns whose values cannot be ordered).
    earliest_time = mlf_frame.groupby(["well_id", "FieldIndex"])["Time"].min()
    return pd.to_datetime(earliest_time, utc=True)

get_z_steps(mlf_frame)

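Extract the Z spacing and the number of Z planes for each site, checking that both are consistent between channels.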

PARAMETER DESCRIPTION
mlf_frame

Table of per-image mlf metadata containing the well_id, FieldIndex, ActionIndex, Ch and Z columns.

TYPE: DataFrame

Source code in fractal_tasks_core/cellvoyager/metadata.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
def get_z_steps(mlf_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Extract Z information (spacing & number of planes) for each site.

    The "Z" values are grouped by well, field, action and channel. The Z
    spacing of each group is computed with `calculate_steps`, and both the
    spacing and the number of planes are checked for consistency between
    the channels of a site with `check_group_consistency`.

    Args:
        mlf_frame: Table with (at least) the columns "well_id",
            "FieldIndex", "ActionIndex", "Ch" and "Z".

    Returns:
        Dataframe indexed by (well_id, FieldIndex) with the columns
        "pixel_size_z" (Z spacing) and "z_pixel" (number of Z planes).
    """

    site_levels = ["well_id", "FieldIndex"]
    channel_levels = site_levels + ["ActionIndex", "Ch"]

    # Group by well, field, action & channel
    grouped_sites_z = (
        mlf_frame.loc[:, channel_levels + ["Z"]]
        .set_index(channel_levels)
        .groupby(level=[0, 1, 2, 3])
    )
    # Number of Z planes per channel; computed once and reused below
    planes_per_channel = grouped_sites_z.count()

    # If there is only 1 Z step, set the Z spacing to the count of planes => 1
    if planes_per_channel["Z"].max() == 1:
        z_data = planes_per_channel.groupby(site_levels)
    else:
        # Group the whole site (combine channels), because Z steps need to be
        # consistent between channels for OME-Zarr.
        z_data = grouped_sites_z.apply(calculate_steps).groupby(site_levels)

    check_group_consistency(
        z_data, message="Comparing Z steps between channels"
    )

    # Ensure that channels have the same number of z planes and
    # reduce it to one value.
    # Only check if there is more than one channel available. The boolean
    # frame is reduced explicitly: the builtin any() would iterate over the
    # column labels of the dataframe and therefore always be True.
    planes_per_site = planes_per_channel.groupby(site_levels)
    if (planes_per_site.count() > 1).to_numpy().any():
        check_group_consistency(
            planes_per_site,
            message="Checking number of Z steps between channels",
        )

    z_steps = planes_per_site.mean().astype(int)

    # Combine the two dataframes
    z_frame = pd.concat([z_data.mean(), z_steps], axis=1)
    z_frame.columns = ["pixel_size_z", "z_pixel"]
    return z_frame

parse_yokogawa_metadata(mrf_path, mlf_path, *, include_patterns=None, exclude_patterns=None)

Parse Yokogawa CV7000 metadata files and prepare site-level metadata.

PARAMETER DESCRIPTION
mrf_path

Full path to MeasurementDetail.mrf metadata file.

TYPE: Union[str, Path]

mlf_path

Full path to MeasurementData.mlf metadata file.

TYPE: Union[str, Path]

include_patterns

List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

exclude_patterns

List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

Source code in fractal_tasks_core/cellvoyager/metadata.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def parse_yokogawa_metadata(
    mrf_path: Union[str, Path],
    mlf_path: Union[str, Path],
    *,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, dict[str, int]]:
    """
    Parse Yokogawa CV7000 metadata files and prepare site-level metadata.

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.
        mlf_path: Full path to MeasurementData.mlf metadata file.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        A tuple of the site-level metadata table, indexed by
        (well_id, FieldIndex), and a dictionary mapping each well_id to the
        number of image entries of that well in the mlf table.

    Raises:
        FileNotFoundError: If the mrf file or the mlf file does not exist.
        ValueError: If the per-well image counts do not add up to the
            number of rows of the mlf table.
    """

    # Convert paths to strings
    mrf_str = Path(mrf_path).as_posix()
    mlf_str = Path(mlf_path).as_posix()

    # Ensure mrf & mlf files exist. Both files are needed below, so a single
    # missing file is already an error.
    if not (Path(mrf_str).exists() and Path(mlf_str).exists()):
        raise FileNotFoundError(
            "Could not find the mlf & mrf metadata files. Expected to find "
            "them at: \n"
            f"{mrf_str=}\n"
            f"{mlf_str=}"
        )

    mrf_frame, mlf_frame, error_count = read_metadata_files(
        mrf_str,
        mlf_str,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )

    # Aggregate information from the mlf file
    per_site_parameters = ["X", "Y"]

    grouping_params = ["well_id", "FieldIndex"]
    grouped_sites = mlf_frame.loc[
        :, grouping_params + per_site_parameters
    ].groupby(by=grouping_params)

    check_group_consistency(grouped_sites, message="X & Y stage positions")
    site_metadata = grouped_sites.mean()
    site_metadata.columns = ["x_micrometer", "y_micrometer"]
    site_metadata["z_micrometer"] = 0

    site_metadata = pd.concat(
        [
            site_metadata,
            get_z_steps(mlf_frame),
            get_earliest_time_per_site(mlf_frame),
        ],
        axis=1,
    )

    # Aggregate information from the mrf file
    mrf_columns = [
        "HorizontalPixelDimension",
        "VerticalPixelDimension",
        "HorizontalPixels",
        "VerticalPixels",
        "InputBitDepth",
    ]
    check_group_consistency(
        mrf_frame.loc[:, mrf_columns], message="Image dimensions"
    )
    # The consistency check above ensures that the values do not vary
    # between channels, so the maximum is the common value.
    site_metadata["pixel_size_x"] = mrf_frame.loc[
        :, "HorizontalPixelDimension"
    ].max()
    site_metadata["pixel_size_y"] = mrf_frame.loc[
        :, "VerticalPixelDimension"
    ].max()
    site_metadata["x_pixel"] = int(mrf_frame.loc[:, "HorizontalPixels"].max())
    site_metadata["y_pixel"] = int(mrf_frame.loc[:, "VerticalPixels"].max())
    site_metadata["bit_depth"] = int(mrf_frame.loc[:, "InputBitDepth"].max())

    if error_count > 0:
        logger.info(
            f"There were {error_count} ERR entries in the metadatafile. "
            f"Still succesfully parsed {len(site_metadata)} sites. "
        )

    # Compute expected number of image files for each well
    list_of_wells = set(site_metadata.index.get_level_values("well_id"))
    number_of_files = {}
    for this_well_id in list_of_wells:
        # Convert the numpy integer to a built-in int, so that the values
        # match the annotated return type.
        num_images = int((mlf_frame.well_id == this_well_id).sum())
        logger.info(
            f"Expected number of images for well {this_well_id}: {num_images}"
        )
        number_of_files[this_well_id] = num_images
    # Check that the sum of per-well file numbers correspond to the total
    # file number
    if sum(number_of_files.values()) != len(mlf_frame):
        raise ValueError(
            "Error while counting the number of image files per well.\n"
            f"{len(mlf_frame)=}\n"
            f"{number_of_files=}"
        )

    return site_metadata, number_of_files

read_metadata_files(mrf_path, mlf_path, include_patterns=None, exclude_patterns=None)

Create tables for mrf & mlf Yokogawa metadata.

PARAMETER DESCRIPTION
mrf_path

Full path to MeasurementDetail.mrf metadata file.

TYPE: str

mlf_path

Full path to MeasurementData.mlf metadata file.

TYPE: str

include_patterns

List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

exclude_patterns

List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

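Returns: the mrf table (restricted to the channels present in the mlf table), the mlf table, and the number of ERR entries among the matching mlf records.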

Source code in fractal_tasks_core/cellvoyager/metadata.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def read_metadata_files(
    mrf_path: str,
    mlf_path: str,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, pd.DataFrame, int]:
    """
    Create tables for mrf & mlf Yokogawa metadata.

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.
        mlf_path: Full path to MeasurementData.mlf metadata file.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        A tuple of the mrf table (restricted to the channels that occur in
        the mlf table), the mlf table and the error count, the latter two
        as returned by `read_mlf_file`.

    """

    # parsing of mrf & mlf files are based on the
    # yokogawa_image_collection_task v0.5 in drogon, written by Dario Vischi.
    # https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py  # noqa
    # Now modified for Fractal use

    # NOTE: filter_position & filter_wheel_position are parsed, but not
    # processed further. Figure out how to save them as relevant metadata for
    # use e.g. during illumination correction
    channel_table, plate_type = read_mrf_file(mrf_path)

    # NOTE: Time points are parsed as part of the mlf table, but currently
    # not processed further. Once we tackle time-resolved data, parse from
    # here.
    image_table, error_count = read_mlf_file(
        mlf_path,
        plate_type,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )

    # Only keep the channels that were imaged and are included in the
    # filters (see issue #287)
    imaged_channels = image_table["Ch"].unique()
    is_imaged = channel_table["Ch"].isin(imaged_channels)

    return channel_table[is_imaged], image_table, error_count

read_mlf_file(mlf_path, plate_type, include_patterns=None, exclude_patterns=None)

Process the mlf metadata file of a Cellvoyager CV7K/CV8K.

PARAMETER DESCRIPTION
mlf_path

Full path to MeasurementData.mlf metadata file.

TYPE: str

plate_type

Plate layout, integer for the number of potential wells.

TYPE: int

include_patterns

List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

exclude_patterns

List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html

TYPE: Optional[list[str]] DEFAULT: None

RETURNS DESCRIPTION
mlf_frame

pd.DataFrame with relevant metadata per image

TYPE: DataFrame

error_count

Count of errors found during metadata processing

TYPE: int

Source code in fractal_tasks_core/cellvoyager/metadata.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
def read_mlf_file(
    mlf_path: str,
    plate_type: int,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, int]:
    """
    Process the mlf metadata file of a Cellvoyager CV7K/CV8K.

    Args:
        mlf_path: Full path to MeasurementData.mlf metadata file.
        plate_type: Plate layout, integer for the number of potential wells.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        mlf_frame: pd.DataFrame with relevant metadata per image
        error_count: Count of errors found during metadata processing

    Raises:
        ValueError: If an include pattern matches no image filename, or if
            no image filename is left after applying all patterns.
    """

    # Load the whole MeasurementData.mlf file
    mlf_frame_raw = pd.read_xml(mlf_path)

    # Remove all rows that do not match the given patterns
    logger.info(
        f"Read {mlf_path}, apply following include patterns to "
        f"image filenames: {include_patterns} apply the following exlcude "
        f"patterns to image filenames: {exclude_patterns}"
    )

    if include_patterns or exclude_patterns:
        filenames = mlf_frame_raw.MeasurementRecord

        # Include patterns: a row is kept only if it matches every include
        # pattern. Without include patterns, all rows are kept.
        keep_row = pd.Series(True, index=filenames.index, dtype=bool)
        for pattern in include_patterns or []:
            actual_pattern = fnmatch.translate(pattern)
            # na=False: rows without a filename never match. This keeps the
            # mask a plain boolean Series (no NaN / object dtype).
            new_matches = filenames.str.fullmatch(actual_pattern, na=False)
            if not new_matches.any():
                raise ValueError(
                    f"In {mlf_path} there is no image filename "
                    f'matching "{actual_pattern}".'
                )
            keep_row = keep_row & new_matches

        # Exclude patterns: a row is dropped if it matches any exclude
        # pattern. Without exclude patterns, nothing is excluded.
        exclude_row = pd.Series(False, index=filenames.index, dtype=bool)
        for pattern in exclude_patterns or []:
            actual_pattern = fnmatch.translate(pattern)
            new_matches = filenames.str.fullmatch(actual_pattern, na=False)
            exclude_row = exclude_row | new_matches

        # Combine included list with exclusions
        keep_row = keep_row & ~exclude_row

        if not keep_row.any():
            raise ValueError(
                f"In {mlf_path} there is no image filename "
                f"matching {include_patterns} but not excluded by the pattern "
                f"{exclude_patterns}."
            )
        mlf_frame_matching = mlf_frame_raw[keep_row.values].copy()
    else:
        mlf_frame_matching = mlf_frame_raw.copy()

    # Create a well ID column
    # Row & column are provided as int from XML metadata
    mlf_frame_matching["well_id"] = _create_well_ids(
        mlf_frame_matching["Row"], mlf_frame_matching["Column"], plate_type
    )

    # Flip Y axis to align to image coordinate system
    mlf_frame_matching["Y"] = -mlf_frame_matching["Y"]

    # Compute number of errors
    error_count = (mlf_frame_matching["Type"] == "ERR").sum()

    # We're only interested in the image metadata
    mlf_frame = mlf_frame_matching[mlf_frame_matching["Type"] == "IMG"]

    return mlf_frame, error_count

read_mrf_file(mrf_path)

Parses the mrf metadata file

PARAMETER DESCRIPTION
mrf_path

Full path to MeasurementDetail.mrf metadata file.

TYPE: str

RETURNS DESCRIPTION
DataFrame

Parsed mrf pandas table with one row per channel imaged

int

The plate_type: The number of wells

Source code in fractal_tasks_core/cellvoyager/metadata.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def read_mrf_file(mrf_path: str) -> tuple[pd.DataFrame, int]:
    """
    Parses the mrf metadata file

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.

    Returns:
        Parsed mrf pandas table with one row per `MeasurementChannel`
            element
        The plate_type: The number of wells (RowCount * ColumnCount)
    """
    # Define the namespaces
    ns = {"bts": "http://www.yokogawa.co.jp/BTS/BTSSchema/1.0"}
    channel_df = pd.read_xml(
        mrf_path, xpath=".//bts:MeasurementChannel", namespaces=ns
    )
    meas_df = pd.read_xml(
        mrf_path, xpath="//bts:MeasurementDetail", namespaces=ns
    )
    # Extract the scalar explicitly: calling int() on a Series is deprecated
    # in pandas.
    row_count = int(meas_df["RowCount"].iloc[0])
    column_count = int(meas_df["ColumnCount"].iloc[0])
    plate_type = row_count * column_count
    return channel_df, plate_type