metadata

Functions to create a metadata dataframe from Yokogawa files.

`_create_well_ids(row_series, col_series, plate_type)` ¶

Create well_id list from XML metadata

Handles the conversion of Cellvoyager XML metadata into well identifiers. Returns well identifiers like A01, B02 etc. for 96 & 384 well plates. Returns well identifiers like A01.a1, A01.b2 etc. for 1536 well plates. Defaults to the processing used for 96 & 384 well plates, unless the plate_type is 1536. For 1536 well plates, the first 4x4 wells go into A01.a1 - A01.d4 and so on.

PARAMETER	DESCRIPTION
`row_series`	Series with index being the index of the image and the value the row position (starting at 1 for top left). TYPE: `Series`
`col_series`	Series with index being the index of the image and the value the col position (starting at 1 for top left). TYPE: `Series`
`plate_type`	Number of wells in the plate layout. Used to determine whether it's a 1536 well plate or a different layout. TYPE: `int`

RETURNS	DESCRIPTION
`list[str]`	list of well_ids

Source code in fractal_tasks_core/cellvoyager/metadata.py

def _create_well_ids(
    row_series: pd.Series,
    col_series: pd.Series,
    plate_type: int,
) -> list[str]:
    """
    Build the list of well identifiers from Cellvoyager XML metadata.

    For every plate layout except 1536 wells, identifiers look like `A01`,
    `B02`, ... (row letter followed by a zero-padded column number).

    For 1536-well plates, identifiers look like `A01.a1`, `A01.b2`, ...:
    the part before the dot is the well of the matching 96-well layout, the
    part after the dot is the position inside the 4x4 grid of that well
    (sub-row `a`-`d`, sub-column `1`-`4`). The first 4x4 wells therefore
    go into `A01.a1` - `A01.d4`, and so on.

    Args:
        row_series: Series with index being the index of the image and the
            value the row position (starting at 1 for top left).
        col_series: Series with index being the index of the image and the
            value the col position (starting at 1 for top left).
        plate_type: Number of wells in the plate layout. Used to determine
            whether it's a 1536 well plate or a different layout.

    Returns:
        list of well_ids

    """
    if plate_type != 1536:
        # 64 + 1 == ord("A"): row 1 maps to "A", row 2 to "B", ...
        row_letters = [chr(code) for code in (row_series + 64)]
        return [
            f"{letter}{col:02}" for letter, col in zip(row_letters, col_series)
        ]

    well_ids = []
    for row, col in zip(row_series, col_series):
        # Base letter/number: well of the matching 96-well layout
        base_letter = chr(math.floor((row - 1) / 4) + 65)
        base_number = math.floor((col - 1) / 4) + 1
        # Sub letter (a-d) / sub number (1-4): position inside the 4x4 grid
        sub_letter = chr((row - 1) % 4 + 97)
        sub_number = (col - 1) % 4 + 1
        well_ids.append(
            f"{base_letter}{base_number:02}.{sub_letter}{sub_number}"
        )
    return well_ids

`calculate_steps(site_series)` ¶

Calculate the step size in Z for a given site of a given channel, and check that it is constant.

PARAMETER	DESCRIPTION
`site_series`	The z_micrometer series for a given site of a given channel. TYPE: `Series`

Source code in fractal_tasks_core/cellvoyager/metadata.py

def calculate_steps(site_series: pd.Series):
    """
    Calculate the step size in Z for a given site of a given channel.

    Args:
        site_series: The z_micrometer series for a given site of a given
            channel.

    Returns:
        The mean difference between consecutive Z positions.

    Raises:
        NotImplementedError: If the differences between consecutive Z
            positions are not all close to the first difference, i.e. the
            Z step size varies.
    """

    # `diff()` yields NaN for the first entry (it has no predecessor), so
    # that entry is dropped before comparing the step sizes
    z_deltas = site_series.diff().dropna().astype(float)

    spacing_is_uniform = np.allclose(z_deltas.iloc[0], np.array(z_deltas))
    if spacing_is_uniform:
        return z_deltas.mean()

    raise NotImplementedError(
        "When parsing the Yokogawa mlf file, some sites "
        "had varying step size in Z. "
        "That is not supported for the OME-Zarr parsing"
    )

`check_group_consistency(grouped_df, message='')` ¶

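Check consistency within the groups of a grouped dataframe (multi-index, multi-column); raises an exception if there is variability.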

PARAMETER	DESCRIPTION
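`grouped_df`	Grouped dataframe whose per-group maximum and minimum are compared. TYPE: `DataFrame`
`message`	Text included in the error message if the consistency check fails. TYPE: `str` DEFAULT: `''`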

Source code in fractal_tasks_core/cellvoyager/metadata.py

def check_group_consistency(grouped_df: pd.DataFrame, message: str = ""):
    """
    Check that values do not vary within the groups of a grouped dataframe.

    Works for multi-index, multi-column dataframes: the difference between
    the `max()` and the `min()` of `grouped_df` is computed, and the sum of
    all those differences has to be close to zero.

    Args:
        grouped_df: Object providing `max()` and `min()` (e.g. a grouped
            dataframe) whose values are expected to be constant per group.
        message: Text included in the error message if the check fails.

    Raises:
        ValueError: If the summed difference between maxima and minima is
            not close to zero.
    """

    spread = grouped_df.max() - grouped_df.min()
    total_spread = spread.to_numpy().sum()
    if np.isclose(total_spread, 0.0):
        return

    raise ValueError(
        "During metadata parsing, a consistency check failed: \n"
        f"{message}\n"
        f"Difference dataframe: \n{spread}"
    )

`get_earliest_time_per_site(mlf_frame)` ¶

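Get the time information per site. Because a site contains time information for each plane of each channel, only the earliest time per site is returned.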

PARAMETER	DESCRIPTION
`mlf_frame`	Per-image metadata table; must contain the `well_id`, `FieldIndex` and `Time` columns. TYPE: `DataFrame`

Source code in fractal_tasks_core/cellvoyager/metadata.py

def get_earliest_time_per_site(mlf_frame: pd.DataFrame) -> pd.Series:
    """
    Get the earliest acquisition time of each site.

    A site contains time information for each plane of each channel, so
    only the earliest time per site is returned.

    Args:
        mlf_frame: Per-image metadata table. Must contain the `well_id`,
            `FieldIndex` and `Time` columns.

    Returns:
        Series named `Time`, indexed by (`well_id`, `FieldIndex`), holding
            the minimum `Time` of each site converted to UTC datetimes.
    """

    # Select the `Time` column *before* aggregating: only that column is
    # needed, and taking the minimum of unrelated (possibly mixed-type)
    # columns is wasted work that can even fail.
    earliest_time = mlf_frame.groupby(["well_id", "FieldIndex"])["Time"].min()
    return pd.to_datetime(earliest_time, utc=True)

`get_z_steps(mlf_frame)` ¶

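Process mlf_frame to extract Z information (pixel size & steps). Runs consistency checks and returns a site-based Z step dataframe.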

PARAMETER	DESCRIPTION
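`mlf_frame`	Per-image metadata table; must contain the `well_id`, `FieldIndex`, `ActionIndex`, `Ch` and `Z` columns. TYPE: `DataFrame`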

Source code in fractal_tasks_core/cellvoyager/metadata.py

def get_z_steps(mlf_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Extract Z information (Z spacing & number of Z planes) per site.

    Args:
        mlf_frame: Per-image metadata table. Must contain the `well_id`,
            `FieldIndex`, `ActionIndex`, `Ch` and `Z` columns.

    Returns:
        Dataframe indexed by (`well_id`, `FieldIndex`) with two columns:
            `pixel_size_z` (mean Z step of the site, or 1 when no
            channel has more than one Z plane) and `z_pixel` (number of Z
            planes of the site).

    Raises:
        ValueError: Via `check_group_consistency`, if the Z step or the
            number of Z planes differs between the channels of a site.
        NotImplementedError: Via `calculate_steps`, if the Z step varies
            within a site/channel.
    """

    site_keys = ["well_id", "FieldIndex"]
    channel_keys = site_keys + ["ActionIndex", "Ch"]

    # Group the Z positions by well, field, action & channel
    grouped_sites_z = (
        mlf_frame.loc[:, channel_keys + ["Z"]]
        .set_index(channel_keys)
        .groupby(level=[0, 1, 2, 3])
    )

    # Number of Z planes for each (well, field, action, channel)
    planes_per_channel = grouped_sites_z.count()

    # If there is only 1 Z step, set the Z spacing to the count of planes => 1
    if planes_per_channel["Z"].max() == 1:
        z_data = planes_per_channel.groupby(site_keys)
    else:
        # Group the whole site (combine channels), because Z steps need to be
        # consistent between channels for OME-Zarr.
        z_data = grouped_sites_z.apply(calculate_steps).groupby(site_keys)

    check_group_consistency(
        z_data, message="Comparing Z steps between channels"
    )

    # Ensure that channels have the same number of z planes and
    # reduce it to one value.
    # Only check if there is more than one channel available.
    # NOTE: the builtin `any()` must not be applied to the dataframe itself,
    # since iterating a dataframe yields its (truthy) column labels rather
    # than its values.
    planes_per_site = planes_per_channel.groupby(site_keys)
    if (planes_per_site.count() > 1).to_numpy().any():
        check_group_consistency(
            planes_per_site,
            message="Checking number of Z steps between channels",
        )

    z_steps = planes_per_site.mean().astype(int)

    # Combine the two dataframes
    z_frame = pd.concat([z_data.mean(), z_steps], axis=1)
    z_frame.columns = ["pixel_size_z", "z_pixel"]
    return z_frame

`parse_yokogawa_metadata(mrf_path, mlf_path, *, include_patterns=None, exclude_patterns=None)` ¶

Parse Yokogawa CV7000 metadata files and prepare site-level metadata.

PARAMETER	DESCRIPTION
`mrf_path`	Full path to MeasurementDetail.mrf metadata file. TYPE: `Union[str, Path]`
`mlf_path`	Full path to MeasurementData.mlf metadata file. TYPE: `Union[str, Path]`
`include_patterns`	List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`
`exclude_patterns`	List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`

Source code in fractal_tasks_core/cellvoyager/metadata.py

def parse_yokogawa_metadata(
    mrf_path: Union[str, Path],
    mlf_path: Union[str, Path],
    *,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, dict[str, int]]:
    """
    Parse Yokogawa CV7000 metadata files and prepare site-level metadata.

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.
        mlf_path: Full path to MeasurementData.mlf metadata file.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        A tuple `(site_metadata, number_of_files)`, where `site_metadata`
            is a dataframe with one row per site (indexed by `well_id` and
            `FieldIndex`) and `number_of_files` maps each well ID to the
            number of image entries of that well in the mlf table.

    Raises:
        FileNotFoundError: If the mrf file or the mlf file does not exist.
        ValueError: If a consistency check fails, or if the per-well image
            counts do not add up to the total number of images.
    """

    # Convert paths to strings
    mrf_str = Path(mrf_path).as_posix()
    mlf_str = Path(mlf_path).as_posix()

    # Ensure mrf & mlf files exist: *both* are required, so a single missing
    # file is already an error
    if not Path(mrf_str).exists() or not Path(mlf_str).exists():
        raise FileNotFoundError(
            "Could not find the mlf & mrf metadata files. Expected to find "
            "them at: \n"
            f"{mrf_str=}\n"
            f"{mlf_str=}"
        )

    mrf_frame, mlf_frame, error_count = read_metadata_files(
        mrf_str,
        mlf_str,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )

    # Aggregate information from the mlf file
    per_site_parameters = ["X", "Y"]

    grouping_params = ["well_id", "FieldIndex"]
    grouped_sites = mlf_frame.loc[
        :, grouping_params + per_site_parameters
    ].groupby(by=grouping_params)

    # All images of a site must share the same stage position
    check_group_consistency(grouped_sites, message="X & Y stage positions")
    site_metadata = grouped_sites.mean()
    site_metadata.columns = ["x_micrometer", "y_micrometer"]
    site_metadata["z_micrometer"] = 0

    site_metadata = pd.concat(
        [
            site_metadata,
            get_z_steps(mlf_frame),
            get_earliest_time_per_site(mlf_frame),
        ],
        axis=1,
    )

    # Aggregate information from the mrf file
    mrf_columns = [
        "HorizontalPixelDimension",
        "VerticalPixelDimension",
        "HorizontalPixels",
        "VerticalPixels",
        "InputBitDepth",
    ]
    # Image dimensions must be identical for all channels, so that taking
    # the maximum below simply picks that common value
    check_group_consistency(
        mrf_frame.loc[:, mrf_columns], message="Image dimensions"
    )
    site_metadata["pixel_size_x"] = mrf_frame.loc[
        :, "HorizontalPixelDimension"
    ].max()
    site_metadata["pixel_size_y"] = mrf_frame.loc[
        :, "VerticalPixelDimension"
    ].max()
    site_metadata["x_pixel"] = int(mrf_frame.loc[:, "HorizontalPixels"].max())
    site_metadata["y_pixel"] = int(mrf_frame.loc[:, "VerticalPixels"].max())
    site_metadata["bit_depth"] = int(mrf_frame.loc[:, "InputBitDepth"].max())

    if error_count > 0:
        logger.info(
            f"There were {error_count} ERR entries in the metadatafile. "
            f"Still succesfully parsed {len(site_metadata)} sites. "
        )

    # Compute expected number of image files for each well
    list_of_wells = set(site_metadata.index.get_level_values("well_id"))
    number_of_files = {}
    for this_well_id in list_of_wells:
        num_images = (mlf_frame.well_id == this_well_id).sum()
        logger.info(
            f"Expected number of images for well {this_well_id}: {num_images}"
        )
        number_of_files[this_well_id] = num_images
    # Check that the sum of per-well file numbers correspond to the total
    # file number
    if sum(number_of_files.values()) != len(mlf_frame):
        raise ValueError(
            "Error while counting the number of image files per well.\n"
            f"{len(mlf_frame)=}\n"
            f"{number_of_files=}"
        )

    return site_metadata, number_of_files

`read_metadata_files(mrf_path, mlf_path, include_patterns=None, exclude_patterns=None)` ¶

Create tables for mrf & mlf Yokogawa metadata.

PARAMETER	DESCRIPTION
`mrf_path`	Full path to MeasurementDetail.mrf metadata file. TYPE: `str`
`mlf_path`	Full path to MeasurementData.mlf metadata file. TYPE: `str`
`include_patterns`	List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`
`exclude_patterns`	List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`

Returns:

Source code in fractal_tasks_core/cellvoyager/metadata.py

def read_metadata_files(
    mrf_path: str,
    mlf_path: str,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, pd.DataFrame, int]:
    """
    Create tables for mrf & mlf Yokogawa metadata.

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.
        mlf_path: Full path to MeasurementData.mlf metadata file.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        A tuple `(mrf_frame, mlf_frame, error_count)`: the mrf table
            restricted to the channels that occur in the mlf table, the
            mlf table as returned by `read_mlf_file`, and the error count
            returned by `read_mlf_file`.
    """

    # parsing of mrf & mlf files are based on the
    # yokogawa_image_collection_task v0.5 in drogon, written by Dario Vischi.
    # https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py  # noqa
    # Now modified for Fractal use

    # The plate type from the mrf file is needed to build the well IDs
    # of the mlf table
    mrf_frame, plate_type = read_mrf_file(mrf_path)

    # filter_position & filter_wheel_position are parsed, but not
    # processed further. Figure out how to save them as relevant metadata for
    # use e.g. during illumination correction

    mlf_frame, error_count = read_mlf_file(
        mlf_path,
        plate_type,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
    )

    # Only keep the mrf rows of channels that were imaged and that survived
    # the filename filters (see issue #287)
    imaged_channels = mlf_frame["Ch"].unique()
    channel_was_imaged = mrf_frame["Ch"].isin(imaged_channels)

    # Time points are parsed as part of the mlf_frame, but currently not
    # processed further. Once we tackle time-resolved data, parse from here.

    return mrf_frame[channel_was_imaged], mlf_frame, error_count

`read_mlf_file(mlf_path, plate_type, include_patterns=None, exclude_patterns=None)` ¶

Process the mlf metadata file of a Cellvoyager CV7K/CV8K.

PARAMETER	DESCRIPTION
`mlf_path`	Full path to MeasurementData.mlf metadata file. TYPE: `str`
`plate_type`	Plate layout, integer for the number of potential wells. TYPE: `int`
`include_patterns`	List of patterns to filter the image filenames in the mlf metadata table. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`
`exclude_patterns`	List of exclusion patterns. Any file matching any of those patterns is excluded. Patterns must be defined as in https://docs.python.org/3/library/fnmatch.html TYPE: `Optional[list[str]]` DEFAULT: `None`

RETURNS	DESCRIPTION
`mlf_frame`	pd.DataFrame with relevant metadata per image TYPE: `DataFrame`
`error_count`	Count of errors found during metadata processing TYPE: `int`

Source code in fractal_tasks_core/cellvoyager/metadata.py

def read_mlf_file(
    mlf_path: str,
    plate_type: int,
    include_patterns: Optional[list[str]] = None,
    exclude_patterns: Optional[list[str]] = None,
) -> tuple[pd.DataFrame, int]:
    """
    Process the mlf metadata file of a Cellvoyager CV7K/CV8K.

    A row is kept if its image filename matches *all* the include patterns
    and *none* of the exclude patterns. The kept rows get a `well_id`
    column and a sign-flipped `Y` column; only rows of type `IMG` are
    returned.

    Args:
        mlf_path: Full path to MeasurementData.mlf metadata file.
        plate_type: Plate layout, integer for the number of potential wells.
        include_patterns: List of patterns to filter the image filenames in
            the mlf metadata table. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html
        exclude_patterns: List of exclusion patterns. Any file matching any
            of those patterns is excluded. Patterns must be defined as in
            https://docs.python.org/3/library/fnmatch.html

    Returns:
        mlf_frame: pd.DataFrame with relevant metadata per image
        error_count: Count of errors found during metadata processing

    Raises:
        ValueError: If an include pattern matches no filename, or if no
            row is left after applying include and exclude patterns.
    """

    # Load the whole MeasurementData.mlf file
    raw_frame = pd.read_xml(mlf_path)

    logger.info(
        f"Read {mlf_path}, apply following include patterns to "
        f"image filenames: {include_patterns} apply the following exlcude "
        f"patterns to image filenames: {exclude_patterns}"
    )

    if not (include_patterns or exclude_patterns):
        # Nothing to filter
        matching_frame = raw_frame.copy()
    else:
        filenames = raw_frame.MeasurementRecord
        n_records = len(raw_frame)

        # Rows to keep: must match every include pattern
        if include_patterns:
            keep_row = None
            for pattern in include_patterns:
                regex = fnmatch.translate(pattern)
                matches = filenames.str.fullmatch(regex)
                if matches.sum() == 0:
                    raise ValueError(
                        f"In {mlf_path} there is no image filename "
                        f'matching "{regex}".'
                    )
                keep_row = (
                    matches.copy() if keep_row is None else keep_row & matches
                )
        else:
            # Without include patterns, every row is a candidate
            keep_row = pd.Series([True] * n_records)

        # Rows to drop: match at least one exclude pattern
        if exclude_patterns:
            exclude_row = None
            for pattern in exclude_patterns:
                regex = fnmatch.translate(pattern)
                matches = filenames.str.fullmatch(regex)
                exclude_row = (
                    matches.copy()
                    if exclude_row is None
                    else exclude_row | matches
                )
        else:
            # Without exclude patterns, nothing is dropped
            exclude_row = pd.Series([False] * n_records)

        keep_row = keep_row & ~exclude_row

        if keep_row.sum() == 0:
            raise ValueError(
                f"In {mlf_path} there is no image filename "
                f"matching {include_patterns} but not excluded by the pattern "
                f"{exclude_patterns}."
            )
        matching_frame = raw_frame[keep_row.values].copy()

    # Row & column are provided as int from XML metadata
    matching_frame["well_id"] = _create_well_ids(
        matching_frame["Row"], matching_frame["Column"], plate_type
    )

    # Flip Y axis to align to image coordinate system
    matching_frame["Y"] = -matching_frame["Y"]

    # Count the error records, then keep only the image records
    record_type = matching_frame["Type"]
    error_count = (record_type == "ERR").sum()
    mlf_frame = matching_frame[record_type == "IMG"]

    return mlf_frame, error_count

`read_mrf_file(mrf_path)` ¶

Parses the mrf metadata file

PARAMETER	DESCRIPTION
`mrf_path`	Full path to MeasurementDetail.mrf metadata file. TYPE: `str`

RETURNS	DESCRIPTION
`DataFrame`	Parsed mrf pandas table with one row per channel imaged
`int`	The plate_type: The number of wells

Source code in fractal_tasks_core/cellvoyager/metadata.py

def read_mrf_file(mrf_path: str) -> tuple[pd.DataFrame, int]:
    """
    Parse the mrf metadata file.

    Args:
        mrf_path: Full path to MeasurementDetail.mrf metadata file.

    Returns:
        Parsed mrf pandas table with one row per `MeasurementChannel`
            element of the file
        The plate_type: The number of wells, computed as `RowCount` times
            `ColumnCount` of the first `MeasurementDetail` element
    """
    # XML namespace used by the elements of the mrf file
    namespaces = {"bts": "http://www.yokogawa.co.jp/BTS/BTSSchema/1.0"}

    channel_df = pd.read_xml(
        mrf_path,
        xpath=".//bts:MeasurementChannel",
        namespaces=namespaces,
    )
    detail_df = pd.read_xml(
        mrf_path,
        xpath="//bts:MeasurementDetail",
        namespaces=namespaces,
    )

    n_rows, n_columns = (
        int(detail_df[key].iloc[0]) for key in ("RowCount", "ColumnCount")
    )
    return channel_df, n_rows * n_columns

`sanitize_string(value)` ¶

Make string safe to be used in file/folder names.

Replace any special character with an underscore, where special characters are:

>>> string.punctuation
'!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~'
>>> string.whitespace
' \t\n\r\x0b\x0c'

PARAMETER	DESCRIPTION
`value`	Input string TYPE: `str`

RETURNS	DESCRIPTION
`str`	Sanitized value

Source code in fractal_tasks_core/cellvoyager/metadata.py

def sanitize_string(value: str) -> str:
    """
    Make string safe to be used in file/folder names.

    Replace any special character with an
    underscore, where special characters are:


        >>> string.punctuation
        '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
        >>> string.whitespace
        ' \\t\\n\\r\\x0b\\x0c'

    Args:
        value: Input string

    Returns:
        Sanitized value
    """
    # Accumulate the replacements on `new_value`: replacing on the untouched
    # `value` in every iteration would only keep the substitution of the
    # last special character (and leave the result undefined if there are
    # no special characters at all).
    new_value = value
    for character in __SPECIAL_CHARACTERS__:
        new_value = new_value.replace(character, "_")
    return new_value

metadata

_create_well_ids(row_series, col_series, plate_type) ¶

calculate_steps(site_series) ¶

check_group_consistency(grouped_df, message='') ¶

get_earliest_time_per_site(mlf_frame) ¶

get_z_steps(mlf_frame) ¶

parse_yokogawa_metadata(mrf_path, mlf_path, *, include_patterns=None, exclude_patterns=None) ¶

read_metadata_files(mrf_path, mlf_path, include_patterns=None, exclude_patterns=None) ¶

read_mlf_file(mlf_path, plate_type, include_patterns=None, exclude_patterns=None) ¶

read_mrf_file(mrf_path) ¶

sanitize_string(value) ¶

`_create_well_ids(row_series, col_series, plate_type)` ¶

`calculate_steps(site_series)` ¶

`check_group_consistency(grouped_df, message='')` ¶

`get_earliest_time_per_site(mlf_frame)` ¶

`get_z_steps(mlf_frame)` ¶

`parse_yokogawa_metadata(mrf_path, mlf_path, *, include_patterns=None, exclude_patterns=None)` ¶

`read_metadata_files(mrf_path, mlf_path, include_patterns=None, exclude_patterns=None)` ¶

`read_mlf_file(mlf_path, plate_type, include_patterns=None, exclude_patterns=None)` ¶

`read_mrf_file(mrf_path)` ¶

`sanitize_string(value)` ¶