Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dataset.example_media and cache Datacards #84

Merged
merged 14 commits into from
May 8, 2024
248 changes: 145 additions & 103 deletions audbcards/core/datacard.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import audiofile
import audplot

from audbcards.core.config import config
from audbcards.core.dataset import Dataset
from audbcards.core.utils import set_plot_margins

Expand Down Expand Up @@ -41,14 +42,19 @@ class Datacard(object):
a call to :meth:`audbcards.Datacard.player`
will store an example audio file
under
``<sphinx_build_dir>/<path>/<db-name>/<media-file-in-db>``
``<sphinx_build_dir>/<path>/<dataset-name>/``
sphinx_src_dir: source dir of sphinx.
If not ``None``
and ``example`` is ``True``,
a call to :meth:`audbcards.Datacard.player`
will store a wavplot of the example audio file
will store a wavform plot of the example audio file
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
under
``<sphinx_src_dir>/<path>/<db-name>/<db-name>.png``
``<sphinx_src_dir>/<path>/<dataset-name>/``
cache_root: cache folder.
If ``None``,
the environmental variable ``AUDBCARDS_CACHE_ROOT``,
or :attr:`audbcards.config.CACHE_ROOT`
is used

"""

Expand All @@ -60,6 +66,7 @@ def __init__(
example: bool = True,
sphinx_build_dir: str = None,
sphinx_src_dir: str = None,
cache_root: str = None,
):
self.dataset = dataset
"""Dataset object."""
Expand All @@ -76,6 +83,11 @@ def __init__(
self.sphinx_src_dir = sphinx_src_dir
"""Sphinx source dir."""

if cache_root is None:
cache_root = os.environ.get("AUDBCARDS_CACHE_ROOT") or config.CACHE_ROOT
self.cache_root = audeer.mkdir(cache_root)
r"""Cache root folder."""

self.rst_preamble = ""
"""RST code added at top of data card."""

Expand All @@ -84,47 +96,6 @@ def content(self):
"""Property Accessor for rendered jinja2 content."""
return self._render_template()

@property
def example_media(self) -> typing.Optional[str]:
r"""Select example media file.

This select a media file
based on the median duration
of all files
between 0.5 s and 300 s
and downloads it to the cache.

"""
# Pick a meaningful duration for the example audio file
min_dur = 0.5
max_dur = 300 # 5 min
durations = self.dataset.file_durations
selected_durations = [d for d in durations if d >= min_dur and d <= max_dur]
if len(selected_durations) == 0:
return None
selected_duration = np.median(selected_durations)

# Get index for duration closest to selected duration
# see https://stackoverflow.com/a/9706105
# durations.index(selected_duration)
# is an alternative but fails due to rounding errors
index = min(
range(len(durations)),
key=lambda n: abs(durations[n] - selected_duration),
)
# Download of example data might fail
try:
media = self.dataset.deps.media[index]
audb.load_media(
self.dataset.name,
media,
version=self.dataset.version,
verbose=False,
)
except: # noqa: E722
media = None
return media

@property
def file_duration_distribution(self) -> str:
r"""Minimum and maximum of files durations, and plotted distribution.
Expand All @@ -133,17 +104,32 @@ def file_duration_distribution(self) -> str:
containing the mininimum and maximum values
of files durations.

If :attr:`audbcards.Datacard.sphinx_src_dir` is set
If :attr:`audbcards.Datacard.sphinx_src_dir` is not ``None``
(e.g. when used in the sphinx extension),
an inline image is stored
in the sphinx source folder
under ``<dataset-name>/<dataset-name>-file-durations.png``
and displayed
an image is stored in the file
``<dataset-name>-<dataset-version>-file-duration-distribution.png``,
which is cached in
``<cache-root>/<dataset-name>/<dataset-version>/``
and copied to the sphinx source folder
into
``<sphinx-src-dir>/<path><dataset-name>/``.
The image is displayed inline
between the minimum and maximum values.
If all duration values are the same,
no distribution plot is created.

"""
file_name = (
f"{self.dataset.name}-{self.dataset.version}-file-duration-distribution.png"
)
# Cache is organized as `<cache_root>/<name>/<version>/`
cache_file = audeer.path(
self.cache_root,
self.dataset.name,
self.dataset.version,
file_name,
)

min_ = 0
max_ = 0
unit = "s"
Expand All @@ -161,83 +147,143 @@ def file_duration_distribution(self) -> str:

# Save distribution plot
if self.sphinx_src_dir is not None:
self._plot_distribution(durations)
name = "file-durations"
# Plot distribution to cache,
# if not found there already.
if not os.path.exists(cache_file):
audeer.mkdir(os.path.dirname(cache_file))
self._plot_distribution(durations)
plt.savefig(cache_file, transparent=True)
plt.close()

image_file = audeer.path(
self.sphinx_src_dir,
self.path,
self.dataset.name,
f"{self.dataset.name}-{name}.png",
file_name,
)
audeer.mkdir(os.path.dirname(image_file))
plt.savefig(image_file, transparent=True)
plt.close()
shutil.copyfile(cache_file, image_file)
distribution_str = self._inline_image(
f"{min_:.1f} {unit}",
f"./{self.dataset.name}/{self.dataset.name}-{name}.png",
f"./{self.dataset.name}/{file_name}",
f"{max_:.1f} {unit}",
)

return distribution_str

def player(
self,
file: str = None,
) -> str:
def player(self) -> str:
r"""Create an audio player showing the waveform.

Args:
file: input audio file to be used in the player.
If ``None``,
:attr:`audbcards.Datacard.example_media`
is used
If :attr:`audbcards.Datacard.sphinx_build_dir`
or :attr:`audbcards.Datacard.sphinx_src_dir`
is not ``None``,
an example media file is cached in the folder
``<dataset-name>-<dataset-version>-player-media/``
inside
``<cache-root>/<dataset-name>/<dataset-version>/``,
using the same sub-folder structure
as the media file has inside its dataset.
If :attr:`audbcards.Datacard.sphinx_build_dir`
is not ``None``,
the media sub-folder structure
is also copied
to the sphinx build dir into
``<sphinx-build-dir>/<path>/<dataset-name>/``.

If :attr:`audbcards.Datacard.sphinx_src_dir` is not ``None``,
a plot of the waveform of the media file
is cached under
``<dataset-name>-<dataset-version>-player-waveform.png``
inside
``<cache-root>/<dataset-name>/<dataset-version>/``.
It is also copied to the sphinx source folder into
``<sphinx-src-dir>/<path>/<dataset-name>/``.

"""
if file is None:
file = self.example_media
Returns:
String containing RST code to include the player

# use audb cache instead of dataset.cache_root
media_src_dir = (
f"{audb.default_cache_root()}/"
f"{audb.flavor_path(self.dataset.name, self.dataset.version)}"
"""
# Cache is organized as `<cache_root>/<name>/<version>/`
cache_folder = audeer.path(
self.cache_root,
self.dataset.name,
self.dataset.version,
)
cache_example_media = audeer.path(
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
cache_folder,
f"{self.dataset.name}-{self.dataset.version}-player-media",
self.dataset.example_media,
)
cache_waveform_file = audeer.path(
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
cache_folder,
f"{self.dataset.name}-{self.dataset.version}-player-waveform.png",
)

# Move file to build folder
if self.sphinx_build_dir is not None:
media_dst_dir = audeer.path(
self.sphinx_build_dir,
def load_media_to_cache(cache_example_media: str):
r"""Load media file with audb and copy to audbcards cache.

Args:
cache_example_media: full path to media file in cache

"""
media = audb.load_media(
self.dataset.name,
self.dataset.example_media,
version=self.dataset.version,
verbose=False,
)[0]
audeer.mkdir(os.path.dirname(cache_example_media))
shutil.copy(media, cache_example_media)

# Add plot of waveform
if self.sphinx_src_dir is not None:
if not os.path.exists(cache_example_media):
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
load_media_to_cache(cache_example_media)

if not os.path.exists(cache_waveform_file):
signal, sampling_rate = audiofile.read(
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
cache_example_media,
always_2d=True,
)
audeer.mkdir(os.path.dirname(cache_waveform_file))
plt.figure(figsize=[3, 0.5])
ax = plt.subplot(111)
audplot.waveform(signal[0, :], ax=ax)
set_plot_margins()
plt.savefig(cache_waveform_file)
plt.close()

plot_dst_dir = audeer.path(
self.sphinx_src_dir,
self.path,
self.dataset.name,
)
audeer.mkdir(os.path.join(media_dst_dir, os.path.dirname(file)))
audeer.mkdir(plot_dst_dir)
shutil.copy(
os.path.join(media_src_dir, file),
os.path.join(media_dst_dir, file),
cache_waveform_file,
os.path.join(plot_dst_dir, os.path.basename(cache_waveform_file)),
)

# Add plot of waveform
if self.sphinx_src_dir is not None:
signal, sampling_rate = audiofile.read(
os.path.join(media_src_dir, file),
always_2d=True,
)
image_file = audeer.path(
self.sphinx_src_dir,
# Copy media file to build folder
if self.sphinx_build_dir is not None:
if not os.path.exists(cache_example_media):
load_media_to_cache(cache_example_media)

media_dst_dir = audeer.path(
self.sphinx_build_dir,
self.path,
self.dataset.name,
f"{self.dataset.name}.png",
)
audeer.mkdir(os.path.dirname(image_file))
plt.figure(figsize=[3, 0.5])
ax = plt.subplot(111)
audplot.waveform(signal[0, :], ax=ax)
set_plot_margins()
plt.savefig(image_file)
plt.close()

player_src = f"./{self.dataset.name}/{file}"
audeer.mkdir(media_dst_dir, os.path.dirname(self.dataset.example_media))
shutil.copy(
cache_example_media,
os.path.join(media_dst_dir, self.dataset.example_media),
)

waveform_src = f"./{self.dataset.name}/{os.path.basename(cache_waveform_file)}"
player_src = f"./{self.dataset.name}/{self.dataset.example_media}"
player_str = (
f".. image:: ./{self.dataset.name}/{self.dataset.name}.png\n"
f".. image:: {waveform_src}\n"
"\n"
".. raw:: html\n"
"\n"
Expand Down Expand Up @@ -365,14 +411,10 @@ def _expand_dataset(
"""
# Add path of datacard folder
dataset["path"] = self.path
# Add audio player for example file
dataset["example"] = None
if self.example:
example = self.example_media
if example is not None:
player = self.player(example)
if self.dataset.example_media is not None:
player = self.player()
dataset["player"] = player
dataset["example"] = example
dataset["file_duration_distribution"] = self.file_duration_distribution
return dataset

Expand Down
34 changes: 34 additions & 0 deletions audbcards/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import typing

import jinja2
import numpy as np
import pandas as pd

import audb
Expand Down Expand Up @@ -189,6 +190,39 @@ def duration(self) -> pd.Timedelta:
unit="s",
)

@functools.cached_property
def example_media(self) -> typing.Optional[str]:
r"""Example media file.

The media file is selected
by its median duration
from all files in the dataset
with a duration
between 0.5 s and 300 s.
If no media file meets this criterium,
``None`` is returned instead.

"""
# Pick a meaningful duration for the example audio file
min_dur = 0.5
max_dur = 300 # 5 min
durations = self.file_durations
selected_durations = [d for d in durations if d >= min_dur and d <= max_dur]

if len(selected_durations) == 0:
return None

selected_duration = np.median(selected_durations)
# Get index for duration closest to selected duration
# see https://stackoverflow.com/a/9706105
# durations.index(selected_duration)
# is an alternative but fails due to rounding errors
index = min(
range(len(durations)),
key=lambda n: abs(durations[n] - selected_duration),
)
return self.deps.media[index]

@functools.cached_property
def files(self) -> int:
r"""Number of media files in dataset."""
Expand Down
Loading
Loading