diff --git a/src/pyannote/audio/pipelines/utils/diarization.py b/src/pyannote/audio/pipelines/utils/diarization.py
index 2130fb961..288da3196 100644
--- a/src/pyannote/audio/pipelines/utils/diarization.py
+++ b/src/pyannote/audio/pipelines/utils/diarization.py
@@ -215,7 +215,7 @@ def to_annotation(
             min_duration_off=min_duration_off,
         )
 
-        return binarize(discrete_diarization).rename_tracks(generator="string")
+        return binarize(discrete_diarization)
 
     @staticmethod
     def to_diarization(
diff --git a/src/pyannote/audio/utils/signal.py b/src/pyannote/audio/utils/signal.py
index c02bf01a7..c3c78c33e 100644
--- a/src/pyannote/audio/utils/signal.py
+++ b/src/pyannote/audio/utils/signal.py
@@ -269,11 +269,28 @@ def __call__(self, scores: SlidingWindowFeature) -> Annotation:
         frames = scores.sliding_window
         timestamps = [frames[i].middle for i in range(num_frames)]
 
-        # annotation meant to store 'active' regions
+        if self.onset == self.offset:
+            active = self._opt_binarize(scores, timestamps)
+        else:
+            active = self._binarize(scores, timestamps)
+
+        # because of padding, some active regions might be overlapping: merge them.
+        # also: fill same speaker gaps shorter than min_duration_off
+        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
+            active = active.support(collar=self.min_duration_off)
+
+        # remove tracks shorter than min_duration_on
+        if self.min_duration_on > 0:
+            for segment, track in list(active.itertracks()):
+                if segment.duration < self.min_duration_on:
+                    del active[segment, track]
+
+        return active
+
+    def _binarize(self, scores, timestamps):
         active = Annotation()
 
         for k, k_scores in enumerate(scores.data.T):
-
             label = k if scores.labels is None else scores.labels[k]
 
             # initial state
@@ -281,7 +298,6 @@ def __call__(self, scores: SlidingWindowFeature) -> Annotation:
             is_active = k_scores[0] > self.onset
 
             for t, y in zip(timestamps[1:], k_scores[1:]):
-
                 # currently active
                 if is_active:
                     # switching from active to inactive
@@ -303,19 +319,37 @@ def __call__(self, scores: SlidingWindowFeature) -> Annotation:
                 region = Segment(start - self.pad_onset, t + self.pad_offset)
                 active[region, k] = label
 
-        # because of padding, some active regions might be overlapping: merge them.
-        # also: fill same speaker gaps shorter than min_duration_off
-        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
-            active = active.support(collar=self.min_duration_off)
+        return active
 
-        # remove tracks shorter than min_duration_on
-        if self.min_duration_on > 0:
-            for segment, track in list(active.itertracks()):
-                if segment.duration < self.min_duration_on:
-                    del active[segment, track]
+    def _opt_binarize(self, scores, timestamps):
+        active = Annotation()
 
-        return active
+        for k, k_scores in enumerate(scores.data.T):
+            label = k if scores.labels is None else scores.labels[k]
+
+            # Detect transitions
+            is_active = k_scores > self.onset
+            transitions = np.diff(is_active.astype(int))
+            starts = np.where(transitions == 1)[0] + 1
+            ends = np.where(transitions == -1)[0] + 1
+
+            # If the first frame is active, add it as a start
+            if is_active[0]:
+                starts = np.insert(starts, 0, 0)
+
+            # If the last frame is active, add it as an end
+            if is_active[-1]:
+                ends = np.append(ends, len(is_active) - 1)
+
+            # Create segments
+            for start, end in zip(starts, ends):
+                region = Segment(
+                    timestamps[start] - self.pad_onset,
+                    timestamps[end] + self.pad_offset,
+                )
+                active[region, k] = label
+        return active
 
 
 class Peak:
    """Peak detection
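
Not part of the patch itself, but for context: a minimal standalone sketch of the np.diff-based run detection that the new _opt_binarize relies on, using made-up scores and a hypothetical onset of 0.5, to show why runs touching the first or last frame need the explicit insert/append.

import numpy as np

# toy frame-level scores for one class; onset == offset == 0.5 (hypothetical values)
k_scores = np.array([0.1, 0.7, 0.8, 0.2, 0.6, 0.9, 0.9, 0.3])
onset = 0.5

is_active = k_scores > onset                  # [F T T F T T T F]
transitions = np.diff(is_active.astype(int))  # +1 = inactive->active, -1 = active->inactive
starts = np.where(transitions == 1)[0] + 1    # first active frame of each run
ends = np.where(transitions == -1)[0] + 1     # first inactive frame after each run

# a run that starts at frame 0 or is still active at the last frame
# produces no +1/-1 transition, hence the explicit boundary handling
if is_active[0]:
    starts = np.insert(starts, 0, 0)
if is_active[-1]:
    ends = np.append(ends, len(is_active) - 1)

print([(int(s), int(e)) for s, e in zip(starts, ends)])  # [(1, 3), (4, 7)]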