This paper introduced a method for extracting only the segments that contain bird sound from an audio file. Since the paper didn't provide any code, I wrote it myself.

Here is the Python implementation:

import cv2
import torch
import librosa
import soundfile as sf
import numpy as np

from torchlibrosa.stft import LogmelFilterBank, Spectrogram

class CFG:
    n_fft = 2048
    hop_length = 512
    sample_rate = 32000
    n_mels = 64
    fmin = 150
    fmax = 16000  # must not exceed the Nyquist frequency (sample_rate / 2)

class SignalExtractor:
    def __init__(self):
        # STFT spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft, window="hann",
            center=True, pad_mode="reflect", freeze_parameters=True)
        # Log-mel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft,
            n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True)
        # Median-threshold factors, tried from strictest to loosest
        self.factors = [2.0, 1.8, 1.6, 1.4, 1.2, 1.1]
        self.kernel_size = 15    # dilation kernel size
        self.sn_threshold = 0.2  # minimum fraction of the audio that must be kept

    def extract(self, audio):
        x = torch.from_numpy(audio)
        x = x[None, :].float()  # add a batch dimension: (1, samples)

        x = self.spectrogram_extractor(x)  # (1, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)       # (1, 1, time_steps, mel_bins)

        x = x.squeeze(0).squeeze(0)
        x = x.permute(1, 0).numpy()  # (mel_bins, time_steps)
        x = x - np.amin(x)           # shift values to be non-negative

        # Try the strictest factor first and fall back to looser ones
        # until enough of the audio survives the thresholding.
        for factor in self.factors:
            sound, sn_ratio = self._factor_extract(audio, x, factor)
            if sn_ratio >= self.sn_threshold:
                break

        return sound, sn_ratio

    def _factor_extract(self, audio, x, factor: float):
        # A spectrogram cell counts as signal when it exceeds `factor` times
        # both its row (frequency) median and its column (time) median.
        rows, cols = x.shape
        row_median = np.median(x, axis=1)
        row_median_matrix = np.tile(row_median, (cols, 1)).T * factor
        col_median = np.median(x, axis=0)
        col_median_matrix = np.tile(col_median, (rows, 1)) * factor

        mask = np.logical_and(x > row_median_matrix, x > col_median_matrix).astype(np.uint8)

        # Dilate the binary mask so isolated signal cells grow into contiguous regions.
        kernel = np.ones((self.kernel_size, self.kernel_size), np.uint8)
        img = cv2.dilate(mask, kernel, iterations=1)

        # Keep every time frame in which at least one frequency bin is marked
        # as signal, and map those frames back to audio samples.
        indicator = np.sum(img, axis=0)
        chunk_size = audio.shape[0] // indicator.shape[0]
        sounds = []
        for index, chunk in enumerate(indicator):
            if chunk > 0:
                sounds.append(audio[index*chunk_size:(index+1)*chunk_size])
        if not sounds:
            return None, 0.0
        sound = np.concatenate(sounds)
        return sound, sound.shape[0] / audio.shape[0]
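
To try the extractor end to end, here is a minimal usage sketch; the file names sample.ogg and extracted.wav are placeholders, and the input is assumed to be a mono recording:

# Minimal usage sketch; sample.ogg and extracted.wav are placeholder file names.
audio, _ = librosa.load("sample.ogg", sr=CFG.sample_rate, mono=True)

extractor = SignalExtractor()
sound, sn_ratio = extractor.extract(audio)

if sound is not None:
    print(f"kept {sn_ratio:.1%} of the original audio")
    sf.write("extracted.wav", sound, CFG.sample_rate)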

The implementation has some differences from the method in the paper:

  1. I didn’t use erosion, since dilation alone is good enough for picking up the bird-sound segments (a sketch of how erosion could be added back follows this list).
  2. A threshold of three times the median is too strict for most audio files, so I use an array of factors instead. When a factor of 2.0 keeps less than 20% of the audio (the sn_threshold), the code automatically retries with 1.8, and so on.
  3. I used a big (15, 15) kernel for dilation since it works well on my samples.
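
For completeness, the erosion step from the paper could be added back inside _factor_extract, right before the dilation. This is only a sketch, and the (4, 4) erosion kernel size is my assumption rather than a value taken from the paper:

# Erode first to drop isolated cells, then dilate as before.
# The (4, 4) erosion kernel size is an assumption, not from the paper.
erode_kernel = np.ones((4, 4), np.uint8)
mask = cv2.erode(mask, erode_kernel, iterations=1)
img = cv2.dilate(mask, kernel, iterations=1)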

The original sample:

After extracting only the bird sounds: