Hi and thank you for this amazing project! I was trying to create a

Hi @auspicious3000

<a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="/us

Inference with new input audio about autopst HOT 4 OPEN

auspicious3000 commented on June 19, 2024

Inference with new input audio

from autopst.

Comments (4)

auspicious3000 commented on June 19, 2024

By conditioning on speaker embedding, it changes the rhythm and timbre at the same time.

from autopst.

shoegazerstella commented on June 19, 2024

Hi @auspicious3000
I am computing the spectrogram like this:

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Filter order handed to ``signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair produced by ``signal.butter``.
    """
    nyquist = 0.5 * fs
    # The cutoff is passed to signal.butter as a fraction of the Nyquist rate.
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    
    
def pySTFT(x, fft_length=1024, hop_length=256):
    """Return the magnitude short-time Fourier transform of ``x``.

    The input is reflect-padded by half a frame on each side, sliced into
    overlapping frames, multiplied by a Hann window and passed through a
    real FFT.

    Args:
        x: Input samples.
            NOTE(review): presumably a 1-D audio signal -- confirm with callers.
        fft_length: Frame length and FFT size, in samples.
        hop_length: Step between the starts of consecutive frames, in samples.

    Returns:
        Absolute values of the FFT output, transposed so that the frame
        axis comes last.
    """
    half_frame = int(fft_length // 2)
    padded = np.pad(x, half_frame, mode='reflect')

    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]

    # Build the overlapping frames as a strided view of the padded signal:
    # each frame starts hop_length samples after the previous one.
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    hann = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(hann * frames, n=fft_length)

    return np.abs(spectrum.T)
    
    
# Sampling rate shared by the mel filterbank, the high-pass filter design and
# the audio loader below; all three must agree.
SAMPLE_RATE = 16000

# Keyword arguments are used because newer librosa releases no longer accept
# sr / n_fft positionally; keywords work on older releases as well.
mel_basis = mel(sr=SAMPLE_RATE, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB.
min_level = np.exp(-100 / 20 * np.log(10))
b, a = butter_highpass(30, SAMPLE_RATE, order=5)

AUDIO_DIR = "/content/drive/MyDrive/CODE/VoiceAE/audio/"
filename = os.path.join(AUDIO_DIR, "test.wav")

# Read audio file (first 4 seconds). The rate is given explicitly because
# librosa.load otherwise resamples to its default of 22050 Hz, which would not
# match the 16 kHz mel filterbank built above.
x, fs = librosa.load(filename, sr=SAMPLE_RATE, duration=4)

# Remove drifting noise (currently disabled; the input is used unfiltered).
# To enable: y = signal.filtfilt(b, a, x)
y = x

# Add a little random noise for model robustness (currently disabled).
# To enable: wav = y * 0.96 + (prng.rand(y.shape[0])-0.5)*1e-06
wav = y

# Compute spect; transposed so that mel_basis can be applied with np.dot
D = pySTFT(wav).T

# Convert to mel and normalize: dB with a -100 dB floor, shifted by -16 dB,
# then mapped from [-100, 0] dB to [0, 1] and clipped.
D_mel = np.dot(D, mel_basis)
D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
S = np.clip((D_db + 100) / 100, 0, 1)

# Save spect next to the audio file; np.save appends the ".npy" extension.
# filename already contains AUDIO_DIR, so only its extension is stripped.
np.save(os.path.splitext(filename)[0], S.astype(np.float32), allow_pickle=False)

Then I do the inference like this: