UOMOP

[Librosa Tempo Problem] 본문

Project/Music Plagiarism Project

[Librosa Tempo Problem]

Happy PinGu 2022. 11. 5. 14:41
librosa에서 제공하는 beat_track 등을 이용해서 tempo(bpm)을 구하면 정확성이 많이 떨어진다.
직접 음원의 Feature들을 추출해보고 tempo(bpm)에 영향을 많이 주는 Feature를 찾아보자.
# ========================= 라이브러리 호출 =========================

import numpy as np
import pandas as pd
import librosa
import joblib 
import matplotlib.pyplot as plt
import IPython.display as ipd
import seaborn as sns
from sklearn import metrics

from xgboost import plot_importance
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
import librosa.feature
import librosa.display
def get_features(y, sr):
    """Extract summary audio features from a waveform as a one-row DataFrame.

    Parameters
    ----------
    y : np.ndarray
        Mono audio time series.
    sr : int
        Sampling rate of ``y``.

    Returns
    -------
    pd.DataFrame
        A single row with 57 columns: mean/variance of chroma, RMS,
        spectral centroid, bandwidth, rolloff, zero-crossing rate,
        harmonic and percussive components, the estimated tempo, and
        mean/variance of 20 MFCCs.
    """
    chroma_stft = librosa.feature.chroma_stft(y, n_fft=2048, hop_length=512)  # chromagram
    rmse = librosa.feature.rms(y, frame_length=512, hop_length=512)           # RMS energy
    spectral_centroids = librosa.feature.spectral_centroid(y, sr=sr)          # spectral centroid
    spec_bw = librosa.feature.spectral_bandwidth(y, sr=sr)                    # spectral bandwidth
    spectral_rolloff = librosa.feature.spectral_rolloff(y, sr=sr)[0]          # rolloff frequency
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=512)               # zero-crossing rate
    y_harm, y_perc = librosa.effects.hpss(y)                                  # harmonic / percussive split
    tempo, _ = librosa.beat.beat_track(y, sr=sr)                              # estimated tempo (bpm)
    mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20)                          # 20 MFCC coefficients

    # Interleave each coefficient's mean/var so the values line up with the
    # mfccN_mean / mfccN_var column names below.  (Previously all 20 means
    # were appended, then all 20 variances, which mislabeled every MFCC
    # column after mfcc1_mean.)
    mfcc_stats = np.ravel(np.column_stack([np.mean(mfcc, axis=1),
                                           np.var(mfcc, axis=1)]))

    features_extracted = np.hstack([
        np.mean(chroma_stft), np.var(chroma_stft),
        np.mean(rmse), np.var(rmse),
        np.mean(spectral_centroids), np.var(spectral_centroids),
        np.mean(spec_bw), np.var(spec_bw),
        np.mean(spectral_rolloff), np.var(spectral_rolloff),
        np.mean(zcr), np.var(zcr),
        np.mean(y_harm), np.var(y_harm),
        np.mean(y_perc), np.var(y_perc),
        tempo,
        mfcc_stats,
    ])

    col_names = ['chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
                 'spectral_centroid_mean', 'spectral_centroid_var',
                 'spectral_bandwidth_mean', 'spectral_bandwidth_var',
                 'rolloff_mean', 'rolloff_var',
                 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
                 'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var',
                 'tempo']
    for i in range(1, 21):
        col_names += [f'mfcc{i}_mean', f'mfcc{i}_var']

    # reshape(1, -1): one row per song, one column per feature.
    # NOTE(review): the old code also built a copy with four mfcc*_var
    # columns dropped, but never used it — that dead code is removed.
    return pd.DataFrame(features_extracted.reshape(1, -1), columns=col_names)
pd.set_option('display.max_columns', None)  # show every DataFrame column when printing

# Load each clip "<song>-<take>.wav" (19 songs, 2 takes each) at 22050 Hz
# and extract its one-row feature DataFrame.  Replaces 76 near-identical
# hand-written load/extract statements with a single loop.
SAMPLE_RATE = 22050
feature_rows = []
for song_no in range(1, 20):
    for take_no in (1, 2):
        y, sr = librosa.load(f"{song_no}-{take_no}.wav", sr=SAMPLE_RATE)
        feature_rows.append(get_features(y, sr=sr))

# Stack all 38 single-row frames; rows keep the load order 1-1, 1-2, 2-1, ...
data = pd.concat(feature_rows)


# Drop every MFCC column (mean and variance for mfcc1 .. mfcc20);
# only the non-MFCC summary features are kept for the tempo analysis.
mfcc_columns = [f'mfcc{i}_{stat}' for i in range(1, 21) for stat in ('mean', 'var')]
data = data.drop(mfcc_columns, axis=1, inplace=False)
mfcc Feature는 모두 drop
# Renumber rows 0..37, then relabel them "<song>-<take>" to match the
# source wav files.  reset_index(drop=True) replaces the old two-step
# reset_index() + drop('index') dance.
data = data.reset_index(drop=True)
row_labels = {row: f"{row // 2 + 1}-{row % 2 + 1}" for row in range(38)}
data.rename(index=row_labels, inplace=True)

# Keep only the three features that track tempo most strongly
# (see analysis below); everything else is dropped.
data = data.drop(["chroma_stft_var", "rms_mean", "rms_var", "spectral_centroid_var", "spectral_bandwidth_var",
                  "rolloff_mean", "rolloff_var", "tempo", "harmony_mean", "harmony_var", "zero_crossing_rate_mean",
                  "spectral_bandwidth_mean", "perceptr_mean", "chroma_stft_mean"], axis=1, inplace=False)

data.head(40)

from sklearn.preprocessing import MinMaxScaler

# Min-max scale the three remaining features to [0, 1] so their
# magnitudes are comparable.  The scaler was previously named 'ss',
# which misleadingly suggested StandardScaler.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(np.array(data, dtype=float))
data_df = pd.DataFrame(scaled, columns=['spectral_centroid_mean', 'zero_crossing_rate_var', 'perceptr_var'])

data_df.head(40)

perceptr_var, zero_crossing_rate_var, spectral_centroid_mean 가 tempo의 영향을 아주 많이 받는다.
def _tempo_proxy_score(song, sr):
    """Score one waveform by the three features most correlated with tempo.

    Each feature (percussive-component variance, zero-crossing-rate
    variance, spectral-centroid mean) is min-max scaled with bounds
    measured on 10,000 reference tracks, then the scaled values are summed.
    """
    _, perc = librosa.effects.hpss(song)                                  # percussive component
    zcr = librosa.feature.zero_crossing_rate(song, hop_length=512)
    spectral_centroid = librosa.feature.spectral_centroid(song, sr=sr)

    # Min-max scaling with the empirical (min, max) bounds per feature.
    perceptr_var_scaled = ((np.var(perc) - 4.67 * (10**-8)) / (0.058879 - 4.67 * (10**-8)))
    zcr_var_scaled = ((np.var(zcr) - 5.02 * (10**-6)) / (0.065185 - 5.02 * (10**-6)))
    centroid_scaled = ((np.mean(spectral_centroid) - 300) / (5432.534 - 300))

    return np.mean(perceptr_var_scaled + zcr_var_scaled + centroid_scaled)


def speed_checker(song1, song2, sr):
    """Return a tempo-proxy score for each of two waveforms.

    Parameters
    ----------
    song1, song2 : np.ndarray
        Mono audio time series to compare.
    sr : int
        Sampling rate shared by both waveforms.

    Returns
    -------
    tuple of float
        (score for song1, score for song2); a higher score suggests a
        faster-sounding track.
    """
    # The original duplicated the whole pipeline per song; the shared
    # logic now lives in _tempo_proxy_score.
    return _tempo_proxy_score(song1, sr), _tempo_proxy_score(song2, sr)
Scaling의 Param.은 10000개의 음원데이터의  Max, Min 값을 이용.

표절 논란이 있는 힙합곡을 speed_checker로 확인

표절 논란이 있는 매우 느린 클래식 피아노곡을 speed_checker로 확인

 

 

 

기존에 사용하던 tempo_checker에 비해 성능이 좋은 것으로 확인

 

 

 

 

 

Comments