-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmatch.py
129 lines (101 loc) · 4.74 KB
/
match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from difflib import SequenceMatcher
import pandas as pd
import Levenshtein
import re
import json
# 1. all Sing! partner artists
OUT_FILE_MSD = '/Users/georgid/Dropbox (Smule-EUDC)/arrs-pa_seeds_english_to_MSD'
OUT_FILE_HMX = '/Users/georgid/Dropbox (Smule-EUDC)/arrs-pa_seeds_to_Harmonix.json'
# df_pa_arrs = pd.read_csv('/Users/georgid/Dropbox (Smule-EUDC)/arrs-pa_seeds_english.csv')
df_pa_arrs = pd.read_csv('/Users/georgid/Dropbox (Smule-EUDC)/arrs-pa_seeds.csv')
# or 2. all Sing! in-house
OUT_FILE_HMX = '/Users/georgid/Dropbox (Smule-EUDC)/arrs-in_house_to_Harmonix.json'
df_pa_arrs = pd.read_csv('/Users/georgid/Dropbox (Smule-EUDC)/arrs-in_house.csv')
arr_key_to_external = {}
def match_arr_key_to_MSD(entry, arr_key):
'''
for entry song check by fuzzy matching on name+ artst if in MSD list
'''
print('working on song: {} - {}'.format(entry[0], entry[1]))
with open("unique_tracks.txt","r") as f_msd:
line_no = 0
for line in f_msd.readlines():
line_no+=1
if line != "\n":
line_norm = line.replace("\\","").strip().upper().split("<SEP>")
msdArtist = line_norm[2]
msdTitle = line_norm[3]
msd_id = line_norm[0]
if match_2(msdArtist, entry[0], msdTitle, entry[1]):
print(str(line_no) + ": " + line.strip() + " for entry " + entry[0] + ";" + entry[1])
if arr_key not in arr_key_to_external:
arr_key_to_external[arr_key] = []
arr_key_to_external[arr_key].append(msd_id)
return arr_key_to_external
def match_arr_key_to_HMX(entry, arr_key):
'''
for entry song check by fuzzy matching on name+ artst if in Harmonix list
entry is arrangement
'''
print('working on song: {} - {}'.format(entry[0], entry[1]))
df = pd.read_csv('/Users/georgid/Downloads/metadata.csv') # from https://github.com/urinieto/harmonixset/blob/master/dataset/metadata.csv
for idx in df.index:
hmx_title = df['Title'][idx]
hmx_artist = df['Artist'][idx]
if match_2(hmx_artist, entry[0], hmx_title, entry[1]):
print( "Harmonix: {} - {} Vs sing! : + {} - {} ".format(hmx_artist, hmx_title, entry[0], entry[1] ) )
a = raw_input() # validate manually if it is a match
if a == 'no':
continue
if arr_key not in arr_key_to_external:
arr_key_to_external[arr_key] = []
arr_key_to_external[arr_key].append(df['File'][idx])
return arr_key_to_external
def match_1(artist,artist1, title, title1):
'''
based on sequence matcher
'''
s0 = SequenceMatcher(None,artist,artist1)
s1 = SequenceMatcher(None,title,title1)
return s0.ratio() > 0.27 and s1.ratio() > 0.27
def match_2(artist,artist1, title, title1):
'''
based on Levenshtein distance
'''
return match_Levenshtein(artist, artist1) and match_Levenshtein(title, title1)
def match_Levenshtein(a, b):
'''
use Levenshtein distance. Adapted from "match" in https://github.com/MTG/pycompmusic/blob/master/tools/sertanscores.py#L146
'''
if isinstance(a, unicode):
a = unidecode.unidecode(a)
if isinstance(b, unicode):
b = unidecode.unidecode(b)
a = a.lower()
b = b.lower()
# lowercase and remove all non-letters (spaces, ', -, etc)
a = re.sub(r"[^a-z]", "", a)
b = re.sub(r"[^a-z]", "", b)
# if a match, return
if a == b:
return True
# otherwise, do edit distance.
# an edit of <= 3 at the end of a string counts as a match
if len(a) > len(b):
a, b = b, a
# now, a is the shorter one. If a starts in b at position 0
# then we have the overlap at the end
# if b.startswith(a):
# return True
# Otherwise, chop both strings to the length of the shortest
# one, and check lev distance between them for <= 3
# b = b[:len(a)]
return Levenshtein.distance(a, b) <= 3
if __name__ == '__main__':
print(df_pa_arrs.columns)
df_pa_arrs = df_pa_arrs.dropna(axis = 0, how ='any') # drop entries with any missing column
for index, row in df_pa_arrs.iterrows():
# arr_key_matched = match_arr_key_to_MSD( [row['artist'], row['titl']], row['arr_key'])
arr_key_matched = match_arr_key_to_HMX( [row['artist'], row['titl']], row['arr_key'])
json.dump(arr_key_matched, open(OUT_FILE_HMX,"w"))
print('written file {}'.format(OUT_FILE_HMX))