Log In
|
Sign Up
Search
ngram+jaccard를 이용한 제품 코드+설명 유사도 비교
Writer
오창민
Description
# 적용 가능 버전: 5.1 이상

```
import xls
import metrics

// Jaccard similarity of two term sets: |x ∩ y| / |x ∪ y|.
// Returns 0.0 when both sets are empty, so the division never sees a zero
// denominator.
// NOTE(review): the element type <string> was missing from the published
// listing and is restored here from how the sets are built — confirm.
double jaccard_sim(set<string> x, set<string> y) {
    int union_size = union(x, y).length();
    if (union_size == 0) return 0.0;
    return double(intersection(x, y).length())/union_size;
}

// Builds the set of terms for one text value.
// Runs kwd() on v, splits its trimmed output on newlines and keeps the first
// whitespace-separated token of every line.
// NOTE(review): kwd() is an engine built-in; presumably it emits one
// keyword/n-gram per line followed by extra columns — confirm the meaning of
// the 'Universal' and '>1w+t-' arguments against the engine manual.
set<string> kwdset(string v) {
    list<string> val;
    string line;
    for line in kwd(v, 'Universal', 'utf8', '>1w+t-').trim().split('\n') {
        // A blank line has no first token to take; skip it.
        if (line.trim() == '') continue;
        val.append(line.split()[0]);
    }
    return set<string>(val);
}

// Measures how often the text in the leading columns of a row is most similar
// (by Jaccard similarity of kwdset() terms) to the text in column 16 of the
// SAME row.
//
//   infile         workbook to read
//   sheet_name     sheet inside the workbook
//   start_line_no  rows before this index are dropped (data[start_line_no:])
//
// Rows whose column 0 or column 16 is empty are ignored. For every remaining
// row n the three most similar column-16 entries are tracked; the program
// prints one OK/FAIL line per row (top-1) and then top-1, top-2 and top-3
// accuracy.
//
// Note: runners-up used to be recorded only when a new best match appeared,
// which under-counted top-2/top-3 hits. They are now ranked independently, so
// accuracy(top2)/accuracy(top3) may come out higher than in the sample run
// shown below; the per-row lines and top-1 accuracy are unaffected.
void main(string infile='bm.xlsx', string sheet_name='BM(2)', int start_line_no=9) {
    list<list<string>> data = xls::read_sheet(infile, sheet_name)[start_line_no:];
    list<string> d;
    list<set<string>> X, Y;

    for d in data {
        if (d[0]=='' || d[16]=='') continue;
        // d[0:2] — presumably the product code and name columns (TODO confirm).
        X.append(kwdset(' '.join(d[0:2])));
        Y.append(kwdset(d[16]));
    }

    set<string> x, y;
    int i = 0;
    int n = 0;
    list<string> y_pred, y_true;
    int ok_top2 = 0, ok_top3 = 0;

    for x in X {
        // Best three similarities seen so far and the Y indexes they belong to.
        // Invariant: d_max >= d_2nd_max >= d_3rd_max.
        double d_max = 0.0;
        double d_2nd_max = 0.0;
        double d_3rd_max = 0.0;
        int idx_max = -1;
        int idx_2nd_max = -1;
        int idx_3rd_max = -1;

        i = 0;
        for y in Y {
            double z = jaccard_sim(x, y);

            // Slot the candidate takes: 0 = best, 1 = second, 2 = third,
            // 3 = not in the top three. `>=` keeps the original tie rule
            // (a later row wins a tie), so top-1 results do not change.
            int slot = 3;
            if (z >= d_3rd_max) slot = 2;
            if (z >= d_2nd_max) slot = 1;
            if (z >= d_max) slot = 0;

            if (slot == 0) {
                d_3rd_max = d_2nd_max;
                idx_3rd_max = idx_2nd_max;
                d_2nd_max = d_max;
                idx_2nd_max = idx_max;
                d_max = z;
                idx_max = i;
            }
            if (slot == 1) {
                d_3rd_max = d_2nd_max;
                idx_3rd_max = idx_2nd_max;
                d_2nd_max = z;
                idx_2nd_max = i;
            }
            if (slot == 2) {
                d_3rd_max = z;
                idx_3rd_max = i;
            }
            i++;
        }

        printz "for %03d max sim idx=%03d (%s)".format(n, idx_max, n==idx_max?"OK":"FAIL");
        y_true.append(string(n));
        y_pred.append(string(idx_max));
        if (n==idx_max || n==idx_2nd_max) ok_top2++;
        if (n in [idx_max, idx_2nd_max, idx_3rd_max]) ok_top3++;
        n++;
    }

    printz "---------------------------------------------------";
    printz "accuracy:%lf".format(metrics::accuracy(y_true, y_pred));
    printz "accuracy(top2):%lf".format(double(ok_top2)/n);
    printz "accuracy(top3):%lf".format(double(ok_top3)/n);
}
```

```
$ ./kql.new -f kql.rc -x "run p.k;"
for 000 max sim idx=000 (OK)
for 001 max sim idx=001 (OK)
for 002 max sim idx=002 (OK)
for 003 max sim idx=003 (OK)
...
for 101 max sim idx=101 (OK)
for 102 max sim idx=101 (FAIL)
for 103 max sim idx=103 (OK)
---------------------------------------------------
accuracy:0.798077
accuracy(top2):0.855769
accuracy(top3):0.875000
```
Tag
jaccard n-gram
Module Name
Attachments
Comments
Save
Save
Cancel
Save
Cancel
Warning
Login