기업명 표준화(클리닝)

Archive/통계&코딩이것저것 2021. 9. 6. 15:32

https://www.analyticsinsight.net/company-names-standardization-using-a-fuzzy-nlp-approach/

Company Name Standardization using a Fuzzy NLP Approach

In this study, we showcase a two-tier automated methodology for Company Name Standardization achieved by using NLP and Fuzzy Logic-based techniques. This reduces the effort required to less than 15% of that when done entirely manually.

www.analyticsinsight.net

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html

sklearn.cluster.AffinityPropagation — scikit-learn 0.24.2 documentation

scikit-learn.org

import pandas as pd
from tqdm import tqdm
import os.path
import collections
from operator import itemgetter
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from fuzzywuzzy import fuzz
from sklearn import cluster
from difflib import SequenceMatcher
from collections import Counter

# Load the company-name list and keep it sorted by the `cname` column.
df_prob = pd.read_csv("../이슈제거_cname_lists.csv").sort_values('cname')

# Frequently occurring terms (legal-entity suffixes and generic business
# words) that are treated as stop words and removed from company names.
vendor_stopwords = [
    # generic company / corporation markers
    'biz', 'bv', 'co', 'comp', 'company', 'corp', 'corporation', 'dba',
    # "incorporated" variants
    'inc', 'incorp', 'incorporat', 'incorporate', 'incorporated',
    'incorporation',
    # "international" variants
    'international', 'intl', 'intnl',
    # liability / ownership forms
    'limited', 'llc', 'ltd', 'llp',
    'machines', 'pvt', 'pte', 'private', 'unknown',
    # generic business words
    'group', 'capital', 'technologies', 'solutions',
]

def filter_ascii(text):
    """Return *text* reduced to its ASCII characters.

    The string is NFKD-normalized first, so accented letters are split into
    a base letter plus combining marks; every character that cannot be
    encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text, remove_digits=False):
    """Strip special characters (and optionally digits) from *text*.

    Args:
        text: The string to clean.
        remove_digits: When True, digits are removed as well; when False
            (default) only characters that are not ASCII letters, digits
            or whitespace are removed.

    Returns:
        The cleaned string. Whitespace is left untouched.
    """
    # The range must be `A-Z`: `A-z` also spans the ASCII characters
    # between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def clean_stopwords(text, eng=False):
    """Remove stop words from *text*.

    Every entry of ``vendor_stopwords`` is deleted wherever it appears as a
    whole word. Unless ``eng`` compares equal to False, the entries of
    ``ENGLISH_STOP_WORDS`` (imported from scikit-learn) are removed too.
    Only the words themselves are deleted; surrounding whitespace stays.
    """
    # The `== False` comparison is kept on purpose so that the set of
    # `eng` values selecting the vendor-only list is unchanged.
    stop_words = (
        vendor_stopwords
        if eng == False
        else vendor_stopwords + list(ENGLISH_STOP_WORDS)
    )
    for word in stop_words:
        text = re.sub(rf'\b{word}\b', '', text)
    return text

def clean_spaces(text):
    """Collapse runs of spaces to a single space and trim both ends.

    A single ``str.replace('  ', ' ')`` pass only halves each run, so three
    or more adjacent spaces (typical after stop words have been deleted
    from the middle of a name) used to leave a double space behind. A
    regex collapses a run of any length in one pass.

    Args:
        text: The string to tidy.

    Returns:
        *text* with every run of two or more spaces replaced by one space
        and leading/trailing whitespace stripped.
    """
    return re.sub(r' {2,}', ' ', text).strip()

def preprocess_text(column, remove_digits=True, lemm=True, eng=False):
    """Clean every string in *column* and return the results as a list.

    Pass a DataFrame column (e.g. ``df['Column Name']``). Each value goes
    through, in order: ASCII filtering, special-character (and optionally
    digit) removal, lower-casing, stop-word removal and space clean-up.
    When ``lemm`` is True the cleaned strings are then passed through
    nltk's WordNet lemmatizer.

    If any step raises, the exception is printed and None is returned.
    """
    try:
        cleaned = []
        for raw in column:
            text = filter_ascii(raw)
            text = remove_special_characters(text, remove_digits)
            text = clean_stopwords(text.lower(), eng)
            cleaned.append(clean_spaces(text))

        if lemm == True:
            # NOTE(review): lemmatize() receives each whole cleaned string,
            # not individual tokens - presumably multi-word names therefore
            # pass through unchanged; confirm this is the intended behavior.
            lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
            cleaned = [lemmatizer.lemmatize(text) for text in cleaned]
        return cleaned
    except Exception as e:
        print(e)
        return None

#df_prob.to_csv("C:/Users/윤유진/OneDrive - 데이터마케팅코리아 (Datamarketingkorea)/바탕 화면/삼성전기/210901/test_preprocessing.csv",mode='w',encoding='utf-8-sig')
# Importing the necessary libraries

def fuzz_similarity(cust_names):
    """Build a symmetric pairwise similarity matrix for *cust_names*.

    For each pair of names the score is the harmonic mean
    ``2 * s1 * s2 / (s1 + s2)`` of ``fuzz.token_set_ratio`` and
    ``fuzz.partial_ratio``. The diagonal is 100.

    Returns:
        A ``len(cust_names) x len(cust_names)`` numpy array.
    """
    n_names = len(cust_names)
    # Tiny offset added to both scores so the harmonic-mean denominator
    # can never be zero.
    eps = 0.0000001

    # Start from a matrix filled with 100.
    similarity_array = np.ones((n_names, n_names)) * 100

    # Score every pair once, filling the lower triangle.
    for row in tqdm(range(1, n_names)):
        for col in range(row):
            token_score = fuzz.token_set_ratio(cust_names[row], cust_names[col]) + eps
            partial_score = fuzz.partial_ratio(cust_names[row], cust_names[col]) + eps
            similarity_array[row, col] = 2 * token_score * partial_score / (token_score + partial_score)

    # Mirror the lower triangle into the upper one; the values are the same.
    for row in tqdm(range(n_names)):
        similarity_array[row, row + 1:] = similarity_array[row + 1:, row]

    # Identical strings: the diagonal is set to 100.
    np.fill_diagonal(similarity_array, 100)
    return similarity_array


def company_clusters(data, nameCol='MiddleLevelCustomerName', dropForeign=True):
    """Cluster the names in ``data['preprocessing']`` by fuzzy similarity.

    The pairwise similarity matrix from ``fuzz_similarity`` is fed to
    scikit-learn's AffinityPropagation as a precomputed affinity. Unlike
    k-means, the number of clusters is not passed in here; it comes out of
    the fit.

    ``nameCol`` and ``dropForeign`` are accepted but not used by this
    function.

    Returns:
        A DataFrame with columns ``data`` (the clustered name) and
        ``cluster`` (its label), followed by the columns of
        ``data.reset_index()``.
    """
    cust_ids = data['preprocessing'].to_list()
    similarity_array = fuzz_similarity(cust_ids)

    model = cluster.AffinityPropagation(
        damping=0.9,
        affinity='precomputed',
        max_iter=1000,
        random_state=100,
    )
    labels = model.fit_predict(similarity_array)

    df_clusters = pd.DataFrame(list(zip(cust_ids, labels)), columns=['data', 'cluster'])
    return pd.concat([df_clusters, data.reset_index()], axis=1)

def standard_name(df_eval):
    """Assign a standardized company name to every cluster.

    Run after cluster labels have been assigned. For a cluster with several
    names, the longest common substring of every pair of names is collected
    (stripped of surrounding whitespace) and the most frequent one becomes
    the standard name; ties are joined with ``;``. A cluster with a single
    name keeps that name.

    Clusters whose standard names are equal once spaces are removed are then
    unified: their ``StandardName`` is set to the space-less form.

    Args:
        df_eval: DataFrame with a ``cluster`` column (cluster label) and a
            ``preprocessing`` column (cleaned company name).

    Returns:
        A new DataFrame: *df_eval* plus a ``StandardName`` column.
    """
    d_standard_name = {}
    # `cluster_id` rather than `cluster`, to avoid shadowing the imported
    # sklearn `cluster` module.
    for cluster_id in df_eval['cluster'].unique():
        names = df_eval[df_eval['cluster'] == cluster_id].preprocessing.to_list()
        if len(names) <= 1:
            # Only one name in the cluster: use it as-is.
            d_standard_name[cluster_id] = names[0]
            continue

        # Count the longest common substring of every pair of names.
        common = Counter()
        for i, left in enumerate(names):
            for right in names[i + 1:]:
                match = SequenceMatcher(None, left, right).find_longest_match(
                    0, len(left), 0, len(right))
                if match.size != 0:
                    common[left[match.a:match.a + match.size].strip()] += 1

        # Most frequent substring(s); an empty counter yields an empty name.
        top = max(common.values(), default=0)
        modes = [substring for substring, count in common.items() if count == top]
        d_standard_name[cluster_id] = ";".join(modes)

    df_standard_names = pd.DataFrame(
        list(d_standard_name.items()), columns=['cluster', 'StandardName'])
    df_eval = df_eval.merge(df_standard_names, on='cluster', how='left')

    # Unify clusters whose names differ only by spaces. The previous code
    # used `==` here, a comparison whose result was discarded, so nothing
    # was ever assigned.
    compact = df_eval['StandardName'].str.replace(" ", "", regex=False)
    clusters_per_name = df_eval.groupby(compact)['cluster'].transform('nunique')
    shared = clusters_per_name > 1
    df_eval.loc[shared, 'StandardName'] = compact[shared]

    return df_eval

'Archive > 통계&코딩이것저것' 카테고리의 다른 글

Ordinal Regression (0)	2021.10.13
Neural Collaborative Filtering 논문 리뷰 (0)	2021.09.14
n-gram을 이용해서 철자를 교정해보자 (0)	2021.09.02
잠재요인 협업필터링 추천시스템 구축 (0)	2021.08.23
cnn, rnn 차이 설명 (0)	2021.08.23

ABOUT ME

항상 엔진을 켜둘게🚀

'Archive > 통계&코딩이것저것' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'Archive > 통계&코딩이것저것' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바