ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • R program(2)-text 전처리
    Coding/R program 2020. 12. 20. 19:27

    # Install only the required packages that are not yet available, then
    # attach them. An unconditional install.packages() call would download and
    # reinstall every package each time the script is run.
    for (pkg in c("tm", "stringr", "wordcloud", "SnowballC")) {
      if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
      }
    }

    library(tm)
    library(stringr)
    library(wordcloud)
    library(SnowballC)

     

     

    # Load the CSV file and build a corpus from its ABSTRACT column.
    #
    # The data directory is kept in a variable and joined with file.path()
    # instead of calling setwd(), so the script does not change the working
    # directory of the R session.
    data.dir <- "D:/R_Works"
    file <- read.csv(
      file.path(data.dir, "wos_english_120.csv"),
      stringsAsFactors = FALSE  # keep the text as character on R < 4.0 too
    )

    # `$` returns NULL for a missing column (and partial-matches names), which
    # would silently produce an empty corpus; fail early instead.
    if (!"ABSTRACT" %in% names(file)) {
      stop("Column 'ABSTRACT' not found in wos_english_120.csv", call. = FALSE)
    }
    abstract <- file[["ABSTRACT"]]

    # VCorpus = volatile corpus, built from the character vector of abstracts.
    corpus <- VCorpus(VectorSource(abstract))

     

    # Inspect the second document of the corpus.
    content(corpus[[2]])  # text of document 2
    meta(corpus[[2]])     # metadata of document 2

    # Optional: attach an extra metadata field to a document, then print the
    # document's metadata again to see the new field.
    meta(corpus[[2]], tag = "author") <- "Kim. won. pyo"
    meta(corpus[[2]])

     

    ###############################################################################

    # 9. 전처리 (Preprocessing)

    ###############################################################################

     

    # Find tokens in which a single punctuation character sits between two
    # runs of alphanumeric characters, then tabulate every match found across
    # the raw corpus.
    func.check.punct <- function(x) {
      str_extract_all(
        string = x,
        pattern = "[[:alnum:]]{1,}[[:punct:]]{1}?[[:alnum:]]{1,}"
      )
    }
    corpus.check.punct <- lapply(X = corpus, FUN = func.check.punct)
    table(unlist(corpus.check.punct))

     

    # Find every run of one or more digits, then tabulate the matches found
    # across the raw corpus.
    func.check.number <- function(x) {
      str_extract_all(
        string = x,
        pattern = "[[:digit:]]{1,}"
      )
    }
    corpus.check.number <- lapply(X = corpus, FUN = func.check.number)
    table(unlist(corpus.check.number))

     

    # Find an uppercase letter followed by one or more letters (handy for
    # spotting proper nouns), then tabulate the matches found across the raw
    # corpus.
    func.check.upper <- function(x) {
      str_extract_all(
        string = x,
        pattern = "[[:upper:]]{1}[[:alpha:]]{1,}"
      )
    }
    corpus.check.upper <- lapply(X = corpus, FUN = func.check.upper)
    table(unlist(corpus.check.upper))

     

    # Preprocessing pipeline: builds corpus.pre from corpus.
    #
    # Order matters. The pattern-based substitutions below look for hyphens,
    # periods, apostrophes, "=", "/" and capital letters (e.g. "ICD-", "K-pop",
    # "e\\.g\\."), so they must run BEFORE removePunctuation and tolower.
    # Previously they ran afterwards, when the ASCII punctuation and the
    # capitals those patterns rely on had already been stripped.

    # Replace every match of the regular expression `old` with `new` in each
    # document of the corpus `obj`; returns the transformed corpus.
    func.substitute <- function(obj, old, new) {
      tm_map(
        obj,
        content_transformer(function(x, pattern) gsub(pattern, new, x)),
        old
      )
    }

    # 1. Remove all digits.
    corpus.pre <- tm_map(corpus, removeNumbers)

    # 2. Drop extra fragments / unify similar spellings while punctuation and
    #    letter case are still intact.
    corpus.pre <- func.substitute(corpus.pre, "-collar", "collar")
    corpus.pre <- func.substitute(corpus.pre, "\\b((c|C)o-)", "co")
    corpus.pre <- func.substitute(corpus.pre, "\\b((c|C)ross-)", "cross")
    corpus.pre <- func.substitute(corpus.pre, "e\\.g\\.", "for example")
    corpus.pre <- func.substitute(corpus.pre, "i\\.e\\.", "that is")
    corpus.pre <- func.substitute(corpus.pre, "\\'s", "")
    corpus.pre <- func.substitute(corpus.pre, "s’", "s")
    corpus.pre <- func.substitute(corpus.pre, "ICD-", "ICD")
    corpus.pre <- func.substitute(corpus.pre, "\\b((i|I)nter-)", "inter")
    corpus.pre <- func.substitute(corpus.pre, "K-pop", "Kpop")
    corpus.pre <- func.substitute(corpus.pre, "\\b((m|M)eta-)", "meta")
    corpus.pre <- func.substitute(corpus.pre, "\\b((o|O)pt-)", "opt")
    corpus.pre <- func.substitute(corpus.pre, "\\b((p|P)ost-)", "post")
    corpus.pre <- func.substitute(corpus.pre, "-end", "end")
    corpus.pre <- func.substitute(corpus.pre, "\\b((w|W)ithin-)", "within")
    # Padded with spaces so the replacement does not fuse with the neighbouring
    # tokens (e.g. "a=b"); stripWhitespace below collapses any extra blanks.
    corpus.pre <- func.substitute(corpus.pre, "=", " is equal to ")
    corpus.pre <- func.substitute(corpus.pre, "and/or", "and or")
    corpus.pre <- func.substitute(corpus.pre, "his/her", "his her")
    # Must stay last among the hyphen rules: any remaining hyphen becomes a
    # word separator.
    corpus.pre <- func.substitute(corpus.pre, "-", " ")

    # 3. Generic clean-up: punctuation, case folding, SMART stopword list.
    corpus.pre <- tm_map(corpus.pre, removePunctuation)
    corpus.pre <- tm_map(corpus.pre, content_transformer(tolower))
    corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART"))

    # 4. Re-run the three checks on the cleaned corpus to confirm what is left.
    #    This reuses the func.check.* helpers defined earlier in this script.
    corpus.check.punct <- lapply(corpus.pre, func.check.punct)
    table(unlist(corpus.check.punct))

    corpus.check.number <- lapply(corpus.pre, func.check.number)
    table(unlist(corpus.check.number))

    corpus.check.upper <- lapply(corpus.pre, func.check.upper)
    table(unlist(corpus.check.upper))

    # 5. Collapse repeated whitespace, then stem the remaining words.
    corpus.pre <- tm_map(corpus.pre, stripWhitespace)
    corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en")

     

    # Custom helpers used to count characters and words in a corpus.

    # Extract every match of the regular expression "." from x.
    func.freq.char <- function(x) {
      str_extract_all(string = x, pattern = ".")
    }

    # Extract the pieces of x delimited by stringr's boundary("word").
    func.freq.word <- function(x) {
      str_extract_all(string = x, pattern = boundary("word"))
    }

     

    # Character and word counts BEFORE preprocessing (raw corpus).
    # *.length = number of distinct values in the frequency table,
    # *.total  = sum of all frequencies in that table.
    corpus.freq.char <- lapply(X = corpus, FUN = func.freq.char)
    corpus.freq.word <- lapply(X = corpus, FUN = func.freq.word)

    corpus.freq.char.length <- length(table(unlist(corpus.freq.char)))
    corpus.freq.char.total <- sum(table(unlist(corpus.freq.char)))

    corpus.freq.word.length <- length(table(unlist(corpus.freq.word)))
    corpus.freq.word.total <- sum(table(unlist(corpus.freq.word)))

     

    # Character and word counts AFTER preprocessing (corpus.pre).
    # *.length = number of distinct values in the frequency table,
    # *.total  = sum of all frequencies in that table.
    corpus.pre.freq.char <- lapply(X = corpus.pre, FUN = func.freq.char)
    corpus.pre.freq.word <- lapply(X = corpus.pre, FUN = func.freq.word)

    corpus.pre.freq.char.length <- length(table(unlist(corpus.pre.freq.char)))
    corpus.pre.freq.char.total <- sum(table(unlist(corpus.pre.freq.char)))

    corpus.pre.freq.word.length <- length(table(unlist(corpus.pre.freq.word)))
    corpus.pre.freq.word.total <- sum(table(unlist(corpus.pre.freq.word)))

     

    # Compare character and word counts before vs. after preprocessing.
    # Row labels, in order: unique characters, total characters,
    # unique words, total words.
    corpus.compare.result <- matrix(
      c(
        corpus.freq.char.length, corpus.pre.freq.char.length,
        corpus.freq.char.total,  corpus.pre.freq.char.total,
        corpus.freq.word.length, corpus.pre.freq.word.length,
        corpus.freq.word.total,  corpus.pre.freq.word.total
      ),
      nrow = 4,
      byrow = TRUE,
      dimnames = list(
        c("고유문자수", "총문자수", "고유단어수", "총단어수"),
        c("before", "after")
      )
    )
    corpus.compare.result

     

     

    'Coding > R program' 카테고리의 다른 글

    R program(5)  (0) 2020.12.23
    R program(4)  (0) 2020.12.22
    R program(3)-Wordcloud  (0) 2020.12.21
    R(2)  (0) 2020.12.19
    about R-programming  (0) 2020.12.17

    댓글

Designed by Tistory.