# R program(2) - text 전처리 | Cording/R program | 2020. 12. 20. 19:27
# Install the required packages only when they are not already available.
# Calling install.packages() unconditionally re-downloads every package on each
# run and makes the script fail when there is no network connection.
local({
  required.packages <- c("tm", "stringr", "wordcloud", "SnowballC")
  is.installed <- vapply(
    required.packages,
    requireNamespace,
    logical(1),
    quietly = TRUE
  )
  if (any(!is.installed)) {
    install.packages(required.packages[!is.installed])
  }
})
library(tm)
library(stringr)
library(wordcloud)
library(SnowballC)
# Load the CSV file and build the corpus from its ABSTRACT column
setwd("D:/R_Works") # Set the working directory
# stringsAsFactors = FALSE keeps the text columns as character vectors on
# every R version (the default was TRUE before R 4.0).
file <- read.csv("wos_english_120.csv", stringsAsFactors = FALSE)
abstract <- file$ABSTRACT # Extract the ABSTRACT column
# `$` returns NULL for a missing column, which would silently produce an empty
# corpus and a confusing "subscript out of bounds" error further down.
if (is.null(abstract)) {
  stop("Column 'ABSTRACT' not found in wos_english_120.csv", call. = FALSE)
}
# VCorpus: volatile (in-memory) corpus, one document per abstract
corpus <- VCorpus(VectorSource(abstract))
# Inspect the corpus: look at the second document
content(corpus[[2]]) # print the text of the 2nd document
meta(corpus[[2]]) # print the metadata of the 2nd document
# Optional: attach an additional metadata field to a document
meta(corpus[[2]], tag = "author") <- "Kim. won. pyo"
meta(corpus[[2]])
###############################################################################
# 9. Preprocessing
###############################################################################
# List the tokens in which a punctuation character sits between two runs of
# alphanumeric characters, then tabulate them over the whole corpus.
func.check.punct <- function(x) {
  punct.pattern <- "[[:alnum:]]{1,}[[:punct:]]{1}?[[:alnum:]]{1,}"
  str_extract_all(string = x, pattern = punct.pattern)
}
corpus.check.punct <- lapply(X = corpus, FUN = func.check.punct)
table(unlist(corpus.check.punct))
# List every run of digits in the corpus and tabulate them.
func.check.number <- function(x) {
  digit.pattern <- "[[:digit:]]{1,}"
  str_extract_all(string = x, pattern = digit.pattern)
}
corpus.check.number <- lapply(X = corpus, FUN = func.check.number)
table(unlist(corpus.check.number))
# List the words that start with an upper-case letter (handy for spotting
# proper nouns) and tabulate them.
func.check.upper <- function(x) {
  upper.pattern <- "[[:upper:]]{1}[[:alpha:]]{1,}"
  str_extract_all(string = x, pattern = upper.pattern)
}
corpus.check.upper <- lapply(X = corpus, FUN = func.check.upper)
table(unlist(corpus.check.upper))
###############################################################################
# Build the cleaned corpus (corpus.pre).
#
# Step order matters:
#   1. Pattern substitutions run first, on the raw text. Their patterns rely on
#      punctuation ("-", ".", "'", "=", "/") and on capital letters ("ICD-",
#      "K-pop"), so they can never match once removePunctuation / tolower have
#      been applied.
#   2. Stopwords are removed before punctuation so that stopword entries that
#      contain an apostrophe can still match the text.
#   3. Punctuation is stripped last among the cleaning steps, followed by the
#      re-checks, whitespace normalisation and stemming.
###############################################################################

# Replace every match of the regular expression `old` by `new` in each document
# of the corpus `obj` and return the transformed corpus.
func.substitute <- function(obj, old, new) {
  tm_map(
    obj,
    content_transformer(function(x) gsub(pattern = old, replacement = new, x = x))
  )
}

# 1) Remove extra tokens / unify similar expressions (applied to the raw text)
corpus.pre <- func.substitute(corpus, "-collar", "collar")
corpus.pre <- func.substitute(corpus.pre, "\\b((c|C)o-)", "co")
corpus.pre <- func.substitute(corpus.pre, "\\b((c|C)ross-)", "cross")
corpus.pre <- func.substitute(corpus.pre, "e\\.g\\.", "for example")
corpus.pre <- func.substitute(corpus.pre, "i\\.e\\.", "that is")
corpus.pre <- func.substitute(corpus.pre, "\\'s", "")
corpus.pre <- func.substitute(corpus.pre, "s’", "s")
corpus.pre <- func.substitute(corpus.pre, "ICD-", "ICD")
corpus.pre <- func.substitute(corpus.pre, "\\b((i|I)nter-)", "inter")
corpus.pre <- func.substitute(corpus.pre, "K-pop", "Kpop")
corpus.pre <- func.substitute(corpus.pre, "\\b((m|M)eta-)", "meta")
corpus.pre <- func.substitute(corpus.pre, "\\b((o|O)pt-)", "opt")
corpus.pre <- func.substitute(corpus.pre, "\\b((p|P)ost-)", "post")
corpus.pre <- func.substitute(corpus.pre, "-end", "end")
corpus.pre <- func.substitute(corpus.pre, "\\b((w|W)ithin-)", "within")
corpus.pre <- func.substitute(corpus.pre, "=", "is equal to")
corpus.pre <- func.substitute(corpus.pre, "and/or", "and or")
corpus.pre <- func.substitute(corpus.pre, "his/her", "his her")
# Generic rule last: any remaining hyphen becomes a word separator
corpus.pre <- func.substitute(corpus.pre, "-", " ")

# 2) Standard cleaning
corpus.pre <- tm_map(corpus.pre, removeNumbers) # drop all digits
corpus.pre <- tm_map(corpus.pre, content_transformer(tolower)) # lower-case
corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART")) # SMART stopwords
corpus.pre <- tm_map(corpus.pre, removePunctuation) # drop remaining punctuation

# 3) Re-check the cleaned corpus with the func.check.* helpers defined earlier
#    in this script: each table should now be (close to) empty.
corpus.check.punct <- lapply(corpus.pre, func.check.punct)
table(unlist(corpus.check.punct))
corpus.check.number <- lapply(corpus.pre, func.check.number)
table(unlist(corpus.check.number))
corpus.check.upper <- lapply(corpus.pre, func.check.upper)
table(unlist(corpus.check.upper))

# 4) Final normalisation
corpus.pre <- tm_map(corpus.pre, stripWhitespace) # collapse runs of whitespace
corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en") # stemming
# Helper functions used to count characters and words in a corpus.
# func.freq.char splits the text into single characters (regex "."),
# func.freq.word splits it at word boundaries.
func.freq.char <- function(x) {
  str_extract_all(string = x, pattern = ".")
}
func.freq.word <- function(x) {
  str_extract_all(string = x, pattern = boundary("word"))
}
# Character and word counts BEFORE preprocessing
# (*.length = number of distinct values, *.total = number of occurrences)
corpus.freq.char <- lapply(X = corpus, FUN = func.freq.char)
corpus.freq.char.length <- length(
  table(unlist(corpus.freq.char))
)
corpus.freq.char.total <- sum(
  table(unlist(corpus.freq.char))
)
corpus.freq.word <- lapply(X = corpus, FUN = func.freq.word)
corpus.freq.word.length <- length(
  table(unlist(corpus.freq.word))
)
corpus.freq.word.total <- sum(
  table(unlist(corpus.freq.word))
)
# Character and word counts AFTER preprocessing
# (*.length = number of distinct values, *.total = number of occurrences)
corpus.pre.freq.char <- lapply(X = corpus.pre, FUN = func.freq.char)
corpus.pre.freq.char.length <- length(
  table(unlist(corpus.pre.freq.char))
)
corpus.pre.freq.char.total <- sum(
  table(unlist(corpus.pre.freq.char))
)
corpus.pre.freq.word <- lapply(X = corpus.pre, FUN = func.freq.word)
corpus.pre.freq.word.length <- length(
  table(unlist(corpus.pre.freq.word))
)
corpus.pre.freq.word.total <- sum(
  table(unlist(corpus.pre.freq.word))
)
# Compare character and word counts before vs. after preprocessing.
# Rows: distinct characters, total characters, distinct words, total words.
corpus.compare.result <- matrix(
  c(
    corpus.freq.char.length, corpus.pre.freq.char.length,
    corpus.freq.char.total, corpus.pre.freq.char.total,
    corpus.freq.word.length, corpus.pre.freq.word.length,
    corpus.freq.word.total, corpus.pre.freq.word.total
  ),
  nrow = 4,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(
    c("고유문자수", "총문자수", "고유단어수", "총단어수"),
    c("before", "after")
  )
)
corpus.compare.result
# 'Cording > R program' 카테고리의 다른 글:
# R program(5) (0) 2020.12.23; R program(4) (0) 2020.12.22; R program(3)-Wordcloud (0) 2020.12.21; R(2) (0) 2020.12.19; about R-programming (0) 2020.12.17