-
R program(3)-WordcloudCording/R program 2020. 12. 21. 18:38
# Text preprocessing pipeline. `corpus` is defined outside the visible part of
# this script.
# Stop words are removed BEFORE punctuation is stripped: the SMART list contains
# contractions such as "don't" and "can't", which no longer match once the
# apostrophes are gone (leaving "dont", "cant", ... in the vocabulary).
corpus.pre <- tm_map(corpus, removeNumbers) # drop all digits
corpus.pre <- tm_map(corpus.pre, content_transformer(tolower)) # lowercase so stop words match
corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART")) # remove stop words (SMART list)
corpus.pre <- tm_map(corpus.pre, removePunctuation) # strip punctuation
corpus.pre <- tm_map(corpus.pre, stripWhitespace) # collapse runs of whitespace
corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en") # stemming
dtm <- DocumentTermMatrix(corpus.pre) # DocumentTermMatrix: rows = documents, columns = terms
tdm <- TermDocumentMatrix(corpus.pre) # TermDocumentMatrix: rows = terms, columns = documents
# Term frequencies: column sums of the document-term matrix (columns are terms,
# so each sum is the number of occurrences of a term across all documents).
word.freq <- colSums(as.matrix(dtm))
# Equivalent, starting from the term-document matrix: rowSums(as.matrix(tdm))
# Sort terms from most to least frequent
word.freq.sort <- sort(word.freq, decreasing = TRUE)
# head() instead of [1:20]: no NA padding when there are fewer than 20 terms
head(word.freq.sort, 20)
# Cumulative frequency over the sorted terms
word.freq.sum <- cumsum(word.freq.sort)
head(word.freq.sum, 20)
# Cumulative share of all term occurrences (the last cumulative value is the total)
word.freq.ratio <- word.freq.sum / sum(word.freq)
head(word.freq.ratio, 20)
# Plot the cumulative share of term occurrences against frequency rank.
n.terms <- length(word.freq)
plot(seq_len(n.terms), word.freq.ratio,
     type = "l", xlab = "Order of word frequency", ylab = "Cumulative proportion",
     main = "", axes = FALSE)
# x axis: rank expressed as a percentage of the vocabulary; y axis: cumulative share
axis(1, at = round(0.1 * n.terms * (0:10)), labels = paste0(10 * (0:10), "%"))
axis(2, at = 0.20 * (0:5), labels = paste0(20 * (0:5), "%"))
# Mark the 10%, 20%, ..., 90% ranks. The ranks are rounded exactly like the
# axis ticks above; a fractional subscript would be silently truncated by R,
# so marker and tick could disagree. Ranks below 1 (vocabulary with fewer
# than 10 terms) are dropped because they do not address any element.
decile.idx <- unique(round(0.1 * n.terms * (1:9)))
decile.idx <- decile.idx[decile.idx >= 1]
if (length(decile.idx) > 0) {
  decile.ratio <- word.freq.ratio[decile.idx]
  # Percentage label slightly above each marked point
  text(decile.idx, 0.05 + decile.ratio,
       labels = paste0(round(100 * decile.ratio), "%"))
  points(decile.idx, decile.ratio, pch = 19)
}
### Wordcloud analysis
# Install the packages only when they are missing, so re-running the script
# does not download and reinstall them every time.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library("wordcloud")
library("RColorBrewer")
# Draw the word clouds
# First cloud: default colour, at most 100 words, each occurring at least 10 times.
wordcloud(
  names(word.freq),
  freq = word.freq, scale = c(4, 0.2), rot.per = 0.1,
  min.freq = 10, max.words = 100,
  random.order = FALSE, ordered.colors = TRUE
)
# Second cloud: same layout settings without the word limit, coloured with the
# 8-colour "Dark2" palette from RColorBrewer.
wordcloud(
  names(word.freq),
  freq = word.freq, scale = c(4, 0.2), rot.per = 0.1,
  min.freq = 10, random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# rot.per = 0.1   > rotate 10% of the words to vertical
# min.freq = 10   > only draw words that occur at least 10 times
# max.words = 100 > draw at most 100 words
# Extra: display all RColorBrewer palettes
display.brewer.all()
# Word cloud with wordcloud2
# Install only when missing, so re-running the script does not reinstall it.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}
library(wordcloud2)
# Frequency-sorted variant; it keeps auto-generated column names and is not
# referenced again in the visible part of the script.
word.freq.wordcloud.df2 <- data.frame(unlist(names(word.freq.sort)), unname(word.freq.sort))
# Two-column data frame (word, freq) passed to wordcloud2() below
word.freq.wordcloud.df <- data.frame(word = names(word.freq), freq = unname(word.freq))
word.freq.wordcloud <- wordcloud2(word.freq.wordcloud.df, shape = "circle", size = 0.7, minSize = 10)
# shape options: circle, cardioid, diamond, triangle-forward, triangle, pentagon, star
word.freq.wordcloud
# Extra: save the wordcloud2 result (HTML, then PDF)
# Install webshot only when it is missing.
if (!requireNamespace("webshot", quietly = TRUE)) {
  install.packages("webshot")
}
library(webshot)
# webshot renders pages with PhantomJS
webshot::install_phantomjs()
library("htmlwidgets")
# Write the widget to an HTML file; FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
saveWidget(word.freq.wordcloud, "wordcloud.html", selfcontained = FALSE)
# Capture the saved page as a PDF; the 5-second delay is passed so the page is
# not captured immediately after loading.
webshot("wordcloud.html", "wordcloud.pdf", delay = 5, vwidth = 480, vheight = 480)
'Cording > R program' 카테고리의 다른 글
R program(5) (0) 2020.12.23 R program(4) (0) 2020.12.22 R program(2)-text 전처리 (0) 2020.12.20 R(2) (0) 2020.12.19 about R-programming (0) 2020.12.17