R program(3)-Wordcloud

Cording/R program 2020. 12. 21. 18:38

# Text preprocessing pipeline (tm).
# NOTE(review): `corpus` is not defined in this part of the script; it is
# presumably a tm corpus built earlier -- confirm.

# Remove all digits.
corpus.pre <- tm_map(corpus, removeNumbers)

# Convert to lowercase first so that capitalised stopwords ("The", "And")
# match the lowercase stopword list.
corpus.pre <- tm_map(corpus.pre, content_transformer(tolower))

# Remove stopwords (SMART list) BEFORE stripping punctuation: the list holds
# apostrophe forms such as "don't", which would no longer match once
# removePunctuation() has turned them into "dont".
corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART"))

# Remove punctuation / special characters.
corpus.pre <- tm_map(corpus.pre, removePunctuation)

# Collapse runs of whitespace left behind by the removals above.
corpus.pre <- tm_map(corpus.pre, stripWhitespace)

# Stemming (reduce words to their stem).
corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en")

# DocumentTermMatrix: rows = documents, columns = terms.
dtm <- DocumentTermMatrix(corpus.pre)

# TermDocumentMatrix: rows = terms, columns = documents.
tdm <- TermDocumentMatrix(corpus.pre)

# Term frequencies: total count of each term over all documents, i.e. the
# column sums of the document x term matrix.
# (rowSums(as.matrix(tdm)) gives the same result from the term x document
# matrix.)
word.freq <- colSums(as.matrix(dtm))

# Sort terms from most to least frequent.
word.freq.sort <- sort(word.freq, decreasing = TRUE)

# head() instead of [1:20]: does not pad with NA when there are fewer than
# 20 terms.
head(word.freq.sort, 20)

# Cumulative frequency along the sorted terms.
word.freq.sum <- cumsum(word.freq.sort)

head(word.freq.sum, 20)

# Cumulative proportion: cumulative frequency divided by the grand total.
word.freq.ratio <- word.freq.sum / sum(word.freq.sort)

head(word.freq.ratio, 20)

# Plot the cumulative proportion of occurrences against the frequency rank of
# the terms, with markers at every 10% of the vocabulary.
n.terms <- length(word.freq.ratio)

plot(seq_len(n.terms), word.freq.ratio, type = "l",
     xlab = "Order of word frequency", ylab = "Cumulative proportion",
     main = "", axes = FALSE)

# x axis: share of the vocabulary (0%, 10%, ..., 100%).
axis(1, at = round(0.1 * n.terms * (0:10)), labels = paste0(10 * (0:10), "%"))

# y axis: cumulative proportion (0%, 20%, ..., 100%).
axis(2, at = 0.20 * (0:5), labels = paste0(20 * (0:5), "%"))

# Marker positions: the same rounded ranks used for the x-axis ticks. Using
# integer ranks keeps the marker's x position and the element its y value is
# read from in agreement (a fractional index would be silently truncated).
decile.idx <- unique(round(0.1 * n.terms * (1:9)))
# With fewer than 10 terms some ranks round to 0, which is not a valid index.
decile.idx <- decile.idx[decile.idx >= 1]

if (length(decile.idx) > 0) {
  decile.ratio <- word.freq.ratio[decile.idx]
  # Label slightly above each marker with the cumulative percentage.
  text(decile.idx, 0.05 + decile.ratio,
       labels = paste0(round(100 * decile.ratio), "%"))
  points(decile.idx, decile.ratio, pch = 19)
}

### Wordcloud analysis

# Install the required packages only when they are missing, so re-running the
# script does not reinstall them every time.
pkgs.needed <- c("wordcloud", "RColorBrewer")
pkgs.missing <- pkgs.needed[
  !vapply(pkgs.needed, requireNamespace, logical(1), quietly = TRUE)
]
if (length(pkgs.missing) > 0) {
  install.packages(pkgs.missing)
}

library("wordcloud")

library("RColorBrewer")

# Draw the word cloud: at most 100 words, each occurring at least 10 times,
# 10% of the words rotated, most frequent words placed first.
# NOTE(review): `ordered.colors = TRUE` is passed without a `colors` vector;
# confirm this is intended.
wordcloud(names(word.freq), freq = word.freq, scale = c(4, 0.2),
          rot.per = 0.1, min.freq = 10, max.words = 100,
          random.order = FALSE, ordered.colors = TRUE)

# Same cloud without the word limit, coloured with the "Dark2" palette.
wordcloud(names(word.freq), freq = word.freq, scale = c(4, 0.2),
          rot.per = 0.1, min.freq = 10, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

# rot.per = 0.1 > 단어의 10%를 세로로 돌리겠다

# min.freq = 10 > 최소 10번 이상 나온 단어만 표시

# max.words = 100 > 최대 100개 단어까지 표시

# Extra: display all colour palettes available in RColorBrewer

display.brewer.all()

# Interactive word cloud with wordcloud2

# Install wordcloud2 only when it is missing.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}

library(wordcloud2)

# Word/frequency tables: one built from the frequency-sorted vector and one
# from the unsorted vector. Columns are named at construction time.
word.freq.wordcloud.df2 <- data.frame(word = names(word.freq.sort),
                                      freq = unname(word.freq.sort))

word.freq.wordcloud.df <- data.frame(word = names(word.freq),
                                     freq = unname(word.freq))

# NOTE(review): the unsorted table is the one that is plotted; the sorted
# table (word.freq.wordcloud.df2) is built but not used here -- confirm which
# one is intended.
word.freq.wordcloud <- wordcloud2(word.freq.wordcloud.df, shape = "circle",
                                  size = 0.7, minSize = 10)

# shape: circle, cardioid, diamond, triangle-forward, triangle, pentagon, star

# Print the widget to render it.
word.freq.wordcloud

# Extra: save the wordcloud2 result (HTML, PDF)

# Install webshot only when it is missing.
if (!requireNamespace("webshot", quietly = TRUE)) {
  install.packages("webshot")
}

library(webshot)

# webshot renders pages through PhantomJS, which is installed separately.
webshot::install_phantomjs()

library("htmlwidgets")

# Write the widget to an HTML file (dependencies kept in a side directory).
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
saveWidget(word.freq.wordcloud, "wordcloud.html", selfcontained = FALSE)

# Capture the HTML page as a PDF; the delay gives the widget time to finish
# drawing before the screenshot is taken.
webshot("wordcloud.html", "wordcloud.pdf", delay = 5, vwidth = 480, vheight = 480)

'Cording > R program' 카테고리의 다른 글

R program(5) (0)	2020.12.23
R program(4) (0)	2020.12.22
R program(2)-text 전처리 (0)	2020.12.20
R(2) (0)	2020.12.19
about R-programming (0)	2020.12.17

ABOUT ME

고동완의 스토리지 고동완의 스토리지

'Cording > R program' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'Cording > R program' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바