Ongoing

Cording/R program 2023. 12. 25. 19:38

library(tm)

library(stringr)

library(wordcloud)

library(SnowballC)

# Load the exported records and keep only the abstracts.
# stringsAsFactors = FALSE keeps ABSTRACT as character on R < 4.0 as well.
file <- read.csv("wos_english_120.csv", stringsAsFactors = FALSE)

abstract <- file$ABSTRACT

# Build a volatile (in-memory) corpus with one document per abstract.
corpus <- VCorpus(VectorSource(abstract))

# Preprocessing pipeline; every step takes the output of the previous one.
corpus.pre <- tm_map(corpus, removeNumbers) # drop all digits

corpus.pre <- tm_map(corpus.pre, removePunctuation) # drop punctuation

corpus.pre <- tm_map(corpus.pre, content_transformer(tolower)) # lowercase

# Remove stopwords using the SMART list.
# NOTE(review): punctuation is stripped before this step, so stopwords that
# contain an apostrophe can no longer match here -- confirm this is intended.
corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART"))

corpus.pre <- tm_map(corpus.pre, stripWhitespace) # collapse repeated spaces

corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en") # stemming

# Raw term-frequency document-term matrix (default weighting). The later steps
# of this script read `dtm` for the TF values and for the word/document
# labels, so it has to be defined alongside the TF-IDF matrices.
dtm <- DocumentTermMatrix(corpus.pre)

# TF-IDF weighted matrices (unnormalised term frequencies), once with
# documents as rows and once with terms as rows.
dtm.tfidf <- DocumentTermMatrix(
  corpus.pre,
  control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)

tdm.tfidf <- TermDocumentMatrix(
  corpus.pre,
  control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)

# Preview the document-term TF-IDF matrix: the first 20 row names, the first
# 20 column names, then the weights for rows 1-3 and columns 50-100.
head(rownames(dtm.tfidf), n = 20)

head(colnames(dtm.tfidf), n = 20)

inspect(dtm.tfidf[seq_len(3), seq(50, 100)])

# Same preview for the term-document matrix, where rows and columns are
# swapped: rows 50-100 and columns 1-3.
head(rownames(tdm.tfidf), n = 20)

head(colnames(tdm.tfidf), n = 20)

inspect(tdm.tfidf[seq(50, 100), seq_len(3)])

# Flatten the TF and TF-IDF matrices into plain vectors. as.vector() walks a
# dense matrix column by column, so the values come out one column at a time.
dtm.tf.value <- as.vector(as.matrix(dtm))

dtm.tfidf.value <- as.vector(as.matrix(dtm.tfidf))

# Build labels in the same column-major order: every column name is repeated
# once per row, and the full set of row names is cycled once per column.
dtm.label.word <- rep(colnames(dtm), each = nrow(dtm))

dtm.label.doc <- rep(rownames(dtm), times = ncol(dtm))

# 참고: dim()의 첫 번째 값([1])은 행(가로) 개수, 두 번째 값([2])은 열(세로) 개수를 뜻함.

# Gather the word label, document label, TF and TF-IDF vectors into one long
# data frame with one row per (word, document) pair.
df.tfidf <- data.frame(
  WORD = dtm.label.word,
  DOCUMENT = dtm.label.doc,
  TF = dtm.tf.value,
  TFIDF = dtm.tfidf.value
)

# Spot-check a few rows from the middle of the table.
df.tfidf[seq(2000, 2010), ]

# 참고: 대괄호 안에서 쉼표 앞 부분은 행, 쉼표 뒷 부분은 열을 뜻함 (예: df.tfidf[2000:2010, ]는 2000~2010행의 모든 열).

# Kendall rank correlation between TF and TF-IDF over every row of df.tfidf.
# Warning: this takes a long time to run.
cor.test(df.tfidf$TF, df.tfidf$TFIDF, method = "kendall")

# Keep only the rows where both TF and TF-IDF are positive. A single row
# filter is required: filtering TF and TFIDF separately produces vectors of
# different lengths whenever some row has TF > 0 but TFIDF == 0, and
# cor.test() then fails because x and y must have the same length.
df.tfidf.positive <- subset(df.tfidf, TF > 0 & TFIDF > 0)

# Kendall rank correlation restricted to those positive rows.
cor.test(df.tfidf.positive$TF, df.tfidf.positive$TFIDF, method = "kendall")

# Find words with high TF but low TF-IDF: among the positive rows, TF above
# its median and TF-IDF below its median.
df.tfidf.posTFnegIDF <- subset(
  df.tfidf.positive,
  TF > median(df.tfidf.positive$TF) & TFIDF < median(df.tfidf.positive$TFIDF)
)

# Count how many rows each such word has. The > 0 filter drops unused factor
# levels when WORD is a factor.
word.counts <- table(df.tfidf.posTFnegIDF$WORD)
word.counts[word.counts > 0]

'Cording > R program' 카테고리의 다른 글

R program(8) Data (0)	2021.01.11
R program(7) (0)	2020.12.26
R program(6) (0)	2020.12.24
R program(5) (0)	2020.12.23
R program(4) (0)	2020.12.22

ABOUT ME

고동완의 스토리지 고동완의 스토리지

'Cording > R program' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'Cording > R program' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바