##### Script ver. 2017/01/06/ #####

##### 4 #####
##### 4.1 #####

# Addition
1 + 2

# Subtraction
2 - 1
# Multiplication
2 * 3
# Division
4 / 2
# Exponentiation
3 ^ 4

# Several expressions can share one line when separated by semicolons
2 - 1 ; 2 * 3 ; 4 / 2 ; 3 ^ 4

# Whitespace is ignored: every expression below evaluates to 3
1+2
1+ 2
1 +2
1 + 2
     1     +     2

# Assign a value to a variable
x <- 2

# Print the contents of the variable
x

# Wrapping the assignment in parentheses assigns and prints in one step
(x <- 2)

# Use the variable in a calculation
x + 1

# Create a second variable
y <- 3
# Calculate with both variables
x + y

##### 4.2 #####

# Create a vector and assign it to a variable
# c() is the function that combines values into a vector
x <- c(1, 2, 3, 4, 5)

# Print the contents of the vector
x
# Length (number of elements) of the vector
length(x)

# Extract the 3rd element of the vector
x[3]
# Extract the 2nd through 4th elements of the vector
x[2 : 4]

# Arithmetic on a vector is applied to every element
x * 2
# Create another vector
y <- c(6, 7, 8, 9, 10)
# Element-wise arithmetic between two vectors
x + y

# Concatenate the vectors in the order x, y
vector.1 <- append(x, y)
vector.1
# Concatenate the vectors in the order y, x
vector.2 <- append(y, x)
vector.2

# Creating a matrix
# Prepare the vector that supplies the values
z <- c(1, 2, 3, 4, 5, 6)
# Reshape it into a 2 x 3 matrix (filled column by column by default)
matrix.1 <- matrix(z, nrow = 2, ncol = 3)
matrix.1

# With byrow = TRUE the matrix is filled row by row instead
matrix.2 <- matrix(z, nrow = 2, ncol = 3, byrow = TRUE)
matrix.2

# Number of rows
nrow(matrix.2)
# Number of columns
ncol(matrix.2)
# Number of rows and number of columns
dim(matrix.2)

# Arithmetic on a matrix is applied to every cell
matrix.2 + 1
# Create another matrix (c() nested inside matrix())
matrix.3 <- matrix(c(7, 8, 9, 10, 11, 12), nrow = 2, ncol = 3, byrow = TRUE)
# Cell-wise arithmetic between two matrices
matrix.2 + matrix.3

# Bind the matrices by rows (stack them vertically)
rbind(matrix.2, matrix.3)
# Bind the matrices by columns (place them side by side)
cbind(matrix.2, matrix.3)

# The original matrix
matrix.2
# Extract the element in row 2, column 3
matrix.2[2, 3]
# Extract all elements of row 2
matrix.2[2, ]
# Extract all elements of column 3
matrix.2[, 3]
# Extract everything except row 2
matrix.2[-2, ]
# Extract everything except column 3
matrix.2[, -3]

# The original matrix
matrix.2
# Transpose the matrix
t(matrix.2)

matrix.2
# Attach column labels
colnames(matrix.2) <- c("C1", "C2", "C3")
# Attach row labels
rownames(matrix.2) <- c("R1", "R2")
# Check the labels
matrix.2

# Open the help page of c()
help(c)

##### 4.3 #####

x <- c(1, 2, 3, 4, 5)
# Sum of the vector
sum(x)
# Sum of the 2nd through 4th elements of the vector
sum(x[2 : 4])

# Mean
mean(x)
# Median
median(x)

# Group A: annual income of five people (unit: 10,000 yen)
a <- c(100, 200, 300, 400, 500)
mean(a)
median(a)
# Group B: annual income of five people (unit: 10,000 yen);
# the single large value moves the mean but not the median
b <- c(100, 200, 300, 400, 5000)
mean(b)
median(b)

x <- c(1, 2, 3, 4, 5)
# Maximum
max(x)
# Minimum
min(x)
# Variance
var(x)
# Standard deviation
sd(x)

# Summary statistics
summary(x)

matrix.4 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), nrow = 3, ncol = 3, byrow = TRUE)
matrix.4
# Sum of the whole matrix
sum(matrix.4)
# Mean of the whole matrix
mean(matrix.4)

# Sum of row 1
sum(matrix.4[1, ])
# Mean of columns 2 to 3
mean(matrix.4[, 2 : 3])

# Sum of each row
rowSums(matrix.4)
# Sum of each column
colSums(matrix.4)
# Mean of each row
rowMeans(matrix.4)
# Mean of each column
colMeans(matrix.4)

# Sum of each row (same as rowSums(matrix.4)); margin 1 = rows
apply(matrix.4, 1, sum)
# Mean of each column (same as colMeans(matrix.4)); margin 2 = columns
apply(matrix.4, 2, mean)
# Maximum of each row
apply(matrix.4, 1, max)
# Summary statistics of each column
apply(matrix.4, 2, summary)

##### 4.4 #####

# ϐɑ
str.1 <- "cats"
# xNg̍쐬ipC{j
str.2 <- c("I", "love", "cats")
str.3 <- c("͔LDł")
# ϐ̒g̊mF
str.1
str.2
str.3

# Assign the values as numbers
num <- c(1, 2, 3)
num
# Assign the values as character strings
str.4 <- c("1", "2", "3")
str.4

# Check the class of each object
class(num)
class(str.4)

# Upper-case alphabet
LETTERS
# Lower-case alphabet
letters
# Convert all letters to lower case
tolower(LETTERS)
# Convert all letters to upper case
toupper(letters)

# ̌iftHgł́CXy[X܂j
paste("William", "Shakespeare")
paste("Ė", "")
# Xy[XȂŌ
paste("Ė", "", sep = "")
# ̓ϐ
Kawabata <- "["
Yasunari <- "N"
paste(Kawabata, Yasunari)

# Generate sequentially numbered labels ("No.1" ... "No.5")
paste("No.", 1 : 5, sep = "")

# ̕vZ
nchar("cat")
nchar("L")
# e̕vZ
nchar(c("I", "love", "cats"))
nchar(c("", "", "L", "", "D", "ł"))

# Mean number of characters per word
word.length <- nchar(c("I", "love", "cats"))
mean(word.length)
# Word-length spectrum (frequency table of the number of characters per word)
# In this example there is one 1-character word and two 4-character words
table(word.length)

# 1ڂ3ڂ܂ło
substr("ABCDE", start = 1, stop = 3)
# 2ڂ4ڂ܂ło
substr("", start = 2, stop = 4)

# A vector of past-tense verb forms
verbs <- c("asked", "had", "looked", "took")
# Get the indices of the elements that contain the string "ed"
verbs.n <- grep("ed", verbs)
# Check the indices that were found
verbs.n
# Use the indices to display the matching elements
verbs[verbs.n]

# Words that contain the string "ed" anywhere
words <- c("asked", "edited", "edition", "education", "looked")
words.n <- grep("ed", words)
words[words.n]
# Only words that end in "ed" ($ anchors the end of the string)
words.n.2 <- grep("ed$", words, perl = TRUE)
words[words.n.2]
# Only words that start with "ed" (^ anchors the start of the string)
words.n.3 <- grep("^ed", words, perl = TRUE)
words[words.n.3]

# Replace the word-final "ed" of English verbs with "s"
verbs.2 <- c("asked", "looked", "walked")
gsub("ed$", "s", verbs.2, perl = TRUE)
# {̌ꖖ́uvuvɒu
adverbs <- c("", "", "")
gsub("$", "", adverbs, perl = TRUE)

# Deleting with the replacement function: replace the word-final "s" with an empty string
nouns <- c("birds", "cats", "dogs")
gsub("s$", "", nouns, perl = TRUE)

# A character vector of strings of the form "A and B"
and <- c("black and white", "bread and butter", "cats and dogs")
# Split each string, using " and " as the delimiter
# (this explanatory line was missing its leading "#" and stopped the script from parsing)
strsplit(and, split = " and ")
# Convert the list output into a single character vector
unlist(strsplit(and, split = " and "))

# A passage of English text
ulysses <- "Stately, plump Buck Mulligan came from the stairhead, bearing a bowl of lather on which a mirror and a razor lay crossed."
# Split it on spaces (punctuation stays attached to the neighbouring word)
unlist(strsplit(ulysses, split = " "))

# {̕
yukiguni <- "̒gl𔲂ƐፑłB"
unlist(strsplit(yukiguni, split = ""))

##### 4.5 #####

# Check the current working directory
getwd()

# Change the working directory
# The call below switches to the "Data" folder on the "C" drive
setwd("C:/Data")
# If the folder does not exist, the error message
# "Error in setwd("C:/Data") : cannot change working directory" is shown

# When the file is in the working directory
data01 <- read.csv("data01.csv" , header = FALSE)
# When the file is not in the working directory but in C:/Data
data01 <- read.csv("C:/Data/data01.csv", header = FALSE)
data01

# When choosing data01.csv with the mouse (file dialog)
data01 <- read.csv(file.choose(), header = FALSE)

# When choosing data02.csv with the mouse (first line is a header)
data02 <- read.csv(file.choose(), header = TRUE)
data02

# When choosing data03.csv with the mouse
# Skip the first 2 lines of the file before reading
data03 <- read.csv(file.choose(), header = TRUE, skip = 2)
data03

# When choosing data04.csv with the mouse (column 1 supplies the row names)
data04 <- read.csv(file.choose(), header = TRUE, row.names = 1)
data04

# Reading a text file
# Read it line by line (select data05.txt): sep = "\n" makes each line one element
data05 <- scan(file.choose(), what = "char", sep = "\n", quiet = TRUE)
data05
# Read it word by word (select data05.txt): the default separator is whitespace
# (this explanatory line was missing its leading "#" and was executed as code)
data05 <- scan(file.choose(), what = "char", quiet = TRUE)
data05

##### 5 #####

##### 5.1 #####

# Install the add-on package (first time only)
install.packages("corpora", dependencies = TRUE)

# Load the add-on package (every time R is started)
library(corpora)

# Prepare the data set
data(BNCbiber)
# Show only the first 5 rows of the data
head(BNCbiber, 5)

# Alternatively, read the data from a CSV file (select BNCbiber.csv)
BNCbiber <- read.csv(file.choose(), header = TRUE, row.names = 1)

# Draw a histogram of column 2
hist(BNCbiber[, 2])

# Check the class of the data
class(BNCbiber)
# Draw a histogram, selecting the column by name
hist(BNCbiber$f_01_past_tense)

# Change the title and axis labels of the histogram
hist(BNCbiber[, 2], main = "past tense", xlab = "frequency", ylab = "number of texts")

# Change the colour of the histogram
hist(BNCbiber[, 2], main = "past tense", xlab = "frequency", ylab = "number of texts", col = "grey")
# List the colour names available in R
colors()

##### 5.2 #####

# Draw a box plot (range = 0 extends the whiskers to the data extremes)
boxplot(BNCbiber[, 2], range = 0)

# Check the summary statistics used to build the box plot
boxplot.stats(BNCbiber[, 2])

# Change the title and colour of the box plot
boxplot(BNCbiber[, 2], range = 0, main = "past tense", col = "grey")

# Show outliers in the box plot (default range)
boxplot(BNCbiber[, 2], main = "past tense", col = "grey")
# Read the data from a CSV file (select pym.csv)
pym <- read.csv(file.choose(), header = TRUE, row.names = 1)

# Show only the first 5 rows of the data
head(pym, 5)

# Draw box plots by group (column 2 split by column 6)
boxplot(pym[, 2] ~ pym[, 6], names = c("high", "low"), col = "grey")

# Draw notched box plots
boxplot(pym[, 2] ~ pym[, 6], names = c("high", "low"), col = "grey", notch = TRUE)

# Install the add-on package (first time only)
install.packages("beeswarm", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(beeswarm)
# Overlay the individual data points on the box plots
boxplot(pym[, 2] ~ pym[, 6], names = c("high", "low"), col = "grey")
beeswarm(pym[, 2] ~ pym[, 6], col = "black", pch = 16, add = TRUE)

# Install the add-on package (first time only)
install.packages("vioplot", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(vioplot)
# Draw a violin plot
# NOTE(review): the two groups are taken from fixed row ranges (1-50 and 51-101)
# rather than from column 6 - presumably pym.csv is sorted by group; confirm
vioplot(pym[1 : 50, 2], pym[51 : 101, 2], names = c("high", "low"), col = "grey")

##### 5.3 #####

# Install the add-on package (first time only)
install.packages("textometry", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(textometry)
# Prepare the data set
data(robespierre)
# Check the data set
robespierre
# Remove the last row (row 6) from the data
robespierre.2 <- robespierre[-6, ]
# Check the modified data set
robespierre.2
# Draw a mosaic plot
mosaicplot(robespierre.2)

# Alternatively, read the data from a CSV file (select robespierre.csv)
robespierre <- read.csv(file.choose(), header = TRUE, row.names = 1)
# Remove the last row (row 6) from the data
robespierre.2 <- robespierre[-6, ]
# Draw a mosaic plot
mosaicplot(robespierre.2)

# Change the orientation of the axis labels (las = 2: perpendicular to the axis)
# (this explanatory line was missing its leading "#" and was executed as code)
mosaicplot(robespierre.2, las = 2)

##### 5.4 #####

# Read the data from a CSV file (select FPP.csv); the file is decoded as CP932
FPP <- read.csv(file(file.choose(), encoding = "cp932"), header = TRUE, row.names = 1)

# Show only the first 5 rows of the data
# (head() without a count shows 6 rows; n = 5 matches the rest of the script)
head(FPP, 5)

# Draw a scatter plot (column 1 against column 4)
plot(FPP[, 1], FPP[, 4])

# Set the title and axis labels of the scatter plot
plot(FPP[, 1], FPP[, 4], main = "FPP", xlab = "Google", ylab = "BCCWJ")

# Set the size, type and colour of the points
plot(FPP[, 1], FPP[, 4], main = "FPP", xlab = "Google", ylab = "BCCWJ", cex = 1.2, pch = 16, col = "grey")

# Install the add-on package (first time only)
install.packages("car", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(car)
# Draw a scatter plot together with marginal box plots
# NOTE(review): smoother / reg.line look like the argument names of older car
# releases; newer releases appear to use smooth / regLine - confirm against the
# installed version, otherwise the smoother and regression line may still be drawn
scatterplot(FPP[, 1], FPP[, 4], xlab = "Google", ylab = "BCCWJ", smoother = FALSE, reg.line = FALSE)

# Draw a scatter plot matrix of all columns
pairs(FPP)

##### 6 #####

##### 6.1 #####

# ǉpbP[W̓ǂݍ݁iRN邲Ƃɖj
library(RMeCab)
# Z͂̌`ԑf
RMeCabC("̂")

RMeCabC.result <- RMeCabC("̂")
# Check the data type of the result
class(RMeCabC.result)
# Convert the data type by flattening the result with unlist()
RMeCabC.result.2 <- unlist(RMeCabC.result)
RMeCabC.result.2
# Check the class of the converted data
class(RMeCabC.result.2)

# Display only part of the analysis result
RMeCabC.result.2[1]
RMeCabC.result.2[2]
RMeCabC.result.2[1 : 3]

# Display only the part-of-speech information (stored as the element names)
names(RMeCabC.result.2)

# P̌`𕜌
RMeCabC.result.3 <- RMeCabC("I[Ђ1914i吳3jNAdCGuOHMv̑nƂƂɑnƂ܂BȗAȊwZp̎GA发AAȏ̔s𒆐SɏoŊsĂ܂܂B2014i26jNɂ͓dCGuOHMvn100N̐ߖڂ}AЂVȎւƐV𓥂ݏo܂B݂͐发AȂǂɉĈʏApAiQlȂǁALł̏oŎƂWJĂ܂BʂāAǎ҂̊FlɊłƂ͂̂ƁAЉɍv邱ƂڕWɂĂ܂B", 1)
RMeCabC.result.4 <- unlist(RMeCabC.result.3)
RMeCabC.result.4

# Install the add-on package (first time only)
install.packages("wordcloud", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(wordcloud)

# Morphological analysis of a file with RMeCabText() (select wagahaiwa_nekodearu.txt)
RMeCabText.result <- RMeCabText(file.choose())
# Check the result of RMeCabText()
head(RMeCabText.result, 5)
# Build a word vector from the first element of every entry
RMeCabText.result.2 <- unlist(sapply(RMeCabText.result, "[[", 1))
# Check the word vector
head(RMeCabText.result.2, 5)
# Draw the word cloud
wordcloud(RMeCabText.result.2, min.freq = 2, random.order = FALSE)

# Set a Japanese font so that Japanese text is not garbled in R graphics on a Mac
par(family = "HiraKakuProN-W3")
# Draw the word cloud
wordcloud(RMeCabText.result.2, min.freq = 2, random.order = FALSE)

##### 6.2 #####

# Build a word frequency table from the morphological analysis result
# Count the frequencies with table()
RMeCabC.result.table <- table(RMeCabC.result.4)
# Sort by descending frequency with sort()
RMeCabC.result.table.2 <- sort(RMeCabC.result.table, decreasing = TRUE)
# Check the counts
head(RMeCabC.result.table.2, 10)

# Build a part-of-speech frequency table from the morphological analysis result
# Extract the part-of-speech information with names()
RMeCabC.result.table.3 <- table(names(RMeCabC.result.4))
# From here on, the steps are the same as for the word frequency table
RMeCabC.result.table.4 <- sort(RMeCabC.result.table.3, decreasing = TRUE)
# Check the counts
head(RMeCabC.result.table.4, 10)

# Build a frequency table with RMeCabFreq()
# Read the file (select wagahaiwa_nekodearu.txt)
RMeCabFreq.result <- RMeCabFreq(file.choose())
# Check the counts
head(RMeCabFreq.result, 5)

# Sort the RMeCabFreq() result by descending frequency
RMeCabFreq.result.2 <- RMeCabFreq.result[order(RMeCabFreq.result$Freq, decreasing = TRUE), ]
# Check the sorted result
head(RMeCabFreq.result.2, 5)

# Number of tokens (running words)
# Either of the following two forms can be used
sum(RMeCabFreq.result.2[, 4])
sum(RMeCabFreq.result.2$Freq)

# Type-token ratio
# The number of types is obtained with nrow(RMeCabFreq.result.2)
nrow(RMeCabFreq.result.2) / sum(RMeCabFreq.result.2$Freq)

# Write the frequency table to a file
write.table(RMeCabFreq.result.2, file = "wordlist.csv", sep = ",", row.names = TRUE, col.names = NA)
# If the output file is garbled (e.g. on a Mac), write it as UTF-8
write.table(RMeCabFreq.result.2, file = "wordlist.csv", sep = ",", row.names = TRUE, col.names = NA, fileEncoding = "UTF-8")
# If you cannot find the saved file: it is written to the working directory
getwd()

# ̏ɍvP݂̂𒊏o
# uLvƂ܂ޒP݂̂\
RMeCabFreq.result.2[grep("L", RMeCabFreq.result.2$Term), ]
# uvƂ܂ޒP݂̂\
RMeCabFreq.result.2[grep("", RMeCabFreq.result.2$Term), ]
# uLv́uvƂ܂ޒP݂̂\
RMeCabFreq.result.2[grep("L|", RMeCabFreq.result.2$Term), ]

# uLvƂP݂̂\
RMeCabFreq.result.2[grep("^L$", RMeCabFreq.result.2$Term), ]

##### 6.3 #####

# Extracting n-grams
# Character 2-grams (select wagahaiwa_nekodearu.txt)
ngram.result.1 <- Ngram(file.choose(), type = 0)
# Check the counts
head(ngram.result.1, 5)
# Word 2-grams (select wagahaiwa_nekodearu.txt)
ngram.result.2 <- Ngram(file.choose(), type = 1)
# Check the counts
head(ngram.result.2, 5)
# Part-of-speech 2-grams (select wagahaiwa_nekodearu.txt)
ngram.result.3 <- Ngram(file.choose(), type = 2)
# Check the counts
head(ngram.result.3, 5)

# Pn-gram̒oɂi̎wiwagahaiwa_nekodearu.txtIj
ngram.result.4 <- Ngram(file.choose(), type = 1, pos = c("", "", "`e", ""))
# Wvʂ̊mF
head(ngram.result.4, 5)

# Change the length of the n-grams, here N = 3 (select wagahaiwa_nekodearu.txt)
ngram.result.5 <- Ngram(file.choose(), type = 1, N = 3)
# Check the counts
head(ngram.result.5, 5)

# Sort the Ngram() result by descending frequency
ngram.result.6 <- ngram.result.2[order(ngram.result.2$Freq, decreasing = TRUE), ]
# Check the sorted result
head(ngram.result.6, 5)

# Extract n-grams with docDF() (select wagahaiwa_nekodearu.txt)
docDF.result <- docDF(file.choose(), type = 1, N = 2)
# Check the counts
head(docDF.result, 5)

##### 6.4 #####

# NWviwagahaiwa_nekodearu.txtIj
collocate.result <- collocate(file.choose(), node = "y", span = 5)
# Wvʂ̊mF
head(collocate.result, 5)

# TMIvZ
collScores.result <- collScores(collocate.result, node = "y", span = 5)
# vZʂ̊mF
head(collScores.result, 5)

# Sort the collocation-strength results
# Sort by descending T score
collScores.result.2 <- collScores.result[order(collScores.result$T, decreasing = TRUE), ]
# Check the sorted result
head(collScores.result.2, 5)

# Sort by descending MI score
collScores.result.3 <- collScores.result[order(collScores.result$MI, decreasing = TRUE), ]
# Check the sorted result
head(collScores.result.3, 5)

# Install the add-on package (first time only)
install.packages("igraph", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(igraph)
# NgramDFɂ鋤N̏Wviwagahaiwa_nekodearu.txtIj
NgramDF.result <- NgramDF(file.choose(), type = 1, N = 2, pos = "")
# Keep only the pairs whose co-occurrence frequency is 2 or more
NgramDF.result.2 <- subset(NgramDF.result, Freq > 1)
# Draw the network
# graph_from_data_frame() is the current name of graph.data.frame(), which
# igraph has deprecated; the arguments are unchanged
g <- graph_from_data_frame(NgramDF.result.2, directed = FALSE)
plot(g, vertex.label = V(g)$name, vertex.color = "grey")

# Keep only the pairs whose co-occurrence frequency is 3 or more
NgramDF.result.3 <- subset(NgramDF.result, Freq > 2)
# Draw the network
g.2 <- graph_from_data_frame(NgramDF.result.3, directed = FALSE)
plot(g.2, vertex.label = V(g.2)$name, vertex.color = "grey")

##### 7 #####

##### 7.1 #####

# Analysing several files at once
# Count character frequencies (type = 0)
# NOTE(review): "speech" is presumably a folder of text files inside the working directory - confirm
library(RMeCab)
docDF.result <- docDF("speech", type = 0)
# Check the analysis result
head(docDF.result, 10)

# Analysing several files at once
# Count word frequencies (type = 1)
docDF.result.2 <- docDF("speech", type = 1)
# Check the analysis result
head(docDF.result.2, 10)

# i肵Wv
docDF.result.3 <- docDF("speech", type = 1, pos = c("", "`e"))
# ͌ʂ̊mF
head(docDF.result.3, 10)

# Count character 2-grams
docDF.result.4 <- docDF("speech", type = 0, N = 2)
# Check the counts
head(docDF.result.4, 10)
# Count word 2-grams
docDF.result.5 <- docDF("speech", type = 1, N = 2)
# Check the counts
head(docDF.result.5, 10)
# P2-gram̏WviCC`eĈ݁j
docDF.result.6 <- docDF("speech", type = 1, N = 2, pos = c("", "", "`e", ""))
# Wvʂ̊mF
head(docDF.result.6, 10)

# Remove the part-of-speech columns
# Column 2 is dropped twice, i.e. the original columns 2 and 3 are removed
docDF.result.7 <- docDF.result.6[, -2]
docDF.result.7 <- docDF.result.7[, -2]
# Check the result of the removal
head(docDF.result.7, 10)

##### 7.2 #####

# Load the add-on package (every time R is started)
library(textometry)
# Prepare the data set
data(robespierre)
# Check the data set
robespierre

# Total number of words in column 1 (D1)
sum(robespierre[, 1])
# Total number of words in each of columns 1 to 10 (D1 to D10)
colSums(robespierre)

# Relative frequency per 100 words: divide every cell by the total of its own column.
# sweep() pairs column j with colSums(robespierre)[j]. Dividing the table
# directly by the vector of column totals would recycle that vector down the
# rows, so most cells would be divided by the total of a different column.
relative.freq <- sweep(robespierre, 2, colSums(robespierre), "/") * 100
# Display the values rounded to two decimal places
round(relative.freq, 2)

# Compute standardised frequencies (each column centred and scaled)
scale.result <- scale(robespierre)
# Display the values rounded to two decimal places
round(scale.result, 2)

# Observed frequency of "de" in D1 (row 1, column 1)
robespierre[1, 1]
# Mean of each column
apply(robespierre, 2, mean)
# Standard deviation of each column
apply(robespierre, 2, sd)
# Manual check of the standardised frequency: (value - mean) / standard deviation
# NOTE(review): 464, 1399.1667 and 3147.9423 are hard-coded - presumably the value,
# mean and SD of D1 printed by the three calls above; confirm
(464 - 1399.1667) / 3147.9423
# Display the value rounded to two decimal places
round((464 - 1399.1667) / 3147.9423, 2)

# Compute TF-IDF
tf.idf <- docDF("speech", type = 1, weight = "tf*idf")
# Check the result
head(tf.idf, 5)

# Count the observed frequencies
speech.result <- docDF("speech", type = 1)
# Check the result
head(speech.result)
# Manual TF-IDF calculation for "." in Abe.txt (row 1, column 4)
# NOTE(review): the constants presumably mean 3 documents, 1 of which contains the term - confirm
TF <- 1
IDF <- log2(3 / 1)
TF * (IDF + 1)

##### 8 #####

##### 8.1 #####

# Prepare a cross-tabulation (2 x 2)
cross.tab <- matrix(c(96, 54, 52, 48), nrow = 2, ncol = 2, byrow = TRUE)
rownames(cross.tab) <- c("Male", "Female")
colnames(cross.tab) <- c("Jotai", "Keitai")
# Check the cross-tabulation
cross.tab

# Fisher's exact test
fisher.test(cross.tab)

# Chi-squared test (without continuity correction)
chisq.test(cross.tab, correct = FALSE)

# Prepare a 2 x 3 cross-tabulation
cross.tab.2 <- matrix(c(805, 414, 226, 99, 38, 12), nrow = 2, ncol = 3, byrow = TRUE)
rownames(cross.tab.2) <- c("Correct", "Error")
colnames(cross.tab.2) <- c("Level 1", "Level 2", "Level 3")
# Check the cross-tabulation
cross.tab.2
# Run Fisher's exact test on the 2 x 3 cross-tabulation
fisher.test(cross.tab.2)

# Compare column 1 with column 2
fisher.test(cross.tab.2[, c(1, 2)])
# Compare column 1 with column 3
fisher.test(cross.tab.2[, c(1, 3)])
# Compare column 2 with column 3
fisher.test(cross.tab.2[, c(2, 3)])

# Multiply every cell of the table by 10
cross.tab.3 <- cross.tab * 10
# Check the tenfold data
cross.tab.3
# Fisher's exact test on the tenfold data
fisher.test(cross.tab.3)

# Compute the odds ratio by hand
(cross.tab[1, 1] / cross.tab[2, 1]) / (cross.tab[1, 2] / cross.tab[2, 2])

# Compute the odds ratio for the tenfold data
(cross.tab.3[1, 1] / cross.tab.3[2, 1]) / (cross.tab.3[1, 2] / cross.tab.3[2, 2])

# Install the add-on package (first time only)
install.packages("vcd", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(vcd)
# Compute the odds ratio
oddsratio(cross.tab, log = FALSE)
# Compute the confidence interval (lower and upper limit) of the odds ratio
confint(oddsratio(cross.tab, log = FALSE))

# Compute Cramer's V (part of the association statistics)
V <- assocstats(cross.tab.3)
V

# Install the add-on package (first time only)
install.packages("RVAideMemoire", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(RVAideMemoire)
# Compute the confidence interval (lower and upper limit) of Cramer's V
cramer.test(cross.tab.3)

##### 8.2 #####

# Install the add-on package (first time only; already done in section 5.1)
install.packages("corpora", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(corpora)
# Prepare the data set
data(BNCbiber)
# Show only the first 5 rows of the data
head(BNCbiber, 5)
# Compute the correlation coefficient between columns 2 and 4
cor(BNCbiber[, 2], BNCbiber[, 4])

# Draw a scatter plot
plot(BNCbiber[, 2], BNCbiber[, 4], xlab = "past tense", ylab = "present tense")

# Test of no correlation
cor.test(BNCbiber[, 2], BNCbiber[, 4])

# Compute Spearman's rank correlation coefficient
cor(BNCbiber[, 2], BNCbiber[, 4], method = "spearman")

# Pearson's product-moment correlation matrix for columns 2 to 4
cor(BNCbiber[, 2 : 4])
# Spearman's rank correlation matrix for columns 2 to 4
cor(BNCbiber[, 2 : 4], method = "spearman")

# Install the add-on package (first time only)
install.packages("psych", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(psych)
# Draw a scatter plot matrix annotated with correlation coefficients
pairs.panels(BNCbiber[, 2 : 4])

# Simple regression analysis (column 2 explained by column 4)
lm.result <- lm(BNCbiber[, 2] ~ BNCbiber[, 4])
# Check the result
lm.result

# Visualise the regression: scatter plot plus the fitted line
plot(BNCbiber[, 4], BNCbiber[, 2], xlab = "present tense", ylab = "past tense", pch = 16, col = "grey")
abline(lm.result)

# Multiple regression analysis (column 2 explained by columns 3 and 4)
lm.result.2 <- lm(BNCbiber[, 2] ~ BNCbiber[, 3] + BNCbiber[, 4])
# Check the result
lm.result.2

##### 9 #####

##### 9.1 #####

# Install the add-on package (first time only)
install.packages("ca", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(ca)
# Prepare the data set
data(author)
# Show only the first 5 rows of the data
head(author, 5)

# Correspondence analysis
ca.result <- ca(author)
# Visualise the result
plot(ca.result)

# Display only the row data (texts)
plot(ca.result, what = c("all", "none"))
# Display only the column data (variables)
plot(ca.result, what = c("none", "all"))

# Check the detailed results of the correspondence analysis
ca.result
# Coordinates of the row data (dimensions 1 and 2 only)
ca.result$rowcoord[, 1 : 2]
# Sort the row data by their score on dimension 1
sort(ca.result$rowcoord[, 1], decreasing = TRUE)
# Sort the row data by their score on dimension 2
sort(ca.result$rowcoord[, 2], decreasing = TRUE)
# Coordinates of the column data (dimensions 1 and 2 only)
ca.result$colcoord[, 1 : 2]
# Sort the column data by their score on dimension 1
sort(ca.result$colcoord[, 1], decreasing = TRUE)
# Sort the column data by their score on dimension 2
sort(ca.result$colcoord[, 2], decreasing = TRUE)

# Compute relative frequencies (each row divided by its own row total)
author.2 <- author / apply(author, 1, sum)
# Compute Euclidean distances between the rows
dist.result <- dist(author.2, method = "euclidean")
# Hierarchical cluster analysis (Ward's method)
hclust.result <- hclust(dist.result, method = "ward.D2")
# Visualise the result as a dendrogram
plot(hclust.result)

# Transpose the data set so that the variables become the rows
author.3 <- t(author.2)
# Compute Euclidean distances between the rows
dist.result.2 <- dist(author.3, method = "euclidean")
# Hierarchical cluster analysis (Ward's method)
hclust.result.2 <- hclust(dist.result.2, method = "ward.D2")
# Visualise the result as a dendrogram
plot(hclust.result.2)

# Heat map with hierarchical clustering of rows and columns
heatmap(author.2)

##### 9.2 #####

# Install the add-on package (first time only)
install.packages("kernlab", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(kernlab)
# Prepare the data set
data(spam)
# Show only the first 5 rows of the data
head(spam, 5)

# Alternatively, read the data from a CSV file (select spam.csv)
# stringsAsFactors = TRUE keeps the text column `type` a factor on R >= 4.0.0,
# where read.csv() no longer converts strings by default. The models fitted on
# this data later in the section are classifiers and expect a factor response.
spam <- read.csv(file.choose(), header = TRUE, stringsAsFactors = TRUE)

# Split the data into training data and test data
# Generate the vector of odd numbers 1, 3, 5, ...
n <- seq(1, nrow(spam), by = 2)
# Extract the odd-numbered rows as training data
spam.train <- spam[n, ]
# Extract the even-numbered rows as test data
spam.test <- spam[-n, ]
# Check the first 5 rows of the odd-row data
head(spam.train, 5)
# Check the first 5 rows of the even-row data
head(spam.test, 5)

# Load the MASS package (every time R is started)
library(MASS)
# Linear discriminant analysis
# Build the discriminant function from all other columns
lda.result <- lda(type ~ ., data = spam.train)
# Check the result
lda.result

# Classify the test data with the discriminant function
lda.predict.result <- predict(lda.result, spam.test)
# Cross-tabulate the true classes against the predicted classes
lda.tab <- table(spam.test$type, lda.predict.result$class)
# Display the summary table
lda.tab
# Classification accuracy (sum of the diagonal divided by the sum of all cells)
sum(diag(lda.tab)) / sum(lda.tab)

# Load the rpart package (every time R is started)
library(rpart)
# Build a classification model with a decision tree
rpart.result <- rpart(type ~ ., data = spam.train)
# Check the model
rpart.result

# Install the add-on package (first time only)
install.packages("partykit", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(partykit)
# Visualise the decision tree
plot(as.party(rpart.result))

# Examine where to prune the tree (complexity-parameter plot)
plotcp(rpart.result)

# Build the model again with an explicit complexity parameter
rpart.result.2 <- rpart(type ~ ., data = spam.train, cp = 0.036)
# Visualise the decision tree
plot(as.party(rpart.result.2))

# Classify the test data with the decision tree
# NOTE(review): this predicts with rpart.result, not the cp-tuned rpart.result.2 - confirm this is intended
rpart.predict.result <- predict(rpart.result, spam.test, type = "class")
# Cross-tabulate the true classes against the predicted classes
rpart.tab <- table(spam.test$type, rpart.predict.result)
# Display the summary table
rpart.tab
# Classification accuracy (sum of the diagonal divided by the sum of all cells)
sum(diag(rpart.tab)) / sum(rpart.tab)

# Install the add-on package (first time only)
install.packages("randomForest", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(randomForest)
# Fix the random seed so that the result is reproducible
set.seed(1)
# Random forest
randomForest.result <- randomForest(type ~ ., data = spam.train)
# Classify the test data with the random forest
randomForest.predict.result <- predict(randomForest.result, spam.test)
# Cross-tabulate the true classes against the predicted classes
randomForest.tab <- table(spam.test$type, randomForest.predict.result)
# Display the summary table
randomForest.tab
# Classification accuracy (sum of the diagonal divided by the sum of all cells)
sum(diag(randomForest.tab)) / sum(randomForest.tab)

# Visualise the variable importance
varImpPlot(randomForest.result)

##### 10 #####

##### 10.1 #####

# Install the add-on package (first time only)
install.packages("languageR", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(languageR)
# Prepare the data set
data(alice)
# Check the first 20 elements of the data set
head(alice, 20)

# Alternatively, read the data from a text file, one line per element (select Obama.txt)
text.data <- scan(file.choose(), what = "char", sep = "\n", quiet = TRUE)
# Build a word vector by splitting on non-word characters
word.vector <- unlist(strsplit(text.data, "\\W"))
# Remove the empty strings produced by the split
not.blank <- which(word.vector != "")
obama <- word.vector[not.blank]
# Check the data
head(obama, 20)

# Reading data from the internet
# NOTE(review): the URL is a placeholder and must be replaced before this line can run
text.data <- scan("http://www.xxx/yyy.txt", what = "char", sep = "\n", quiet = TRUE)

# Choose the text to analyse
word.vector <- alice
# Convert upper case to lower case
word.vector.lower <- tolower(word.vector)
# Get the positions at which the search word occurs (here, "rabbit")
word.positions <- which(word.vector.lower == "rabbit")
# Number of words of context to show before and after the search word (here, 5)
context <- 5
# Build and print a KWIC concordance, one entry per occurrence of the search word.
# seq_along() visits every match exactly once: with a single match at
# position p, seq(word.positions) would expand to 1:p instead.
for (i in seq_along(word.positions)) {
  position <- word.positions[i]
  if (position == 1) {
    # The keyword is the very first word: there is no left context
    before <- NULL
  } else {
    # Clamp the left edge of the window to the start of the text
    start <- max(position - context, 1)
    before <- word.vector.lower[start : (position - 1)]
  }
  end <- position + context
  after <- word.vector.lower[(position + 1) : end]
  # Indexing past the end of the text yields NA; print those slots as blanks
  after[is.na(after)] <- ""
  keyword <- word.vector.lower[position]
  cat("--------------------", i, "--------------------", "\n")
  cat(before, "[", keyword, "]", after, "\n")
}

# Plot the positions at which the search word occurs (one vertical line per occurrence)
plot(word.vector.lower == "rabbit", type = "h", yaxt = "n", main = "rabbit")

##### 10.2 #####

# Install the add-on package (first time only)
install.packages("tm", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(tm)
# Remove numbers and punctuation
corpus.cleaned <- removeNumbers(word.vector.lower)
corpus.cleaned <- removePunctuation(corpus.cleaned)
# Remove the elements that are now empty strings
not.blank <- which(corpus.cleaned != "")
corpus.cleaned <- corpus.cleaned [not.blank]
# Build the frequency table, sorted by descending frequency
freq.list <- table(corpus.cleaned)
sorted.freq.list <- sort(freq.list, decreasing = TRUE)
sorted.table <- paste(names(sorted.freq.list), sorted.freq.list, sep = ": ")
# Check the frequency table (top 20 entries)
head(sorted.table, 20)

# Set stop words individually (here, "the" and "and" are removed)
corpus.cleaned.2 <- removeWords(corpus.cleaned, c("the", "and"))
# Remove the elements that are now empty strings
not.blank <- which(corpus.cleaned.2 != "")
corpus.cleaned.2 <- corpus.cleaned.2[not.blank]
# Build the frequency table, sorted by descending frequency
freq.list.2 <- table(corpus.cleaned.2)
sorted.freq.list.2 <- sort(freq.list.2, decreasing = TRUE)
sorted.table.2 <- paste(names(sorted.freq.list.2), sorted.freq.list.2, sep = ": ")
# Check the frequency table (top 20 entries)
head(sorted.table.2, 20)

# Stemming (reduce the words to their stems)
corpus.cleaned.3 <- stemDocument(corpus.cleaned)
# Build the frequency table, sorted by descending frequency
freq.list.3 <- table(corpus.cleaned.3)
sorted.freq.list.3 <- sort(freq.list.3, decreasing = TRUE)
sorted.table.3 <- paste(names(sorted.freq.list.3), sorted.freq.list.3, sep = ": ")
# Check the frequency table (top 20 entries)
head(sorted.table.3, 20)

# Install the add-on package (first time only; already done in section 6.1)
install.packages("wordcloud", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(wordcloud)
# Draw a word cloud of the words that occur at least 5 times
wordcloud(corpus.cleaned, min.freq = 5, random.order = FALSE)

# Extract 2-grams by pairing every word with its successor.
# head(x, -1) and tail(x, -1) stay empty for a text shorter than two words,
# whereas 1 : (length - 1) would count backwards and build bogus pairs.
ngrams <- paste(head(corpus.cleaned, -1), tail(corpus.cleaned, -1))
# Count the frequencies, sorted by descending frequency
ngram.freq <- table(ngrams)
sorted.ngram.freq <- sort(ngram.freq, decreasing = TRUE)
sorted.ngram.table <- paste(names(sorted.ngram.freq), sorted.ngram.freq, sep = ": ")
# Display the top 20 entries
head(sorted.ngram.table, 20)

##### 10.3 #####

# Search pattern (here, the whole word "rabbit")
search.word <- "\\brabbit\\b"
# Span: how many words to the left and right are inspected (here, 2 on each side)
span <- 2
span <- (-span : span)
# Output file (here, output.txt)
output.file <- "output.txt"
# Positions at which the search word occurs
positions.of.matches <- grep(search.word, corpus.cleaned, perl = TRUE)
# Tabulate the collocates found at each offset of the span
results <- list()
for (i in seq_along(span)) {
  collocate.positions <- positions.of.matches + span[i]
  # Keep only the positions that lie inside the text. Near the start of the
  # text the offsets can reach zero or below, and negative values mixed with
  # positive ones are not a valid subscript.
  in.text <- collocate.positions >= 1 & collocate.positions <= length(corpus.cleaned)
  collocates <- corpus.cleaned[collocate.positions[in.text]]
  sorted.collocates <- sort(table(collocates), decreasing = TRUE)
  results[[i]] <- sorted.collocates
}
# Write the header of the table: a word column (W_) and a frequency column (F_) per offset
cat(paste(rep(c("W_", "F_"), length(span)), rep(span, each = 2), sep = ""), "\n", sep = "\t", file = output.file)
# Write the data, one rank per line; offsets with fewer collocates get empty cells
lengths <- vapply(results, length, integer(1))
# seq_len() writes nothing when no collocates were found (1 : 0 would loop twice)
for (k in seq_len(max(lengths))) {
  output.string <- paste(names(sapply(results, "[", k)), sapply(results, "[", k), sep = "\t")
  # A missing entry shows up as "NA<TAB>NA"; blank it out
  output.string.2 <- gsub("NA\tNA", "\t", output.string, perl = TRUE)
  cat(output.string.2, "\n", sep = "\t", file = output.file, append = TRUE)
}

##### 10.4 #####

# Install the add-on package (first time only)
install.packages("koRpus", dependencies = TRUE)
# Load the add-on package (every time R is started)
library(koRpus)
# Read and tokenize the text as English (select Obama.txt)
tok <- tokenize(file.choose(), lang = "en")

# Compute the type-token ratio
TTR(tok)

# Compute Guiraud's index
R.ld(tok)

# Compute MATTR
MATTR(tok)
# Compute MTLD
MTLD(tok)

# Compute the Flesch-Kincaid Grade Level
flesch.kincaid(tok)

# Compute the Coleman-Liau Index
coleman.liau(tok)
# Compute the Automated Readability Index
ARI(tok)

