--Jean Arreola--

Variational Gaussian Mixtures for Face Detection

2018-07-13T00:00:00+00:00

Mixture model

A Gaussian mixture model is a probabilistic way of representing subpopulations within an overall population. We only observe the data, not the subpopulation to which each observation belongs.

We observe $N$ random variables, each distributed according to a mixture of $K$ Gaussian components. Each Gaussian has its own parameters, and because this is a latent-variable model, we can estimate the component (category) of each observation using Expectation Maximization.

Now, in a bayesian scenario, each parameter of each gaussian is also a random variable, as well as the mixture weights. To estimate the distributions we use Variational Inference, which can be seen as a generalization of the EM algorithm. Be sure to check this book to learn all the theory behind gaussian mixtures and variational inference.

Here is my implementation for Variational Gaussian Mixture Model.

#Variational Gaussian Mixture Model


#Constant for Dirichlet Distribution
dirConstant <- function(alpha){
  # Normalising constant of a Dirichlet distribution with parameter vector
  # `alpha`:
  #   C(alpha) = gamma(sum(alpha)) / prod(gamma(alpha))
  #
  # alpha: numeric vector of (positive) Dirichlet parameters.
  # Returns a single numeric value.
  #
  # The ratio is evaluated in log space with lgamma(): gamma() overflows to
  # Inf for arguments above ~171, which turned the direct ratio into
  # Inf / Inf = NaN even when the constant itself is representable.
  stopifnot(is.numeric(alpha), length(alpha) > 0)
  exp(lgamma(sum(alpha)) - sum(lgamma(alpha)))
}


BWishart <- function(W, v){
  # Normalising constant B(W, v) of a Wishart distribution:
  #   B(W, v) = |W|^(-v/2) /
  #             (2^(v*D/2) * pi^(D*(D-1)/4) * prod_{i=1..D} gamma((v+1-i)/2))
  #
  # W: D x D scale matrix (its determinant is taken, so it must be square).
  # v: degrees of freedom.
  # Returns a single numeric value.
  #
  # Every factor is accumulated on the log scale and exponentiated once at the
  # end. The direct product overflows (|W|^(-v/2) or gamma() become Inf) for
  # moderately large v and then yields 0 or NaN even when B is representable.
  D <- ncol(W)
  log_det_W <- as.numeric(determinant(W, logarithm = TRUE)$modulus)
  log_B <- -(v / 2) * log_det_W -
    (v * D / 2) * log(2) -
    (D * (D - 1) / 4) * log(pi) -
    sum(lgamma((v + 1 - seq_len(D)) / 2))
  exp(log_B)
}

#Log precision expected value
espLnPres <- function(W, v){
  # Expected log-determinant of a precision matrix under a Wishart(W, v):
  #   sum_{i=1..D} digamma((v + 1 - i) / 2) + D * log(2) + log|W|
  #
  # W: D x D scale matrix.
  # v: degrees of freedom.
  # Returns a single numeric value.
  #
  # log|W| comes from determinant(logarithm = TRUE) rather than log(det(W)):
  # det() underflows to 0 for matrices with very small entries, which made
  # the result -Inf.
  D <- ncol(W)
  log_det_W <- as.numeric(determinant(W, logarithm = TRUE)$modulus)
  sum(digamma((v + 1 - seq_len(D)) / 2)) + D * log(2) + log_det_W
}

#Wishart distribution entropy
entropyWishart <- function(W, v){
  # Entropy of a Wishart(W, v) distribution:
  #   H = -log B(W, v) - (v - D - 1) / 2 * E[log|Lambda|] + v * D / 2
  #
  # W: D x D scale matrix.
  # v: degrees of freedom.
  # Returns a single numeric value.
  #
  # log B(W, v) and E[log|Lambda|] are evaluated here directly on the log
  # scale. Taking log() of the already-exponentiated constant gives -Inf as
  # soon as gamma() overflows (v above roughly 342), which made the entropy
  # infinite.
  D <- ncol(W)
  log_det_W <- as.numeric(determinant(W, logarithm = TRUE)$modulus)
  half_args <- (v + 1 - seq_len(D)) / 2

  log_B <- -(v / 2) * log_det_W -
    (v * D / 2) * log(2) -
    (D * (D - 1) / 4) * log(pi) -
    sum(lgamma(half_args))

  exp_log_det <- sum(digamma(half_args)) + D * log(2) + log_det_W

  -log_B - ((v - D - 1) / 2) * exp_log_det + (v * D) / 2
}

# Estimating mixture parameters

vgmm <- function(X, K, iter = 100, eps = 0.001){
  # Variational Bayesian Gaussian mixture model (equation numbers refer to
  # the 10.xx formulas quoted in the original comments).
  #
  # X    : N x D numeric matrix (or something as.matrix() turns into one),
  #        one observation per row.
  # K    : number of mixture components.
  # iter : maximum number of variational E/M iterations (>= 1).
  # eps  : stop when the ELBO changes by less than this between iterations.
  #
  # Returns a list with
  #   rho  : N x K matrix of responsibilities (rows sum to 1),
  #   ELBO : evidence lower bound at the last iteration (numeric scalar),
  #   Wk   : D x D x K array of Wishart scale matrices,
  #   mk   : K x D matrix of component means.
  X <- as.matrix(X)
  stopifnot(is.numeric(X), K >= 1, iter >= 1)
  D <- ncol(X)
  N <- nrow(X)

  # Prior hyperparameters
  m0 <- rep(0, D)   # prior mean
  W0 <- diag(D)     # prior Wishart scale
  v0 <- D           # prior degrees of freedom (must exceed D - 1)
  alpha0 <- 1 / K   # symmetric Dirichlet parameter
  beta0 <- 1        # prior precision scaling of the mean
  W0_inv <- solve(W0)

  # --- Local helpers, all evaluated on the log scale ---------------------
  # Wk[, , i] drops to a bare number when D == 1; force a D x D matrix.
  as_mat <- function(A) matrix(A, D, D)
  log_det <- function(A) {
    as.numeric(determinant(as_mat(A), logarithm = TRUE)$modulus)
  }
  half_args <- function(v) (v + 1 - seq_len(D)) / 2
  # E[log|Lambda|] under Wishart(W, v)
  exp_ln_pres <- function(W, v) {
    sum(digamma(half_args(v))) + D * log(2) + log_det(W)
  }
  # log of the Wishart normalising constant B(W, v)
  ln_B <- function(W, v) {
    -(v / 2) * log_det(W) - (v * D / 2) * log(2) -
      (D * (D - 1) / 4) * log(pi) - sum(lgamma(half_args(v)))
  }
  entropy_wishart <- function(W, v) {
    -ln_B(W, v) - ((v - D - 1) / 2) * exp_ln_pres(W, v) + (v * D) / 2
  }
  # log of the Dirichlet normalising constant C(a)
  ln_C <- function(a) lgamma(sum(a)) - sum(lgamma(a))
  all_ln_pres <- function(W_arr, v_vec) {
    vapply(seq_len(K),
           function(i) exp_ln_pres(W_arr[, , i], v_vec[i]),
           numeric(1))
  }

  # --- Initialisation ------------------------------------------------------
  # Means start at the k-means centroids, scales at the prior scale.
  mk <- kmeans(X, K)$centers
  Wk <- array(W0, c(D, D, K))
  vk <- rep(v0, K)
  betak <- rep(beta0, K)
  alphak <- rep(alpha0, K)

  # Expectations under the current q(pi) and q(mu, Lambda)
  ln_pres <- all_ln_pres(Wk, vk)
  ln_pi <- digamma(alphak) - digamma(sum(alphak))

  ELBO <- -Inf

  for (it in seq_len(iter)) {

    ##################### Variational E-step #####################

    ln_rho <- matrix(0, N, K)
    for (i in seq_len(K)) {
      dev <- sweep(X, 2, mk[i, ])
      # (x - m)' W (x - m) for every row at once
      quad <- rowSums((dev %*% as_mat(Wk[, , i])) * dev)
      e_mu_pres <- D / betak[i] + vk[i] * quad  # 10.64
      ln_rho[, i] <- ln_pi[i] + 0.5 * ln_pres[i] - (D / 2) * log(2 * pi) -
        0.5 * e_mu_pres
    }

    # Log-sum-exp trick: subtract each row's maximum before exponentiating
    ln_rho <- ln_rho - apply(ln_rho, 1, max)
    rho <- exp(ln_rho)
    rho <- rho / rowSums(rho)

    ##################### Variational M-step #####################

    # Effective counts; the tiny offset keeps an empty component from
    # producing 0 / 0 below.
    Nk <- colSums(rho) + 10 * .Machine$double.eps

    # Responsibility-weighted means (K x D)
    xBark <- crossprod(rho, X) / Nk

    # Responsibility-weighted covariances
    Sk <- array(0, c(D, D, K))
    for (i in seq_len(K)) {
      dev <- sweep(X, 2, xBark[i, ])
      Sk[, , i] <- crossprod(dev * rho[, i], dev) / Nk[i]
    }

    # Posterior hyperparameters
    for (i in seq_len(K)) {
      # Dirichlet update (10.58); without it the mixing weights never move
      alphak[i] <- alpha0 + Nk[i]
      betak[i] <- beta0 + Nk[i]
      mk[i, ] <- (beta0 * m0 + Nk[i] * xBark[i, ]) / betak[i]
      shift <- xBark[i, ] - m0
      Wk[, , i] <- solve(W0_inv + Nk[i] * as_mat(Sk[, , i]) +
                           ((beta0 * Nk[i]) / (beta0 + Nk[i])) *
                           (shift %*% t(shift)))
      vk[i] <- v0 + Nk[i]
    }

    # Refresh the expectations so the ELBO (and the next E-step) use the
    # updated posterior rather than the previous iteration's values.
    ln_pres <- all_ln_pres(Wk, vk)
    ln_pi <- digamma(alphak) - digamma(sum(alphak))

    ##################### ELBO (seven terms) #####################

    term1 <- 0  # 10.71
    term4 <- 0  # 10.74
    trace_sum <- 0
    term7 <- 0  # 10.77
    for (i in seq_len(K)) {
      Wi <- as_mat(Wk[, , i])
      gap <- xBark[i, ] - mk[i, ]
      prior_gap <- mk[i, ] - m0

      term1 <- term1 + Nk[i] * (ln_pres[i] - D / betak[i] -
                                  vk[i] * sum(as_mat(Sk[, , i]) * t(Wi)) -
                                  vk[i] * sum(gap * (Wi %*% gap)) -
                                  D * log(2 * pi))

      term4 <- term4 + D * log(beta0 / (2 * pi)) + ln_pres[i] -
        (D * beta0) / betak[i] -
        beta0 * vk[i] * sum(prior_gap * (Wi %*% prior_gap))
      # trace(W0^-1 Wk) written as an elementwise sum
      trace_sum <- trace_sum + vk[i] * sum(W0_inv * t(Wi))

      term7 <- term7 + 0.5 * ln_pres[i] +
        (D / 2) * log(betak[i] / (2 * pi)) - D / 2 -
        entropy_wishart(Wi, vk[i])
    }
    term1 <- 0.5 * term1

    term2 <- sum(rho %*% ln_pi)  # 10.72

    # 10.73: the prior constant uses the full K-vector of alpha0 values
    term3 <- ln_C(rep(alpha0, K)) + (alpha0 - 1) * sum(ln_pi)

    term4 <- 0.5 * term4 + K * ln_B(W0, v0) +
      ((v0 - D - 1) / 2) * sum(ln_pres) - 0.5 * trace_sum

    # 10.75: 0 * log(0) is taken as 0
    rho_ln_rho <- rho * log(rho)
    rho_ln_rho[!is.finite(rho_ln_rho)] <- 0
    term5 <- sum(rho_ln_rho)

    term6 <- sum((alphak - 1) * ln_pi) + ln_C(alphak)  # 10.76

    prevELBO <- ELBO
    ELBO <- term1 + term2 + term3 + term4 - term5 - term6 - term7

    # Convergence: only compare when both bounds are finite, so an NA/NaN
    # comparison can never reach `if`.
    if (it > 1 && is.finite(ELBO) && is.finite(prevELBO) &&
        abs(ELBO - prevELBO) < eps) {
      break
    }
  }

  # Responsibilities, ELBO, Wishart scales and means
  list("rho" = rho, "ELBO" = ELBO, "Wk" = Wk, "mk" = mk)
}

Applications

Gaussian Mixture Models can be seen as a form of clustering, but each observation will belong to all clusters simultaneously, as we are estimating the probabilities for belonging to each gaussian distribution. This is called “soft clustering”, as opposed to other algorithms like k-means, which is a “hard clustering technique” (each observation belongs to only one cluster). As a matter of fact, k-means is a special case of a gaussian mixture when the variances all are the same, and there aren’t covariances (so all the clusters will have a circular shape).

A consequence of this is that Gaussian mixtures are more flexible than k-means, because the clusters can have an “elliptical form”. In particular, in image segmentation, Gaussian mixtures are the preferred algorithm. For example, in image matting (segmenting an image into background and foreground pixels), GMMs are a natural choice because each pixel will have a probability of belonging to the foreground and to the background.

Eigenfaces

In this post, we will use variational GMM to do face detection. We will use the faces94 dataset, and choose the most probable category for each face.

The representation that I chose for the images is the Eigenfaces, which are the eigenvectors of the matrix of faces (each column is an image and each row has all the pixel values of the image). It’s important to note that the images have to be centered (subtract the mean).

To reduce dimensionality, we will work with the eigenvectors of the matrix X’X, so we will have instead a matrix of N x N.

Results

The first five eigenfaces:

Now the results of the classification:

open

We can see that the algorithm only misclassified one point. Notice that the groups are almost linearly separable, so eigenfaces was an extremely helpful representation.

Final thoughts

A Gaussian mixture model is a powerful technique for unsupervised learning. With Variational Inference, we can give more abilities to the mixture, like working with missing values, or adding additional levels to the hierarchical model. GMMs are also the foundation for learning more advanced models like Hidden Markov Models.

Correspondence Analysis of Mexican Discourses

2018-06-24T00:00:00+00:00

Correspondence Analysis

Correspondence analysis is a multivariate statistical technique that summarizes a set of categorical data in a two dimensional form. It’s like the equivalent of Principal Component Analysis but for categorical data.

Correspondence analysis is usually applied to contingency tables. In this post, we will apply it to a frequency matrix (the term-document matrix from a bag-of-words representation).

The analysis can be done by row or by column. Below is an implementation of correspondence analysis, where row and column analysis are done at the same time.

correspondence <- function(ct, ind){
  # Correspondence analysis of a contingency (or frequency) table, with the
  # row and the column analysis done at the same time.
  #
  # Parameters
  #   ct : contingency table (or frequency table); a numeric matrix or
  #        anything as.matrix() converts to one (e.g. a numeric data frame).
  #   ind: which singular vectors to use (the first one is the trivial axis
  #        and is normally omitted).
  #
  # Returns a list with
  #   Cr : row coordinates, one row per row of ct and one column per `ind`.
  #   Cc : column coordinates, one row per column of ct.
  ct <- as.matrix(ct)
  stopifnot(is.numeric(ct), !anyNA(ct))

  n <- sum(ct)

  # Correspondence matrix
  F_fisher <- ct / n

  # Relative frequencies (row and column masses)
  rtot <- rowSums(ct) / n
  ctot <- colSums(ct) / n
  if (any(rtot <= 0) || any(ctot <= 0)) {
    # An empty margin makes the 1 / sqrt(mass) scaling undefined
    stop("every row and column of 'ct' must have a positive total",
         call. = FALSE)
  }

  # Z = Dr^(-1/2) F Dc^(-1/2). Dr and Dc are diagonal, so the scaling is
  # applied elementwise instead of inverting full diagonal matrices.
  Z <- F_fisher / sqrt(outer(rtot, ctot))

  # Singular vectors are obtained with SVD
  dvalsing <- svd(Z)

  # Low-dimensional representation
  # Row analysis: Dr^(-1/2) Z V
  Cr <- (Z %*% dvalsing$v[, ind]) / sqrt(rtot)
  # Column analysis: Dc^(-1/2) Z' U
  Cc <- (t(Z) %*% dvalsing$u[, ind]) / sqrt(ctot)

  list("Cr" = Cr, "Cc" = Cc)
}

Mexican discourses

In this post we will analyze the discourses of Mexican politicians, in particular candidates for the presidency of Mexico. We have 11 discourses in total:

Roberto Madrazo Pintado (PRI 2006)
Andres Manuel Lopez Obrador (PRD 2006) (PRD 2012) (MORENA 2018)
Enrique Peña Nieto (PRI 2012 before and after being elected)
Josefina Vazquez Mota (PAN 2012)
Felipe Calderon (PAN 2006)
Ricardo Anaya Cortes (PAN 2018)
Jose Antonio Meade Kuribreña (PRI 2018)
Margarita Ester Zavala Gomez del Campo (Independiente 2018)

Our objective is to find patterns in the two dimensional of the discourses, that reflect information of the actual Mexico context regarding politics.

Putting it all together

We will use the bag of words representation for the discourses. The most frequent 500 words will be chosen for the analysis, and our final term document matrix will be a 11 x 500 matrix.

Next, we see the results of the correspondence analysis applied to our term document matrix:

open

Insights

We can see that Ricardo Anaya and Roberto Madrazo are the furthest. That means in this context that they use words in their discourses that the other candidates don’t use frequently.

The three discourses from Andres Manuel are near each other, which was expected. And Margarita Zavala is close to Josefina Vazquez Mota. That makes sense, as their campaigns are based on the idea of a woman in the presidency, so it’s logical that they use similar words in their discourses.

Another interesting insight is the closeness between Felipe Calderon and Margarita Zavala. It turns out that the team that helped Zavala in her campaign were former collaborators of Felipe Calderon, so maybe she was advised in the same way that Calderon. Check this new here.

The final insight was the closeness between Margarita Zavala and Jose Antonio Meade. Recently, Zavala resigned from her candidacy and, surprisingly, Jorge Camacho (former campaign chief of the Zavala campaign) announced that he intends to vote for Meade. Perhaps he intends to vote for the candidate with the most similar ideas, and that would explain the closeness in our analysis. Check this news item here.

Final thoughts

Correspondence analysis has proven to be useful in finding patterns in frequency matrices. We saw how some of the political news can be reflected in a discourse analysis. For future work, we can use MDS on the term frequency matrix to obtain “data points” and train a classifier! But correspondence analysis is good for an initial representation.

Discourses

Discourses obtained from animalpolitico.com

Postgresql + R Sandbox

2017-09-24T00:00:00+00:00

ElephantSQL

ElephantSQL offers a free instance of Postgresql, with a limit of 20 MB and 5 concurrent connections. For example, you can upload a shiny application that depends on data from ElephantSQL.

You only need to register on the site, and you can automatically access your free instance.

In this post we will see how to take advantage of this cloud database.

Getting the data

For this example I will use the open data of air quality available in the page of SEDEMA (Environment Secretary) of Mexico City.

The data is structured as one csv file per year, and is available from 1992.

#Auxiliary function to download the files

load_sedema <- function(year){
  # Download one year of the SEDEMA air quality index as a data frame.
  #
  # year: year to download; it is pasted into the file name and into the
  #       date column, so a number or a string both work.
  #
  # Returns a data frame with 27 columns (V1..V27): V1 is the date as text
  # in dd/mm/<year> form, the remaining columns are numeric.

  # URL to the file (files exist from 1992)
  link <- paste0("http://148.243.232.112:8080/opendata/IndiceCalidadAire/indice_",
                 year, ".csv")

  # Column classes: one text column followed by 26 numeric ones
  types <- c("character", rep("numeric", 26))

  # Download the file; the first 9 lines are skipped and there is no header
  air_data <- read.csv(link, skip = 9, stringsAsFactors = FALSE,
                       encoding = "latin1", header = FALSE,
                       colClasses = types, na.strings = "NA")

  # Remove rows without a date. NA dates are dropped as well: a plain
  # `!= ""` comparison yields NA for them and would inject all-NA rows.
  has_date <- !is.na(air_data[, 1]) & air_data[, 1] != ""
  air_data <- air_data[has_date, 1:27]

  # Fix time variable: keep the "dd/mm/" prefix and force the requested
  # year, so that all dates are from the specified year
  air_data$V1 <- paste0(substring(air_data$V1, 1, 6), year)

  air_data
}

The next step is to create the table on Postgresql, now that we know the structure of the csv.

library(RPostgreSQL)


# SQL query to create main table if it not exists

# DDL for the main table: one date column, one hour column, 25 integer
# readings and a serial primary key. Assigned with `<-` so the target name
# is visible at the top of the statement.
query <- "
CREATE TABLE IF NOT EXISTS air_quality (
  FECHA date,  
  HORA integer,
  NO_OZONO integer,
  NO_AZUFRE integer,
  NO_NITROGENO integer,
  NO_CARBONO integer,
  NO_PM10 integer,
  NE_OZONO integer,
  NE_AZUFRE integer,
  NE_NITROGENO integer,
  NE_CARBONO integer,
  NE_PM10 integer,
  CE_OZONO integer,
  CE_AZUFRE integer,
  CE_NITROGENO integer,
  CE_CARBONO integer,
  CE_PM10 integer,
  SO_OZONO integer,
  SO_AZUFRE integer,
  SO_NITROGENO integer,
  SO_CARBONO integer,
  SO_PM10 integer,
  SU_OZONO integer,
  SU_AZUFRE integer,
  SU_NITROGENO integer,
  SU_CARBONO integer,
  SU_PM10 integer,
  ID serial,
  PRIMARY KEY (ID)
)
"

#Be sure to change your credentials! You can check them on the Details window on your ElephantSQL instance!

#dbname is the user & default database
#host is the serve
#you can get the port from URL

# Connect to database

# Driver object for the PostgreSQL backend, passed to dbConnect() below.
drv <- dbDriver("PostgreSQL")

# Open the connection. `user`, `db_url` and `pwd` are not defined in this
# snippet and must be set beforehand; `user` is used both as the database
# name and as the login.
con <- dbConnect(drv, dbname = user, 
                 host = db_url, port = 5432,
                 user = user, password = pwd)


# Create table
# Sends the statement stored in `query` over the open connection.

dbGetQuery(con, query)

Next we upload the table from one year

# Download one year with the helper defined above (load_sedema); the
# snippet previously called `upload_sedema`, which is not defined here.
data <- load_sedema(2017)

# Correct format for date: dd/mm/YYYY text -> "YYYY-MM-DD" text.
# as.Date() avoids the POSIXlt intermediate (and its time-zone handling)
# that strptime() would create.
data$V1 <- format(as.Date(data$V1, format = "%d/%m/%Y"), "%Y-%m-%d")

# Set ID
# `ind` is the first ID of this batch. It defaults to 1 for an empty table;
# when appending further years, set it to the last uploaded ID + 1 first.
if (!exists("ind")) {
  ind <- 1
}
data$id <- ind + seq_len(nrow(data)) - 1

# Upload data
# NOTE(review): `data` has columns V1..V27 + id while the table uses
# FECHA/HORA/...; confirm that dbWriteTable(append = TRUE) matches the
# columns by position for this driver.
dbWriteTable(conn = con, name = "air_quality", value = data,
             append = TRUE, row.names = FALSE)

Now you can upload all of the years! Be sure to check the full script

We can query the data now.

# Fetch 100 rows from the table.
# NOTE(review): there is no ORDER BY, so these are presumably just the
# first rows the server returns, not necessarily the latest ones.
query <- 
'
SELECT 
  * 
FROM 
  "public"."air_quality" 
LIMIT 100
  
'

last100 <- dbGetQuery(con, query)

head(last100)

# Close the connection
# (called directly: on.exit() only makes sense inside a function, and the
# previous call was also missing its closing parenthesis)
dbDisconnect(con)

## Loading required package: methods

## Loading required package: DBI

##        fecha hora no_ozono no_azufre no_nitrogeno no_carbono no_pm10
## 1 1992-04-01    7       55        34           10         43      NA
## 2 1992-04-01    8       72        39           15         46      NA
## 3 1992-04-01    9       80        44           25         52      NA
## 4 1992-04-01   10       84        48           31         62      NA
## 5 1992-04-01   11      161        43           45         73      NA
## 6 1992-04-01   12      250        41           42         82      NA
##   ne_ozono ne_azufre ne_nitrogeno ne_carbono ne_pm10 ce_ozono ce_azufre
## 1       70        24           19         43      NA       56        39
## 2       68        25           21         43      NA       56        37
## 3       62        35           30         46      NA       68        41
## 4       47        40           33         47      NA       85        43
## 5       81        37           28         47      NA      123        45
## 6       89        32           19         47      NA      185        38
##   ce_nitrogeno ce_carbono ce_pm10 so_ozono so_azufre so_nitrogeno
## 1           20         46      NA       34        26            9
## 2           23         45      NA       46        29           10
## 3           36         48      NA       54        32           15
## 4           64         55      NA       62        34           26
## 5           50         59      NA       81        35           19
## 6           38         62      NA      124        35           16
##   so_carbono so_pm10 su_ozono su_azufre su_nitrogeno su_carbono su_pm10 id
## 1         27      NA       25        18           16         64      NA  1
## 2         31      NA       31        20           18         65      NA  2
## 3         38      NA       32        24           21         65      NA  3
## 4         45      NA       42        26           36         65      NA  4
## 5         47      NA       69        24           40         66      NA  5
## 6         49      NA       55        22           27         67      NA  6

I hope this little example can help you to try PostgreSQL even if you don’t have it installed on your computer or if you don’t have a server.

Gradient Descent

2017-03-29T00:00:00+00:00

Trying gradient descent for linear regression

The best way to learn an algorithm is to code it. So here it is, my take on the Gradient Descent algorithm for simple linear regression.

First, we fit a simple linear model with lm for comparison with gradient descent values.

#Load libraries

library(dplyr)
library(highcharter)

#Scaling length variables from iris dataset.

# Keep the two length columns of iris, standardise each one with scale()
# (as.numeric() turns the one-column matrix scale() returns back into a
# plain vector) and keep only the standardised columns.
iris_demo <- iris[,c("Sepal.Length","Petal.Length")] %>%
  mutate(sepal_length = as.numeric(scale(Sepal.Length)),
         petal_length = as.numeric(scale(Petal.Length))) %>%
  select(sepal_length,petal_length)

#Fit a simple linear model to compare coefficients.
# The formula refers to the columns as iris_demo$..., which is why the
# slope is labelled `iris_demo$sepal_length` in the output below.

regression <- lm(iris_demo$petal_length~iris_demo$sepal_length)

coef(regression)

##            (Intercept) iris_demo$sepal_length 
##           4.643867e-16           8.717538e-01

# Copy of the data that also carries the fitted values, used for plotting.
iris_demo_reg <- iris_demo

# NOTE(review): because the formula hard-codes iris_demo$..., the newdata
# argument presumably has no effect here and the fitted values of the
# training rows are returned -- confirm.
iris_demo_reg$reg <- predict(regression,iris_demo)

#Plot the model with highcharter
# Scatter of the standardised data plus the fitted line as a second series.

highchart() %>%
  hc_add_series(data = iris_demo_reg, type = "scatter", hcaes(x = sepal_length, y = petal_length), name = "Sepal Length VS Petal Length") %>%
  hc_add_series(data = iris_demo_reg, type = "line", hcaes(x = sepal_length, y = reg), name = "Linear Regression") %>%
  hc_title(text = "Linear Regression")

open

We will try to obtain the same coefficients, this time using Gradient Descent.

library(tidyr)

# Fix the RNG state so the random starting point below is reproducible
set.seed(135)

# Auxiliary function: the straight line y = m * x + b evaluated at x
# (works elementwise when x is a vector)
reg <- function(m, b, x) {
  m * x + b
}

# Starting point: intercept first, then slope, each drawn from U(0, 1)
b <- runif(1)
m <- runif(1)

#Gradient descent function

gradient_desc <- function(b, m, data, learning_rate = 0.01){ # Small steps
  # One gradient-descent step for the line y = m * x + b under the Mean
  # Squared Error loss.
  #
  # b, m          : current intercept and slope.
  # data          : data frame or matrix whose first column is x and whose
  #                 second column is y.
  # learning_rate : step size.
  #
  # Returns an unnamed list: element 1 is the new b, element 2 the new m.
  data <- as.data.frame(data)
  stopifnot(ncol(data) >= 2, nrow(data) > 0)

  x <- data[[1]]
  y <- data[[2]]
  n <- nrow(data)

  # Residuals of the current line, computed for all rows at once
  residual <- y - (m * x + b)

  # Partial derivatives of the MSE with respect to b and m
  grad_b <- (-2 / n) * sum(residual)
  grad_m <- (-2 / n) * sum(x * residual)

  # Move in the OPPOSITE direction of the derivative
  list(b - learning_rate * grad_b,
       m - learning_rate * grad_m)
}

# I need to store some values to make the motion plot

# Parameter snapshots for the motion plot; the first entry is the random
# starting point.
vect_m <- m
vect_b <- b

# Iterate to obtain better parameters

for (i in seq_len(1000)) {
  step <- gradient_desc(b, m, iris_demo)
  b <- step[[1]]
  m <- step[[2]]
  # Store the parameters AFTER the update. Storing them before the update
  # made the "iteration 1" snapshot identical to the starting point and
  # shifted every other snapshot by one iteration.
  if (i %in% c(1, 100, 250, 500)) {
    vect_m <- c(vect_m, m)
    vect_b <- c(vect_b, b)
  }
}

print(paste0("m = ", m))

## [1] "m = 0.871753774273602"

print(paste0("b = ", b))

## [1] "b = 5.52239677041512e-10"

The difference in the coefficients is minimal.

We can see how the iterations work in the next plot:

#Compute new values

# Evaluate the line at every sepal_length for each stored parameter
# snapshot (vect_m / vect_b) and for the final parameters.
iris_demo$preit    <- reg(vect_m[1],vect_b[1],iris_demo$sepal_length)
iris_demo$it1      <- reg(vect_m[2],vect_b[2],iris_demo$sepal_length)
iris_demo$it100    <- reg(vect_m[3],vect_b[3],iris_demo$sepal_length)
iris_demo$it250    <- reg(vect_m[4],vect_b[4],iris_demo$sepal_length)
iris_demo$it500    <- reg(vect_m[5],vect_b[5],iris_demo$sepal_length)
iris_demo$finalit  <- reg(m,b,iris_demo$sepal_length)


# Long format: one row per (sepal_length, snapshot) with the line value in
# `val`; petal_length is dropped and duplicated rows are removed.
iris_gathered <- iris_demo %>% gather(key = gr, value = val, preit:finalit) %>%
  select(-petal_length) %>% 
  distinct()


# Rows of the starting line only; they form the base series of the plot.
iris_start <- iris_gathered %>%
  filter(gr == "preit")


# For every sepal_length, collect the values of all snapshots into a
# list-column named `sequence` (the frames of the animation).
iris_seq <- iris_gathered %>%
  group_by(sepal_length) %>%
  do(sequence = list_parse(select(., y = val)))


# No `by` is given, so the join uses the column the two tables share
# (sepal_length).
iris_data <- left_join(iris_start, iris_seq)

#Motion Plot
# NOTE(review): six snapshots (preit..finalit) are gathered above but only
# five labels are listed below -- confirm the frames line up with the labels.

irhc2 <- highchart() %>%
  hc_add_series(data = iris_data, type = "line", hcaes(x = sepal_length, y = val), name = "Gradient Descent") %>%
  hc_motion(enabled = TRUE, series = 0, startIndex = 0,
            labels = c("Iteration 1","Iteration 100","Iteration 250","Iteration 500","Final Iteration")) %>%
  hc_add_series(data = iris_demo_reg, type = "scatter", hcaes(x = sepal_length, y = petal_length), name = "Sepal Length VS Petal Length") %>%
  hc_title(text = "Gradient Descent Iterations")

irhc2

open

Maybe in a future post we can try a multivariate regression model!

Building a pokemon graph database

2017-02-13T00:00:00+00:00

What happens when you combine Pokemon with Neo4j?

I’m a huge Pokemon fan. So, when I found about this awesome post from Joshua Kunst, I just couldn’t wait to throw all that data into Neo4j.

It also happens to be a great way to learn how to build a graph database from scratch. The objective of this exercise is to build a graph database where the nodes are the pokemon and the types, and the relationships are the effectiveness between the pokemon based only on their types.

Getting the data

First of all, be sure to check Joshua’s post to learn how to import all that pokemon data. We will assume that the data is in a data frame called df.

Then, we need to get the relationships between types. The easiest way to accomplish that is to scrape the table from pokemondb.net.

library(RNeo4j)
library(rvest)
library(methods)
library(dplyr)

# Page that holds the type-effectiveness chart.
link <- "http://pokemondb.net/type"

# Download and parse the page.
link_html <- read_html(link)

# Take the first <table> on the page and convert it to a data frame.
types <- link_html %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table()

#Give format
# The first column is named "Type" and lower-cased; its values are then
# reused as the names of the remaining columns. Missing and empty cells
# are set to 1 and the "½" symbol to 0.5.

names(types)[1] <- "Type"
types$Type <- tolower(types$Type)
names(types)[2:ncol(types)] <- types$Type
types[is.na(types)] <- 1
types[types == ""] <- 1
types[types == "½"] <- 0.5

# Render the cleaned chart as an HTML table.
knitr::kable(types, format = "html")

Type	normal	fire	water	electric	grass	ice	fighting	poison	ground	flying	psychic	bug	rock	ghost	dragon	dark	steel	fairy
normal	1	1	1	1	1	1	1	1	1	1	1	1	0.5	0	1	1	0.5	1
fire	1	0.5	0.5	1	2	2	1	1	1	1	1	2	0.5	1	0.5	1	2	1
water	1	2	0.5	1	0.5	1	1	1	2	1	1	1	2	1	0.5	1	1	1
electric	1	1	2	0.5	0.5	1	1	1	0	2	1	1	1	1	0.5	1	1	1
grass	1	0.5	2	1	0.5	1	1	0.5	2	0.5	1	0.5	2	1	0.5	1	0.5	1
ice	1	0.5	0.5	1	2	0.5	1	1	2	2	1	1	1	1	2	1	0.5	1
fighting	2	1	1	1	1	2	1	0.5	1	0.5	0.5	0.5	2	0	1	2	2	0.5
poison	1	1	1	1	2	1	1	0.5	0.5	1	1	1	0.5	0.5	1	1	0	2
ground	1	2	1	2	0.5	1	1	2	1	0	1	0.5	2	1	1	1	2	1
flying	1	1	1	0.5	2	1	2	1	1	1	1	2	0.5	1	1	1	0.5	1
psychic	1	1	1	1	1	1	2	2	1	1	0.5	1	1	1	1	0	0.5	1
bug	1	0.5	1	1	2	1	0.5	0.5	1	0.5	2	1	1	0.5	1	2	0.5	0.5
rock	1	2	1	1	1	2	0.5	1	0.5	2	1	2	1	1	1	1	0.5	1
ghost	0	1	1	1	1	1	1	1	1	1	2	1	1	2	1	0.5	1	1
dragon	1	1	1	1	1	1	1	1	1	1	1	1	1	1	2	1	0.5	0
dark	1	1	1	1	1	1	0.5	1	1	1	2	1	1	2	1	0.5	1	0.5
steel	1	0.5	0.5	0.5	1	2	1	1	1	1	1	1	2	1	1	1	0.5	2
fairy	1	0.5	1	1	1	1	2	0.5	1	1	1	1	1	1	2	2	0.5	1

Then we need to separate the types of the pokemon.

# One (id, type) table per type slot, stacked into a single long table.
# Written with left-to-right `<-` assignment so each target name is visible
# at the start of its statement.
t1 <- df %>% select(id, type = type_1)
t2 <- df %>% select(id, type = type_2)

tf <- rbind(t1, t2)

# One row per pokemon and type: drop the wide type columns, attach the long
# ones by id and discard the rows of pokemon without a second type.
poke_df <- df %>%
  select(-type_1, -type_2) %>%
  left_join(tf, by = "id") %>%
  filter(!is.na(type))

We are ready to import to Neo4j, so we need to set the connection.

Then, we create the pokenodes and the type nodes. We set a relationship for the typing.

#Connect to Graph


# Open the graph connection. `url`, `username` and `password` are not
# defined in this snippet and must be set beforehand.
graph = startGraph(url = url,
                   username = username,
                   password = password)

#Constraints
# Constrain "Pokemon" nodes on `id` and "Type" nodes on `type`.

addConstraint(graph, "Pokemon", "id")
addConstraint(graph, "Type", "type")


#Create nodes and relationships within the same function
# x: one row of poke_df, indexed by column name.
# Gets or creates the Pokemon node and the Type node for that row and links
# them with a TYPE relationship.

pokenodes <- function(x) {
  pokemon <- getOrCreateNode(graph, "Pokemon", id = x["id"], name = x["pokemon"],
                             height = x["height"], weight = x["weight"],
                             attack = x["attack"], defense = x["defense"],
                             hp = x["hp"], special_attack = x["special_attack"],
                             special_defense = x["special_defense"], speed = x["speed"],
                             url_image = x["url_image"], url_icon = x["url_icon"])
  
  type <- getOrCreateNode(graph, "Type", type = x["type"])
  
  createRel(pokemon,"TYPE",type)
}

#Apply to every row
# NOTE(review): apply() converts the data frame to a matrix first, so when
# poke_df mixes text and numeric columns every property presumably reaches
# Neo4j as a string -- confirm this is intended.

apply(poke_df[1:nrow(poke_df),],1,pokenodes)

We define the desired relationship (effectiveness) using the scraped table

# Reshape the chart to long format: one row per (Type, Type_Rel) pair with
# the multiplier in `value`. `Type` is kept as an identifier column and only
# the per-type columns are gathered; `gather(Type)` on its own gathered
# every column and left no `Type_Rel` / `value` pair for the code below.
# tidyr is namespaced because this post does not attach it.
types <- types %>%
  tidyr::gather(key = "Type_Rel", value = "value", -Type)

# Keep only the non-neutral match-ups
effectiveness <- types %>% filter(value != 1)

And we are ready to upload the effectiveness, this time using a transaction. Thanks to Nicole White for this useful post.

#Query for creating relationships for the pokenodes

# Cypher statement run once per effectiveness row: make sure both Type
# nodes exist, then create the relationship and store the multiplier on it.
query <- "
MERGE (n:Type {type:{type_1}})
MERGE (m:Type {type:{type_2}})
CREATE (n)-[r:EFECTIVENESS]->(m)
SET r.value = {value}
"

# Transaction endpoint
t <- newTransaction(graph)

# seq_len() keeps the loop from running at all when `effectiveness` has no
# rows (1:nrow() would iterate over 1 and 0).
for (i in seq_len(nrow(effectiveness))) {
  type_1 <- effectiveness[i, ]$Type
  type_2 <- effectiveness[i, ]$Type_Rel
  value <- effectiveness[i, ]$value

  appendCypher(t,
               query,
               type_1 = type_1,
               type_2 = type_2,
               value  = value)
}

commit(t)

It’s time to query our database!!! Let’s check all the pokemon against which Salamence is doubly effective:

library(visNetwork)

#Query to check for effectiveness for Salamence
# Follows Pokemon -> Type -> (EFECTIVENESS) -> Type <- Pokemon starting from
# the node named 'salamence' and returns both names, the multiplier and the
# icon/image URLs of both pokemon.
final_query <- "
match (n:Pokemon)-[t:TYPE]->(l:Type)-[e:EFECTIVENESS]->(s:Type)<-[j:TYPE]-(z:Pokemon) 
where n.name = 'salamence' 
return n.name as poke1, e.value as value, z.name as poke2, n.url_icon as icon1,
z.url_icon as icon2, n.url_image as image1, z.url_image as image2"

#Execute the query
poke_cypher = cypher(graph, final_query)

#Get data for VisNetwork
# One row per pokemon pair: the multipliers of all matching type paths are
# converted to numbers and multiplied together.
poke_cypher <- poke_cypher %>%
  mutate(value = as.numeric(value)) %>%
  group_by(poke1, poke2, image1, image2, icon1, icon2) %>%
  summarise(value = prod(value)) %>%
  ungroup()

#Filter by double effective
poke_sp_eft <- poke_cypher %>%
  filter(value == 2)

#More data for VisNetwork
# NOTE(review): names and icons are de-duplicated independently, so the two
# vectors only line up if every pokemon has its own distinct icon -- confirm.
poke <- unique(c(poke_sp_eft$poke1, poke_sp_eft$poke2))
img  <- unique(c(poke_sp_eft$icon1, poke_sp_eft$icon2))

# One node per pokemon, drawn with its icon.
nodes <- data.frame(id = poke, label = poke, image = img, shape = "image")

# One edge from the first pokemon of each pair to the second.
edges <- poke_sp_eft %>%
  select(from = poke1, to = poke2)

#The VISUALIZATION
visNetwork(nodes, edges, width = "100%")

And that’s how you do it! With the RNeo4j it’s so easy to set a graph. Maybe in the future it could be expanded in a recommender system or something like that.

Check out a shiny app for the pokemon database!