In my last article, I showed you some methods to classify URLs and detect active pages.
I have improved my source code, and you can now classify 500,000 URLs in one second according to several criteria (active pages, compliant pages, sections, number of inlinks, response time, duplicate meta tags).
Today, I offer you my GitHub repo so you can test it yourself: https://github.com/voltek62/SEO-Dashboard
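All the snippets below work on a data frame named urls, built from a Screaming Frog crawl export (Internal > HTML). Here is a minimal sketch of how it could be loaded, assuming a hypothetical file ./input/blog/internal_html.csv and the default Screaming Frog column names (Address, Status Code, Title 1, Inlinks, and so on):

# Screaming Frog CSV exports start with a one-line report title, hence skip = 1.
# check.names = FALSE keeps column names with spaces (e.g. `Status Code`) intact.
urls <- read.csv("./input/blog/internal_html.csv",
                 header = TRUE, skip = 1,
                 stringsAsFactors = FALSE, check.names = FALSE)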
Classify URLs
Method: for each line of my CSV file, I identify all pages whose address matches the URL pattern specified on that line.
I use the stri_detect_fixed() function from the stringi package to classify each URL.
library(stringi)

siteconf <- "./conf/blog.csv"
schemas <- read.csv(siteconf, header = FALSE, col.names = "schema", stringsAsFactors = FALSE)
schemas <- as.character(schemas[,1])

urls$Category <- "no match"

for (j in 1:length(schemas)) {
  #print(schemas[j])
  urls$Category[which(stri_detect_fixed(urls$Address, schemas[j]))] <- schemas[j]
}

# Detect HomePage
urls$Category[1] <- 'Home'

urls$Category <- as.factor(urls$Category)
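For reference, the schema file is simply a list of URL patterns, one per line, with no header. A hypothetical ./conf/blog.csv could look like this (these patterns are only examples, not the ones used on my blog):

/category/seo/
/category/r/
/tag/
/author/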
Detect Compliant pages
Method: a page is compliant if it:
- Responds with an HTTP 200 (OK) status code
- Does not include a canonical tag to another URL.
- Has an HTML content type
- Does not include any Noindex meta tag
urls$Compliant <- TRUE

# a page is flagged non-compliant if any of the criteria above fails
urls$Compliant[which(urls$`Status Code` != 200 |
                     urls$`Canonical Link Element 1` != urls$Address |
                     urls$Status != "OK" |
                     grepl("noindex", urls$`Meta Robots 1`))] <- FALSE

urls$Compliant <- as.factor(urls$Compliant)
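Before drawing any chart, a quick count of compliant pages gives a first health check (base R, nothing extra to load):

# number of compliant vs non-compliant crawled URLs
table(urls$Compliant)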
Classify by inlinks
Method: I have created five groups to classify URLs according to their number of follow inlinks:
- URLs with No Follow Inlinks
- URLs with 1 Follow Inlink
- URLs with 2 to 5 Follow Inlinks
- URLs with 5 to 10 Follow Inlinks
- URLs with more than 10 Follow Inlinks
urls$`Group Inlinks` <- "URLs with No Follow Inlinks"
urls$`Group Inlinks`[which(urls$Inlinks < 1)] <- "URLs with No Follow Inlinks"
urls$`Group Inlinks`[which(urls$Inlinks == 1)] <- "URLs with 1 Follow Inlink"
urls$`Group Inlinks`[which(urls$Inlinks > 1 & urls$Inlinks < 6)] <- "URLs with 2 to 5 Follow Inlinks"
urls$`Group Inlinks`[which(urls$Inlinks >= 6 & urls$Inlinks < 11)] <- "URLs with 5 to 10 Follow Inlinks"
urls$`Group Inlinks`[which(urls$Inlinks >= 11)] <- "URLs with more than 10 Follow Inlinks"
urls$`Group Inlinks` <- as.factor(urls$`Group Inlinks`)
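The most interesting group is usually the first one: pages with no follow inlinks are almost orphan pages. They can be extracted for review with a small sketch reusing the labels above:

# addresses of pages that receive no follow inlinks
orphan_candidates <- urls$Address[urls$`Group Inlinks` == "URLs with No Follow Inlinks"]
head(orphan_candidates)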
Detect Duplicate Meta
Method: by default, all meta tags are flagged as "Unique". If the corresponding length field (Title 1 Length, Meta Description 1 Length or H1-1 Length) is equal to 0, I classify the tag as "No Set".
Finally, I use the duplicated() function to detect all duplicate meta tags.
urls$`Status Title` <- 'Unique'
urls$`Status Title`[which(urls$`Title 1 Length` == 0)] <- "No Set"

urls$`Status Description` <- 'Unique'
urls$`Status Description`[which(urls$`Meta Description 1 Length` == 0)] <- "No Set"

urls$`Status H1` <- 'Unique'
urls$`Status H1`[which(urls$`H1-1 Length` == 0)] <- "No Set"

urls$`Status Title`[which(duplicated(urls$`Title 1`))] <- 'Duplicate'
urls$`Status Description`[which(duplicated(urls$`Meta Description 1`))] <- 'Duplicate'
urls$`Status H1`[which(duplicated(urls$`H1-1`))] <- 'Duplicate'

urls$`Status Title` <- as.factor(urls$`Status Title`)
urls$`Status Description` <- as.factor(urls$`Status Description`)
urls$`Status H1` <- as.factor(urls$`Status H1`)
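To go beyond counting and actually review the duplicates, the offending URLs can be listed with dplyr (a sketch, assuming the same column names as above):

library(dplyr)

# all URLs whose title is flagged as duplicate, sorted so identical titles sit together
duplicate_titles <- urls %>%
  filter(`Status Title` == "Duplicate") %>%
  select(Address, `Title 1`) %>%
  arrange(`Title 1`)

head(duplicate_titles, 20)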
Charts
Method: I use the ggplot() function from the ggplot2 package to draw each chart.
ggsave() is a convenient function to save a plot.
urls_cat_statustitle <- urls %>%
  group_by(Category, `Status Title`) %>%
  summarise(count = n())

ggplot(urls_cat_statustitle, aes(x = Category, y = count, fill = `Status Title`)) +
  geom_bar(stat = "identity", position = "stack") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Section", y = "Crawled URLs")
  #+ ggtitle("Number of crawled URLs by section and title tag status")

ggsave(file = "./export/urlsBysectionFillstatustitle.png")
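The same pattern works for any of the factors created earlier. For example, here is a hypothetical chart of compliant versus non-compliant URLs per section, built exactly like the one above (dplyr and ggplot2 are assumed to be loaded):

urls_cat_compliant <- urls %>%
  group_by(Category, Compliant) %>%
  summarise(count = n())

ggplot(urls_cat_compliant, aes(x = Category, y = count, fill = Compliant)) +
  geom_bar(stat = "identity", position = "stack") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Section", y = "Crawled URLs")

ggsave(file = "./export/urlsBysectionFillcompliant.png")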
Bonus: Calculate Internal PageRank
Method: use a new Screaming Frog export: select Bulk Export > All Outlinks from the top menu and save the CSV file.
I use the igraph package to calculate the internal PageRank of each URL (vertex) in the link graph.
library(igraph)
library(dplyr)
library(ggplot2)
library(magrittr)
#library(ForceAtlas2)

file_outlinks <- './input/blog/all_outlinks_test.csv'
website_url <- 'http://www.mywebsite.com'

# rescale the raw internal PageRank to a given range
map <- function(x, range = c(0,1), from.range = NA) {
  if(any(is.na(from.range))) from.range <- range(x, na.rm = TRUE)

  ## check if all values are the same
  if(!diff(from.range)) return(
    matrix(mean(range), ncol = ncol(x), nrow = nrow(x), dimnames = dimnames(x)))

  ## map to [0,1]
  x <- (x - from.range[1])
  x <- x / diff(from.range)
  ## handle single values
  if(diff(from.range) == 0) x <- 0

  ## map from [0,1] to [range]
  if (range[1] > range[2]) x <- 1 - x
  x <- x * (abs(diff(range))) + min(range)
  x[x < min(range) | x > max(range)] <- NA

  x
}

DF <- read.csv2(file_outlinks, header = TRUE, sep = ",", stringsAsFactors = FALSE, skip = 1)

## keep only followed internal HTML links
DF <- filter(DF, grepl(website_url, Source) & Type == "HREF" & Follow == "true") %>%
  select(Source, Destination)

DF <- as.data.frame(sapply(DF, gsub, pattern = website_url, replacement = ""))

## adapt colnames and rownames
colnames(DF) <- c("from", "to")
rownames(DF) <- NULL

# generate graph with data.frame
graphObject <- graph.data.frame(DF)

# calculate pagerank
urls_pagerank <- page.rank(graphObject, directed = TRUE, damping = 0.85) %>%
  use_series("vector") %>%
  sort(decreasing = TRUE) %>%
  as.data.frame %>%
  set_colnames("raw.internal.pagerank")

urls_pagerank$Address <- rownames(urls_pagerank)
rownames(urls_pagerank) <- NULL

urls_pagerank <- mutate(urls_pagerank, internal.pagerank = map(raw.internal.pagerank, c(1,10)))
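The scores can then be merged back into the urls data frame from the previous steps, for example to cross PageRank with sections or inlink groups. A sketch, assuming urls$Address still contains full URLs, so the domain has to be stripped the same way before joining (the Path column is only a hypothetical helper):

# strip the domain so the addresses match the vertex names of the graph
urls$Path <- gsub(website_url, "", urls$Address)

# bring raw and rescaled PageRank back into the main data frame
urls <- left_join(urls, urls_pagerank, by = c("Path" = "Address"))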
Conclusion
Do not hesitate to use the comments to ask for my help with drawing more complex charts.