Load data
rm(list = ls())
load("/Users/l--jiaojiao/Desktop/analytics design/HW3/GBA424 - Toy Horse Case Data.Rdata")
Part A
A.1) Produce part-utilities to pass to Part B
temp = unique(conjointData$ID)
##Loop over individuals by ID (1 to 200): fit one lm per individual and
##store its coefficients as a row in a new data frame.
###Arguments:
##  data - conjoint data frame
##  IDs  - vector of IDs for individuals
###Returns:
##  data frame with one row per individual holding that individual's
##  part-utility (regression coefficient) estimates
individualRegs = function(data,IDs){
K = length(IDs)
df = data.frame()
for (i in 1:K){
res = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style),
data=data[data$ID==IDs[i],])
df = rbind(df, as.list(coef(res))) # one row of part-utilities per individual
}
colnames(df) = c('Intercept','PriceLow','SizeBig','MotionRocking','StyleGlamour')
return(df)
}
partutility = individualRegs(conjointData, temp)
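As a quick sanity check (illustrative only), the result should hold one row of five part-utilities per respondent:
## one row per respondent, five part-utility columns
stopifnot(nrow(partutility) == length(temp), ncol(partutility) == 5)
head(partutility, 3)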
A.2) Produce predictions for missing profiles to pass to Part D
## Rows 3, 6, 10, 16 of each individual's block are the unrated profiles;
## fill their ratings with predictions from that individual's own regression
missingRows = c(3, 6, 10, 16)
for (i in temp){
sub = conjointData[conjointData$ID==i,]
res = lm(ratings~factor(price)+factor(size)+factor(motion)+factor(style),data=sub)
conjointData[conjointData$ID==i,][missingRows,3] = predict(res, sub[missingRows,4:7])
}
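If the unrated profiles were stored as NA (as in the case data), a one-line check confirms the fill-in worked:
stopifnot(!any(is.na(conjointData$ratings))) # every profile now carries a rating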
Part B
B.1) Use cluster analysis on the part-utilities
require("cluster")
## Loading required package: cluster
require("fpc")
## Loading required package: fpc
require("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
require("gridExtra")
## Loading required package: gridExtra
##Evaluate the number of clusters to use on the data, with visualizations
##Arguments:
## toClust, the data on which to run the kmeans cluster analysis
## maxClusts=15, the max number of clusters to consider
## seed, the random number seed used to initialize the clusters
## iter.max, the max iterations for the clustering algorithm
## nstart, the number of starting points to consider
##Results:
## a list with the within-group sums of squares (wss), pamk's optimal
## number of clusters (pm1), and the two diagnostic plots (gpw, gps)
clustTest = function(toClust,print=TRUE,scale=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
if(scale){ toClust = scale(toClust);}
set.seed(seed); # set random number seed before doing cluster analysis
wss <- (nrow(toClust)-1)*sum(apply(toClust,2,var))
for (i in 2:maxClusts) wss[i] <- sum(kmeans(toClust,centers=i,nstart=nstart,iter.max=iter.max)$withinss)
##gpw essentially does the following plot using wss above.
#plot(1:maxClusts, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
gpw = fviz_nbclust(toClust,kmeans,method="wss",iter.max=iter.max,nstart=nstart,k.max=maxClusts) #alternative way to get wss elbow chart.
pm1 = pamk(toClust,scaling=TRUE)
## pm1$nc gives the optimal number of clusters based on the
## highest average silhouette width (a measure of clustering quality)
#alternative way that presents it visually as well.
gps = fviz_nbclust(toClust,kmeans,method="silhouette",iter.max=iter.max,nstart=nstart,k.max=maxClusts)
if(print){
grid.arrange(gpw,gps, nrow = 1)
}
list(wss=wss,pm1=pm1$nc,gpw=gpw,gps=gps)
}
##Runs a set of k-means clusterings
##Arguments:
## toClust, data.frame with data to cluster
## nClusts, vector of cluster counts, each run as a separate kmeans
## seed/nstart/iter.max, control arguments passed to kmeans
##Return:
## list of
## kms, kmeans output for each element of nClusts
## ps, list of plots of the clusters against the first 2 principal components
runClusts = function(toClust,nClusts,print=TRUE,maxClusts=15,seed=12345,nstart=20,iter.max=100){
if(length(nClusts)>4){
warning("More than 4 cluster solutions; plots will span multiple pages.")
}
set.seed(seed) # for reproducible cluster assignments
kms=list(); ps=list();
for(i in 1:length(nClusts)){
kms[[i]] = kmeans(toClust,nClusts[i],iter.max = iter.max, nstart=nstart)
ps[[i]] = fviz_cluster(kms[[i]], geom = "point", data = toClust) + ggtitle(paste("k =",nClusts[i]))
}
if(print){
tmp = marrangeGrob(ps, nrow = 2,ncol=2)
print(tmp)
}
list(kms=kms,ps=ps)
}
##Plots a kmeans cluster as three plot report
## pie chart with membership percentages
## ellipse plot that indicates cluster definitions against principal components
## barplot of the cluster means
plotClust = function(km,toClust,discPlot=FALSE){
nc = length(km$size)
if(discPlot){par(mfrow=c(2,2))}
else {par(mfrow=c(3,1))}
percsize = paste(1:nc," = ",format(km$size/sum(km$size)*100,digits=2),"%",sep="")
pie(km$size,labels=percsize,col=1:nc)
clusplot(toClust, km$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0,col.clus=1:nc); #plot clusters against principal components
if(discPlot){
plotcluster(toClust, km$cluster,col=km$cluster); #plot against discriminant functions
}
rng = range(km$centers)
dist = rng[2]-rng[1]
locs = km$centers+.05*dist*ifelse(km$centers>0,1,-1)
bm = barplot(km$centers,beside=TRUE,col=1:nc,main="Cluster Means",ylim=rng+dist*c(-.1,.1))
text(bm,locs,formatC(km$centers,format="f",digits=1))
}
B.2) Test at least two cluster-analysis schemes (i.e., numbers of clusters) and select the best one in your view.
Checks = clustTest(partutility)
clusts = runClusts(partutility,c(2,3,4,5))
plotClust(clusts$kms[[2]], partutility) # the k = 3 solution
## decided to use three clusters, see explanations below
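pamk's silhouette-based suggestion can also be checked directly; we weigh it against the elbow chart and interpretability:
Checks$pm1 # optimal k according to pamk's average silhouette width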
## demographics of each cluster
clustfinal = clusts$kms[[2]] # keep the 3-cluster solution
respondentData$cluster = clustfinal$cluster
cluster1 = respondentData[respondentData$cluster==1,]
sum(cluster1$age==1)/nrow(cluster1)    # proportion aged 3-4
## [1] 0.6125
sum(cluster1$gender==1)/nrow(cluster1) # proportion female
## [1] 0.3875
cluster2 = respondentData[respondentData$cluster==2,]
sum(cluster2$age==1)/nrow(cluster2)    # proportion aged 3-4
## [1] 0.2641509
sum(cluster2$gender==1)/nrow(cluster2) # proportion female
## [1] 0.4528302
cluster3 = respondentData[respondentData$cluster==3,]
sum(cluster3$age==1)/nrow(cluster3)    # proportion aged 3-4
## [1] 0.5671642
sum(cluster3$gender==1)/nrow(cluster3) # proportion female
## [1] 0.7910448
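Since age and gender are 0/1 indicators, the same demographic profile can be read off in one line (a compact alternative using base R):
## mean of a 0/1 indicator is a proportion; one row per cluster
aggregate(cbind(age, gender) ~ cluster, data = respondentData, FUN = mean)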
############################
## Justify this decision ##
############################
# Either a 2-cluster or a 3-cluster scheme is defensible. Ideally we want the scheme that
# maximizes the differences between clusters while minimizing the differences among the
# individuals within each cluster. On that criterion a 2-cluster scheme is not ideal: one of
# the two clusters visibly contains two sub-segments. With 3 clusters we can target the
# preferences of each cluster, and thus more accurately target the preferences of each
# individual within it. Moreover, both the total within-cluster sum of squares (elbow) and the
# average silhouette width support a 3-cluster scheme.
B.3) Interpret the segments in the chosen scheme and identify the ideal product for each segment
Segment 1: Older Boys (.40 of respondents)
Only segment that prefers bouncing; strong preference for the racing style.
No real preference for size, so we will test both sizes.
Age 2: .39, Age 3-4: .61; Male: .61, Female: .39
Ideal Products:
1. 18" Bouncing Racing, Low Price
2. 26" Bouncing Racing, Low Price
Segment 2: Younger Children (.26 of respondents)
Only segment that prefers the small size; no preference for style.
Age 2: .74, Age 3-4: .26; Male: .55, Female: .45
Ideal Products:
1. 18" Rocking Glamour, Low Price
2. 18" Rocking Racing, Low Price
Segment 3: Girls (.34 of respondents)
Only segment that prefers the big size; no preference for motion.
Age 2: .43, Age 3-4: .57; Male: .21, Female: .79
Ideal Products:
1. 26" Rocking Glamour, Low Price
2. 26" Bouncing Glamour, Low Price
C. Conduct a priori segmentation.
prioridf = merge(respondentData,conjointData,by = 'ID')
summary(lm(ratings~price*age+size*age+motion*age+style*age,data=prioridf))
##
## Call:
## lm(formula = ratings ~ price * age + size * age + motion * age +
## style * age, data = prioridf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.528 -11.829 -2.007 11.634 44.386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.5462 0.8917 44.350 < 2e-16 ***
## price 14.4133 0.7975 18.072 < 2e-16 ***
## age -1.2982 1.2548 -1.035 0.300928
## size 3.8532 0.7975 4.831 1.42e-06 ***
## motion 2.7950 0.7975 3.504 0.000464 ***
## style 1.1867 0.7975 1.488 0.136877
## price:age 1.2588 1.1223 1.122 0.262095
## age:size 4.1708 1.1223 3.716 0.000206 ***
## age:motion -3.1188 1.1223 -2.779 0.005486 **
## age:style -0.0857 1.1223 -0.076 0.939140
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 3190 degrees of freedom
## Multiple R-squared: 0.2131, Adjusted R-squared: 0.2109
## F-statistic: 95.98 on 9 and 3190 DF, p-value: < 2.2e-16
summary(lm(ratings~price*gender+size*gender+motion*gender+style*gender,data=prioridf))
##
## Call:
## lm(formula = ratings ~ price * gender + size * gender + motion *
## gender + style * gender, data = prioridf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.67 -10.97 -0.52 10.74 47.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5668 0.8801 41.547 < 2e-16 ***
## price 16.8573 0.7872 21.414 < 2e-16 ***
## gender 4.3032 1.1977 3.593 0.000332 ***
## size 3.8509 0.7872 4.892 1.05e-06 ***
## motion -0.7601 0.7872 -0.966 0.334327
## style -1.8895 0.7872 -2.400 0.016440 *
## price:gender -3.3488 1.0713 -3.126 0.001788 **
## gender:size 3.9046 1.0713 3.645 0.000272 ***
## gender:motion 3.6669 1.0713 3.423 0.000627 ***
## gender:style 5.6165 1.0713 5.243 1.68e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.1 on 3190 degrees of freedom
## Multiple R-squared: 0.2876, Adjusted R-squared: 0.2856
## F-statistic: 143.1 on 9 and 3190 DF, p-value: < 2.2e-16
######################
## Segmenting on Age ##
## We discuss the interaction between age and the 4 attributes; some attributes reflect the
## parent's preferences and others the child's.
## The interactions describe older children: 2-year-olds are the baseline.
## Price*Age: a bit more price sensitive than for 2-year-olds (not significant)
## Size*Age: older kids prefer bigger horses
## Motion*Age: older kids prefer the bouncing motion
## Style*Age: the effect is negligible; style preference barely differs across ages
## Segmenting on Gender ##
## We discuss the interaction between gender and the 4 attributes; again, some attributes
## reflect the parent and others the child.
## The interactions describe girls: male is the baseline.
## Price*Gender: less price sensitive than for boys (i.e., parents of boys)
## Size*Gender: girls prefer bigger horses
## Motion*Gender: girls prefer the rocking motion
## Style*Gender: girls clearly prefer the glamour style
######################
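The a priori segments' attribute effects can be recovered directly from the interaction model; a short sketch using the coefficient names exactly as printed above:
fitAge = lm(ratings~price*age+size*age+motion*age+style*age, data=prioridf)
b = coef(fitAge)
young = b[c("price","size","motion","style")]                         # age = 0 (2-year-olds)
old   = young + b[c("price:age","age:size","age:motion","age:style")] # age = 1 (3-4)
rbind(young, old)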
Part D
D. Simulate market shares for different product-line scenarios
## prepare a data frame of within-respondent ranks: rank 1 = highest-rated profile
mktshare <- data.frame(matrix(0, ncol = 16, nrow = 200))
for (i in 1:200){
data = conjointData[conjointData$ID==i,]
mktshare[i,] = rank(-data$ratings, ties.method = "min") # tied ratings share the best rank
}
colnames(mktshare) <- c(1:16)
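To see the transform on a toy vector (hypothetical ratings, not from the data): the highest rating gets rank 1, and ties share the better rank:
rank(-c(70, 85, 70, 40), ties.method = "min") # returns 2 1 2 4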
## function for market-share calculation under a first-choice rule:
## each respondent buys the best-ranked profile in the scenario,
## splitting the choice equally across ties
summktshare = function(scenario,fulldata){
data = fulldata[,scenario]
df = data.frame()
for (i in 1:nrow(data)){
x = data[i,]
win = x == min(x)   # best-ranked profile(s) for this respondent
x[win] = 1/sum(win) # ties split the choice equally
x[!win] = 0
df = rbind(df,x)
}
return(apply(df,2,sum)/nrow(df))
}
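For example (a hypothetical scenario; the profile numbers are illustrative, not a recommendation), a three-product line of profiles 4, 14, and 16 would be simulated as:
## columns of mktshare are named by profile number, so scenarios are character vectors
summktshare(c('4','14','16'), mktshare)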
# ## Simulate market share for two/three profiles
# twoprofiles = read.csv('2profiles.csv',stringsAsFactors = F)
# threeprofiles = read.csv('3profiles.csv',stringsAsFactors = F)
#
# capture.output(
# for (i in 1:nrow(twoprofiles)){
# for (j in 1:ncol(twoprofiles)){
# x = unlist(strsplit(twoprofiles[i,j],split = ','))
# print(summktshare(x,mktshare))
#
# }
# }
# ,file = "twopktshr.txt")
#
# capture.output(
# for (i in 1:nrow(threeprofiles)){
# for (j in 1:ncol(threeprofiles)){
# x = unlist(strsplit(threeprofiles[i,j],split = ','))
# print(summktshare(x,mktshare))
# }
# }
# ,file = "threepnktshr.txt")
#
# ## Save market share simulation to .txt file for further profit calculation in Excel.