train_hande_df <- read_csv("laptops-hande.csv") %>%
mutate_if(is.character, factor)
## Rows: 16 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Brand, Processor, Speed, Memory, HardDisk, Screen, Price
## dbl (2): Alternative, RANK
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_hande_df <- read_csv("laptops-hepsiburada-hande.csv", comment = "#") %>%
mutate_if(is.character, factor)
## Rows: 6 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Brand, Processor, Speed, Memory, HardDisk, Screen, Price
## dbl (2): Alternative, RANK
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(train_hande_df)
## Alternative Brand Processor Speed Memory HardDisk
## Min. : 1.00 Casper :4 Intel Core i5:8 1.8 GHz:4 4 GB:8 500 GB:8
## 1st Qu.: 4.75 HP :4 Intel Core i7:8 2.2 GHz:4 8 GB:8 750 GB:8
## Median : 8.50 Sony :4 2.4 GHz:4
## Mean : 8.50 Toshiba:4 2.6 GHz:4
## 3rd Qu.:12.25
## Max. :16.00
## Screen Price RANK
## 15.6 in:8 1500 TL:4 Min. : 1.00
## 17.3 in:8 1700 TL:4 1st Qu.: 4.75
## 2000 TL:4 Median : 8.50
## 3500 TL:4 Mean : 8.50
## 3rd Qu.:12.25
## Max. :16.00
train_hande_df %>%
arrange(RANK)
## # A tibble: 16 × 9
## Alternative Brand Processor Speed Memory HardDisk Screen Price RANK
## <dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 2 Sony Intel Core i5 2.4 GHz 8 GB 750 GB 17.3 in 1500… 1
## 2 12 Toshiba Intel Core i5 2.2 GHz 8 GB 750 GB 15.6 in 1700… 2
## 3 8 HP Intel Core i5 2.6 GHz 8 GB 750 GB 15.6 in 2000… 3
## 4 16 HP Intel Core i7 2.4 GHz 8 GB 500 GB 17.3 in 1700… 4
## 5 14 Casper Intel Core i7 2.2 GHz 8 GB 500 GB 15.6 in 1500… 5
## 6 3 Casper Intel Core i5 2.6 GHz 4 GB 500 GB 17.3 in 1700… 6
## 7 13 Toshiba Intel Core i7 1.8 GHz 8 GB 500 GB 17.3 in 2000… 7
## 8 1 HP Intel Core i5 1.8 GHz 4 GB 500 GB 15.6 in 1500… 8
## 9 10 Toshiba Intel Core i7 2.6 GHz 4 GB 750 GB 17.3 in 1500… 9
## 10 5 Sony Intel Core i5 2.2 GHz 4 GB 500 GB 17.3 in 2000… 10
## 11 6 Sony Intel Core i7 1.8 GHz 4 GB 750 GB 15.6 in 1700… 11
## 12 7 Casper Intel Core i7 2.4 GHz 4 GB 750 GB 15.6 in 2000… 12
## 13 4 Sony Intel Core i7 2.6 GHz 8 GB 500 GB 15.6 in 3500… 13
## 14 9 HP Intel Core i7 2.2 GHz 4 GB 750 GB 17.3 in 3500… 14
## 15 15 Toshiba Intel Core i5 2.4 GHz 4 GB 500 GB 15.6 in 3500… 15
## 16 11 Casper Intel Core i5 1.8 GHz 8 GB 750 GB 17.3 in 3500… 16
options("contrasts")
## $contrasts
## unordered ordered
## "contr.treatment" "contr.poly"
options(contrasts = c("contr.sum", "contr.poly"))
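Switching from the default treatment contrasts to sum (effects) contrasts makes each attribute's part worths sum to zero, so the level that lm() drops can later be recovered as the negative sum of the estimated ones, which is exactly what the pipeline below does. A minimal check of the coding (an added illustration):
getOption("contrasts")   # should now be contr.sum / contr.poly
contr.sum(4)             # effects coding for a four-level attribute such as Speed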
coef_df <- lm(-RANK ~ ., select(train_hande_df, - Alternative)) %>%
summary() %>%
coef() %>%
as_tibble(rownames = "Part") %>%
select(Part, Estimate) %>%
filter(Part != "(Intercept)") %>%
rename(Worth = Estimate)
parts_df <- coef_df %>%
mutate(ID = as.numeric(str_extract(Part, "\\d+")),
Part = str_extract(Part, "\\D*")) %>%
nest(data=-Part) %>%
mutate(data2 = map(data, ~rbind(.x, tibble(Worth = -sum(.x$Worth),
ID = max(.x$ID + 1))))) %>%
select(-data) %>%
unnest(data2) %>%
mutate(PartLevel = map2_chr(Part, ID, ~levels(train_hande_df[[.x]])[.y])) %>%
group_by(Part) %>%
mutate(Importance = diff(range(Worth)))
parts_df %>%
ggplot(aes(PartLevel, Worth)) +
geom_point() +
geom_linerange(aes(ymin=0, ymax = Worth)) +
coord_flip() +
facet_grid(reorder(Part, -Importance) ~ ., space="free", scales="free") +
theme(strip.text.y = element_text(angle=0))
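The importances computed above can also be turned into percentages for this single respondent, mirroring what is done for all subjects later (a small added sketch; parts_df is still grouped by Part, so it is ungrouped first):
parts_df %>%
ungroup() %>%
distinct(Part, Importance) %>%
mutate(Importance = 100*Importance/sum(Importance)) %>%
arrange(desc(Importance))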
Now repeat the same analysis for all subjects who filled out the questionnaire.
# d2 <- read_csv("conjoint/all/laptops train survey results-all.csv") %>%
train_df <- read_excel("laptops train survey results-all-coded.xlsx") %>%
mutate_if(is.character, factor) %>%
mutate(id = as.character(id))
test_df <- read_excel("laptops test survey results-all-coded.xlsx")
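Since lm() predictions require every attribute level in the test data to have been seen during training, a quick consistency check can be run first (an added sketch; attrs is a hypothetical helper vector, and an empty result for each attribute means the levels match):
attrs <- c("Brand", "Processor", "Speed", "Memory", "HardDisk", "Screen", "Price")
map2(select(train_df, all_of(attrs)), select(test_df, all_of(attrs)),
~setdiff(unique(as.character(.y)), unique(as.character(.x))))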
market_df <- train_df %>%
nest(train_data = -id) %>%
mutate(
# learn individual preferences of each respondent / subject
model = map2(train_data, id, ~{
# cat ("ID=", .y, "\n")
d <- .x
lm(-RANK ~ ., select(d, - Alternative))
}),
cat("model : ranks are learned with multiple regression.\n"),
# compile the part worths
pworth = pmap(list(model, train_data, id), ~{
# cat ("ID=", ..3, "\n")
d <- ..2
linmod <- ..1
coef_df <- linmod %>%
summary() %>%
coef() %>%
as_tibble(rownames = "Part") %>%
select(Part, Estimate) %>%
filter(Part != "(Intercept)") %>%
rename(Worth = Estimate)
parts_df <- coef_df %>%
mutate(ID = as.numeric(str_extract(Part, "\\d+")),
Part = str_extract(Part, "\\D*")) %>%
nest(data=-Part) %>%
mutate(data2 = map(data, ~rbind(.x, tibble(Worth = -sum(.x$Worth),
ID = max(.x$ID + 1))))) %>%
select(-data) %>%
unnest(data2) %>%
mutate(PartLevel = map2_chr(Part, ID, ~levels(d[[.x]])[.y])) %>%
group_by(Part) %>%
mutate(Importance = diff(range(Worth))) %>%
ungroup()
parts_df
}),
cat("pworth : part worth scores are calculated.\n"),
# extract importance scores
importance = map2(pworth, id, ~{
# cat ("ID=", .y, "\n")
.x %>%
distinct(Part, Importance) %>%
mutate(Importance = 100*Importance/sum(Importance))
}),
cat("importance: part importance scores are calculated.\n"),
# visualize the part worths
pwplot = map2(pworth, id, ~{
# cat ("ID=", .y, "\n")
parts_df <- .x
parts_df %>%
ggplot(aes(PartLevel, Worth)) +
geom_point() +
geom_linerange(aes(ymin=0, ymax = Worth)) +
coord_flip() +
facet_grid(reorder(Part, -Importance) ~ ., space="free", scales="free") +
theme(strip.text.y = element_text(angle=0)) +
labs(title = .y)
}),
cat("pwplot : part worth plots are ready.\n"),
) %>%
# append test data
left_join(nest(test_df, test_data = -id), by = "id") %>%
mutate(
# rank predictions on test
test_pred = pmap(list(model, test_data, id), ~{
# cat(..3, " ")
pred <- ..1 %>%
predict(newdata=..2) %>%
`*`(-1)
rank <- pred %>%
rank(ties.method = "first")
..2 %>%
select(Alternative, RANK) %>%
mutate(PREDraw = pred, PREDrank = rank)
}),
cat("test_pred : rankings on test data are predicted.\n")
)
## model : ranks are learned with multiple regression.
## pworth : part worth scores are calculated.
## importance: part importance scores are calculated.
## pwplot : part worth plots are ready.
## test_pred : rankings on test data are predicted.
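As a quick cross-respondent summary, the attribute importance scores can be averaged over all subjects (an added sketch):
market_df %>%
select(id, importance) %>%
unnest(importance) %>%
group_by(Part) %>%
summarize(MeanImportance = mean(Importance)) %>%
arrange(desc(MeanImportance))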
Show a few part worth plots
for (i in 1:10) plot(market_df$pwplot[[i]])
confusion <- market_df %>%
select(id, test_pred) %>%
unnest(test_pred) %>%
xtabs(~ RANK + PREDrank, .)
confusion
## PREDrank
## RANK 1 2 3 4 5 6
## 1 96 47 19 13 2 8
## 2 40 66 31 23 15 10
## 3 24 36 55 28 31 11
## 4 14 18 44 58 26 25
## 5 4 6 18 30 72 55
## 6 7 12 18 33 39 76
confnames <- dimnames(confusion) %>% names()
heatmap(confusion, Rowv = NA, Colv = NA, revC = TRUE, xlab = confnames[2], ylab = confnames[1])
# confusion %>%
# as_tibble() %>%
# rename(Observed = RANK, Predicted = PRED) %>%
# ggplot(aes(Predicted, Observed)) +
# geom_tile(aes(fill=n)) +
# scale_fill_gradient(low = "yellow", high = "red") +
# coord_fixed()
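Two added summary numbers for predictive accuracy, assuming the objects built above: the overall hit rate from the diagonal of the confusion table, and the mean Spearman correlation between observed and predicted ranks per respondent:
sum(diag(confusion)) / sum(confusion)  # share of test alternatives whose rank is predicted exactly
market_df %>%
select(id, test_pred) %>%
unnest(test_pred) %>%
group_by(id) %>%
summarize(Spearman = cor(RANK, PREDrank, method = "spearman")) %>%
summarize(MeanSpearman = mean(Spearman))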
market_df %>%
select(id, test_pred) %>%
unnest(test_pred) %>%
group_by(id) %>%
summarize(agreement = sum(RANK == PREDrank)) %>%
arrange(desc(agreement)) %>%
count(agreement) %>%
ggplot(aes(agreement, n)) +
geom_col()
subst_df <- market_df %>%
select(id, test_data) %>%
unnest(test_data) %>%
select(id, Brand, RANK) %>%
filter(RANK<=2) %>%
pivot_wider(names_from = "RANK", values_from = "Brand") %>%
count(`1`,`2`) %>%
rename(Brand = `1`, Substitute = `2`) %>%
nest(Subs = -Brand)
oldpar <- par(mar = c(0,0,1,0))
for (i in seq(nrow(subst_df))){
subst_df$Subs[[i]] %>%
column_to_rownames(var = "Substitute") %>%
as.matrix %>%
drop() %>%
pie(main=paste("Substitutes of", subst_df$Brand[i] ))
}
par(mar = oldpar$mar)
All possible Casper configurations
config_df <- test_df %>%
select(Processor, Speed, Memory, HardDisk, Screen, Price) %>%
expand(Processor, Speed, Memory, HardDisk, Screen, Price) %>% # expand to all combinations of the listed attribute levels
mutate(Brand = "Casper") %>% # our company is Casper
relocate(Brand) %>%
anti_join(test_df, by = c("Brand", "Processor", "Speed", "Memory", "HardDisk", "Screen", "Price")) %>% # remove the Casper products already in the market
mutate(config = as.character(seq(n()))) %>%
relocate(config)
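A quick look at the size of the candidate design space (an added check):
nrow(config_df)                    # number of hypothetical Casper configurations
count(config_df, Memory, HardDisk) # how the candidates spread over two of the attributes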
Predicted rank utilities of all configurations
market_config_df <- market_df %>%
mutate(config_pred = map2(model, id, ~{
pred <- predict(.x, config_df) %>%
`*`(-1)
config_df %>%
select(config) %>%
mutate(PREDraw = pred)
})) %>%
select(id, test_pred, config_pred) %>%
unnest(test_pred) %>%
select(-RANK, -PREDrank) %>%
pivot_wider(names_from = Alternative, values_from = "PREDraw") %>%
unnest(config_pred) %>%
rename(new = PREDraw)
market_config_df %>%
head(10)
## # A tibble: 10 × 9
## id config new `1` `2` `3` `4` `5` `6`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 y14s208s001 1 15.5 12 8 7.50 11 6 12.5
## 2 y14s208s001 2 10.5 12 8 7.50 11 6 12.5
## 3 y14s208s001 3 11.5 12 8 7.50 11 6 12.5
## 4 y14s208s001 4 11.5 12 8 7.50 11 6 12.5
## 5 y14s208s001 5 16.5 12 8 7.50 11 6 12.5
## 6 y14s208s001 6 11.5 12 8 7.50 11 6 12.5
## 7 y14s208s001 7 12.5 12 8 7.50 11 6 12.5
## 8 y14s208s001 8 12.5 12 8 7.50 11 6 12.5
## 9 y14s208s001 9 16.5 12 8 7.50 11 6 12.5
## 10 y14s208s001 10 11.5 12 8 7.50 11 6 12.5
New markets obtained by adding each config to the existing hepsiburada market
new_markets_df <- market_config_df %>%
pivot_longer(cols = c(-id, -config), names_to = "Alternative", values_to = "PREDraw") %>%
group_by(id, config) %>%
mutate(PREDrank = rank(PREDraw, ties.method = "first")) %>%
ungroup() %>%
left_join(mutate_all(select(test_hande_df, Alternative, Brand), as.character), by = "Alternative") %>%
replace_na(list(Brand = "Casper")) %>%
nest(new_market = -config)
new_markets_df %>%
head(10)
## # A tibble: 10 × 2
## config new_market
## <chr> <list>
## 1 1 <tibble [1,295 × 5]>
## 2 2 <tibble [1,295 × 5]>
## 3 3 <tibble [1,295 × 5]>
## 4 4 <tibble [1,295 × 5]>
## 5 5 <tibble [1,295 × 5]>
## 6 6 <tibble [1,295 × 5]>
## 7 7 <tibble [1,295 × 5]>
## 8 8 <tibble [1,295 × 5]>
## 9 9 <tibble [1,295 × 5]>
## 10 10 <tibble [1,295 × 5]>
new_markets_df$new_market[[1]] %>%
filter(PREDrank == 1) %>%
count(Brand) %>%
mutate(Perc = 100*n/sum(n))
## # A tibble: 4 × 3
## Brand n Perc
## <chr> <int> <dbl>
## 1 Casper 81 43.8
## 2 HP 10 5.41
## 3 Sony 22 11.9
## 4 Toshiba 72 38.9
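The comparisons below rely on mshares, the brands' first-choice shares in the existing hepsiburada market, which is presumably computed earlier in the document. A minimal reconstruction, assuming that definition:
# hypothetical reconstruction of mshares: take each respondent's top-ranked test
# alternative, attach its brand, and compute the brand shares of those first choices
mshares <- market_df %>%
select(id, test_pred) %>%
unnest(test_pred) %>%
filter(PREDrank == 1) %>%
left_join(select(test_hande_df, Alternative, Brand), by = "Alternative") %>%
count(Brand) %>%
mutate(Perc = 100*n/sum(n)) %>%
with(setNames(Perc, as.character(Brand)))
mshares["Casper"]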
Market shares after each new configuration is introduced to the existing hepsiburada market
new_markets_stat_df <- new_markets_df %>%
mutate(market_shares = map(new_market, ~{
.x %>%
filter(PREDrank == 1) %>%
count(Brand) %>%
mutate(Perc = 100*n/sum(n))
}),
Casper_mshare = map_dbl(market_shares, ~{.x %>% filter(Brand == "Casper") %>% pull("Perc")})) %>%
left_join(config_df, by = "config") %>%
mutate_at(vars(Memory, Price), parse_number, locale = locale(grouping_mark = ".")) %>% # convert Memory and Price to numbers, treating "." as a thousands separator
arrange(desc(Casper_mshare), Price)
new_markets_stat_df %>%
select(-new_market, -market_shares) %>%
filter(Casper_mshare >= mshares["Casper"])
## # A tibble: 82 × 9
## config Casper_mshare Brand Processor Speed Memory HardDisk Screen Price
## <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 228 60.5 Casper Intel Core i7 2.6 G… 16 1 TB 17.3 … 6000
## 2 224 59.5 Casper Intel Core i7 2.6 G… 16 1 TB 15.6 … 6000
## 3 229 58.4 Casper Intel Core i7 2.6 G… 16 1 TB 17.3 … 7000
## 4 230 58.4 Casper Intel Core i7 2.6 G… 16 1 TB 17.3 … 8500
## 5 225 57.8 Casper Intel Core i7 2.6 G… 16 1 TB 15.6 … 7000
## 6 232 57.3 Casper Intel Core i7 2.6 G… 16 750 GB 15.6 … 6000
## 7 236 57.3 Casper Intel Core i7 2.6 G… 16 750 GB 17.3 … 6000
## 8 226 56.2 Casper Intel Core i7 2.6 G… 16 1 TB 15.6 … 8500
## 9 237 55.7 Casper Intel Core i7 2.6 G… 16 750 GB 17.3 … 7000
## 10 101 55.1 Casper Intel Core i5 2.6 G… 16 1 TB 17.3 … 6000
## # … with 72 more rows
new_markets_stat_df %>%
filter(Casper_mshare >= mshares["Casper"]) %>%
select(-new_market, -market_shares) %>%
ggplot(aes(Price, Casper_mshare)) +
geom_hline(yintercept = mshares["Casper"], size=2, col = "yellow") +
geom_jitter(aes(col = Screen), width=200, height=0, alpha = 0.8, size=3) +
scale_x_continuous(labels = scales::comma) +
labs(title = paste("Current Casper market share is", round(mshares["Casper"]), "% (yellow horizontal line)"),
y = "Predicted Casper market share (%)") +
facet_grid(Memory~HardDisk, as.table = FALSE, labeller = label_both) +
theme(strip.text.y = element_text(angle=0))
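As a final convenience, the single most promising configuration under this analysis can be pulled out directly (a small added helper):
new_markets_stat_df %>%
slice_max(Casper_mshare, n = 1, with_ties = FALSE) %>%
select(config, Casper_mshare, Processor:Price)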