krz · September 19, 2022 07:44
diff --git a/tw.R b/tw.R
 library(sampleSelection) # for RandHIE data
 library(dplyr)
 data(RandHIE)

 # combined costs
 RandHIE$cost <- RandHIE$outpdol + 
  RandHIE$drugdol +
  RandHIE$suppdol +
  RandHIE$mentdol +
  RandHIE$inpdol 

 # select relevant variables
 rh <- RandHIE %>% group_by(zper) %>% summarise(cost = first(cost),
                                               age=first(xage),
                                               disea=first(disea),
                                               physlm=first(physlm),
                                               logc=first(logc), #lc
                                               idp=first(idp),#
                                               lpi=first(lpi),#
                                               fmde=first(fmde),
                                               linc=first(linc),
                                               lfam=first(lfam),
                                               female=first(female),
                                               black=first(black),
                                               educdec=first(educdec),
                                               hlthg=first(hlthg)
 )

 # select only people >= 18 and remove missing values
 rh <- rh %>% filter(age >= 18 & !is.na(educdec))
 rh$zper <- NULL

 # remove implausible values
 rh <- rh %>% filter(physlm == 0 | physlm == 1 | black == 0 | black == 1)


 length(which(rh$cost ==0))/nrow(rh) # 18.1 % zeros


 ############## tobit

 library(VGAM)
 tob <- vglm(cost ~ ., tobit(Lower = 0, type.fitted = "censored"), data = rh, maxit=100)
 summary(tob)
 AICc(tob)

 ############## tweedie

 library(cplm)
 tw2 <- cpglm(cost ~ ., data=rh)
 summary(tw2)

 (AIC(tw2) - 2*length(coef(tw2)) ) / -2

 ############## gamma twopart
 rh$zero <- ifelse(rh$cost == 0, 0, 1)  # 0 if zero costs

 h1 <- glm(zero ~ . -cost, family=binomial(link=logit), data=rh)
 summary(h1)
 logLik(h1)

 h2 <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(rh, zero == 1))
 summary(h2)
 logLik(h2)

 AIC(h1) + AIC(h2)


 #######################

 # rmse estimation

 # split into train and test
 set.seed(41)
 idx <- sample(1:nrow(rh), 500, replace=F)

 train.x <- rh[-idx, ]
 test.x <-  rh[idx,]

 ### tobit

 tob.rmse <- vglm(cost ~ . -zero, tobit(Lower = 0, type.fitted = c("censored")), data = train.x, maxit=100)
 summary(tob.rmse)

 preds <- predict(tob.rmse, test.x, type="response")

 #rmse
 sqrt(mean((preds - test.x$cost)^2))


 ### gamma twopart
 h1.rmse <- glm(zero ~ . -cost, family=binomial(link=logit), data=train.x)
 pred2 <- predict(h1.rmse, test.x, type="response")
 #pred2 <- ifelse(pred2 < 0.5, 0, 1) 

 h2.rmse <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(train.x, zero == 1))

 # new
 preds2 <- pred2 * predict(h2.rmse, test.x, type="response")
 #rmse
 sqrt(mean((preds2 - test.x$cost)^2))

 ### tweedie
 train.x$zero <- NULL
 tw.rmse <- cpglm(cost ~., data=train.x)
 preds3 <- predict(tw.rmse, test.x, type="response")
 # rmse
 sqrt(mean((preds3 - test.x$cost)^2))
	library(sampleSelection) # for RandHIE data
	library(dplyr)
	data(RandHIE)

	# combined costs
	RandHIE$cost <- RandHIE$outpdol +
	RandHIE$drugdol +
	RandHIE$suppdol +
	RandHIE$mentdol +
	RandHIE$inpdol

	# select relevant variables
	rh <- RandHIE %>% group_by(zper) %>% summarise(cost = first(cost),
	age=first(xage),
	disea=first(disea),
	physlm=first(physlm),
	logc=first(logc), #lc
	idp=first(idp),#
	lpi=first(lpi),#
	fmde=first(fmde),
	linc=first(linc),
	lfam=first(lfam),
	female=first(female),
	black=first(black),
	educdec=first(educdec),
	hlthg=first(hlthg)
	)

	# select only people >= 18 and remove missing values
	rh <- rh %>% filter(age >= 18 & !is.na(educdec))
	rh$zper <- NULL

	# remove implausible values
	rh <- rh %>% filter(physlm == 0 \| physlm == 1 \| black == 0 \| black == 1)


	length(which(rh$cost ==0))/nrow(rh) # 18.1 % zeros


	############## tobit

	library(VGAM)
	tob <- vglm(cost ~ ., tobit(Lower = 0, type.fitted = "censored"), data = rh, maxit=100)
	summary(tob)
	AICc(tob)

	############## tweedie

	library(cplm)
	tw2 <- cpglm(cost ~ ., data=rh)
	summary(tw2)

	(AIC(tw2) - 2*length(coef(tw2)) ) / -2

	############## gamma twopart
	rh$zero <- ifelse(rh$cost == 0, 0, 1) # 0 if zero costs

	h1 <- glm(zero ~ . -cost, family=binomial(link=logit), data=rh)
	summary(h1)
	logLik(h1)

	h2 <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(rh, zero == 1))
	summary(h2)
	logLik(h2)

	AIC(h1) + AIC(h2)


	#######################

	# rmse estimation

	# split into train and test
	set.seed(41)
	idx <- sample(1:nrow(rh), 500, replace=F)

	train.x <- rh[-idx, ]
	test.x <- rh[idx,]

	### tobit

	tob.rmse <- vglm(cost ~ . -zero, tobit(Lower = 0, type.fitted = c("censored")), data = train.x, maxit=100)
	summary(tob.rmse)

	preds <- predict(tob.rmse, test.x, type="response")

	#rmse
	sqrt(mean((preds - test.x$cost)^2))


	### gamma twopart
	h1.rmse <- glm(zero ~ . -cost, family=binomial(link=logit), data=train.x)
	pred2 <- predict(h1.rmse, test.x, type="response")
	#pred2 <- ifelse(pred2 < 0.5, 0, 1)

	h2.rmse <- glm(cost ~ . -zero, family = Gamma(link = log), data=subset(train.x, zero == 1))

	# new
	preds2 <- pred2 * predict(h2.rmse, test.x, type="response")
	#rmse
	sqrt(mean((preds2 - test.x$cost)^2))

	### tweedie
	train.x$zero <- NULL
	tw.rmse <- cpglm(cost ~., data=train.x)
	preds3 <- predict(tw.rmse, test.x, type="response")
	# rmse
	sqrt(mean((preds3 - test.x$cost)^2))