Skip to content

Latest commit

 

History

History
314 lines (268 loc) · 6.68 KB

Medical_Cost_Analysis_SQL_Syntax.md

File metadata and controls

314 lines (268 loc) · 6.68 KB
/*Age Categories*/

CREATE TEMP TABLE AGECAT AS -- Creating TEMP TABLE AGECAT
SELECT 
  age, region, charges,

  CASE 
  WHEN age <= 19 THEN 'Teen'
  WHEN age BETWEEN 20 AND 39 THEN 'Adult'
  WHEN age BETWEEN 40 AND 59 THEN 'Middle Age'
  WHEN age >= 60 THEN 'Senior'
  END AS AgeCategory
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;
/*Age Category Averages*/
SELECT 
  DISTINCT AgeCategory, 
  COUNT(*) OVER (PARTITION BY AgeCategory) AS AgeCategoryCount,
  ROUND(AVG(charges) OVER (PARTITION BY AgeCategory),2) AS AvgChargeAge
FROM AGECAT
ORDER BY AvgChargeAge;

Result

AgeCategory AgeCategoryCount AvgChargeAge
Teen 137 8407.35
Adult 537 10603.65
Middle Age 550 15431.97
Senior 114 21248.02
/*Correlation Between Age and Charges*/
SELECT
  ROUND(CORR(Age,charges),2) AS CORR_Age_Charges
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;
-- Low correlation between age and charges 

Result

CORR_Age_Charges
0.30
/*Distribution of Charges*/
SELECT 
  DISTINCT TRUNC(charges,-3) AS Charge_Bin, 
  COUNT(*) AS Count
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
GROUP BY Charge_Bin
ORDER BY Charge_Bin;
--Right skew of distribution

Result

Charge_Bin Count
0.0 712
10000.0 353
20000.0 111
30000.0 83
40000.0 72
50000.0 4
60000.0 3
/*Male Distrubution of Charges*/
SELECT  
  DISTINCT sex,
  TRUNC(charges,-4) AS charge_bins,
  COUNT(*) AS Count
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
WHERE Sex = 'male'
GROUP BY sex,charge_bins
ORDER BY charge_bins;
--Right skew

Result

sex charge_bins count
male 0.0 357
male 10000.0 161
male 20000.0 55
male 30000.0 56
male 40000.0 43
male 50000.0 2
male 60000.0 2
/*Female Distribution of Charges*/
SELECT  
  DISTINCT sex,
  TRUNC(charges,-4) AS charge_bins,
  COUNT(*)
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
WHERE Sex = 'female'
GROUP BY sex,charge_bins
ORDER BY charge_bins;
-- Right skew

Result

sex charge_bins Count
female 0.0 355
female 10000.0 192
female 20000.0 56
female 30000.0 27
female 40000.0 29
female 50000.0 2
female 60000.0 1
/*Male and Female Average Charges*/
SELECT 
  DISTINCT Sex, 
  ROUND(AVG(charges) OVER (PARTITION BY sex),2) AS Avg_Charges_Sex
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;

Result

Sex Avg_Charges_Sex
male 13956.75
female 12569.58
/*Male and Female Median Charges*/
SELECT 
  DISTINCT Sex,
  ROUND(PERCENTILE_CONT(charges, 0.50) OVER (PARTITION BY Sex),2) AS Sex_Median
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;

Result

Sex Sex_Median
male 9369.62
female 9412.96
/*Smoker Region Totals*/
SELECT
  DISTINCT region,smoker, COUNT(smoker) OVER (PARTITION BY region, 
  smoker) AS SmokerRegionCount,COUNT(smoker) OVER () AS TotalSmokerCount,
  ROUND((COUNT(smoker) OVER (PARTITION BY region, smoker)/COUNT(smoker) OVER ()*100),0) AS SmokerPercentage
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
WHERE smoker = true
ORDER BY SmokerRegionCount;
--Southwest has the least amount of smokers

Result

region smoker SmokerRegionCount TotalSmokerCount SmokerPercentage
northwest true 58 274 21.0
southwest true 58 274 21.0
northeast true 67 274 24.0
southeast true 91 274 33.0
SELECT
  Smoker, 
  ROUND(AVG(charges),2) AS Avg_Smoking_Charges
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
GROUP BY Smoker;

Result

Smoker Avg_Smoking_Charges
false 8434.27
true 32050.23
/*Comparing charges for smokers to non-smokers*/
SELECT
  ROUND(32050.23/8434.27);
--smoking accounts for an average cost that is about four times greater than a non-smoker

Result

Smoker_Non_Smoker_Comparsion
4.0
/*Smokers by Region - Sex*/
SELECT
  DISTINCT sex,region,smoker, COUNT(smoker) OVER (PARTITION BY region, sex, smoker) AS SmokerRegionCount,
  COUNT(smoker) OVER () AS SmokerTotal, 
  ROUND((COUNT(smoker) OVER (PARTITION BY region, sex, smoker)/COUNT(smoker) OVER () * 100),0) AS RegionFemaleSmokerPercentage
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
WHERE smoker = true
ORDER BY SmokerRegionCount;
--Southwest has lowest number of female smokers

Result

sex region smoker SmokerRegionCount SmokerTotal RegionSexSmokerPercentage
female southwest true 21 274 8.0
female northwest true 29 274 11.0
female northeast true 29 274 11.0
male northwest true 29 274 11.0
female southeast true 36 274 13.0
male southwest true 37 274 14.0
male northeast true 38 274 14.0
male southeast true 55 274 20.0
/*Children and Charge Correlation*/
SELECT 
  ROUND(CORR(children, charges),2) AS CORR_CO_Charges
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;

Result

CORR_CO_Charges
0.07
/*Region Median Charges*/
SELECT 
   DISTINCT Region,
   ROUND(PERCENTILE_CONT(charges, 0.50) OVER (PARTITION BY Region),2) AS Region_Median,
  ROUND(PERCENTILE_CONT(charges, 0.50) OVER (),2) AS Overall_Median
FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`
ORDER BY Region_Median;

Result

Region Region_Median Overall_Median
southwest 8798.59 9382.03
northwest 8965.80 9382.03
southeast 9294.13 9382.03
northeast 10057.65 9382.03
/*BMI Categories*/
CREATE TEMP TABLE BMI_Table AS
SELECT 
*,
  CASE 
  WHEN BMI < 18.5 THEN 'Underweight'
  WHEN BMI >= 18.5 AND BMI < 25 THEN 'Normal'
  WHEN BMI >= 25 AND BMI < 30 THEN 'Overweight'
  WHEN BMI >= 30 THEN 'Obese'
  END AS BMI_Category,

  CASE 
  WHEN BMI < 30 THEN 'Not Obese'
  WHEN BMI >= 30 THEN 'Obese'
  END Obese

FROM `single-being-353600.Medical_Insurance_Data.Insurance_Data`;

SELECT 
  BMI_Category,
  ROUND(AVG(charges),2) AS Avg_Cost
FROM BMI_Table
GROUP BY BMI_Category
ORDER BY Avg_Cost;
SELECT 
  BMI_Category,
  ROUND(AVG(charges),2) AS Avg_Cost
FROM BMI_Table
GROUP BY BMI_Category
ORDER BY Avg_Cost;

Result

BMI_Category Avg_Cost
Underweight 8852.2
Normal 10409.34
Overweight 10987.51
Obese 15552.34
/Calculating Avg and Median Charges of BMI Subgroups*/
SELECT 
  DISTINCT Obese,
  ROUND(AVG(charges) OVER (PARTITION BY Obese),2) AS Avg_Charges_Obese,
  PERCENTILE_DISC(charges,0.50) OVER (PARTITION BY Obese) AS Obese_Category_Median
FROM BMI_Table
WHERE Smoker IS TRUE

Result

Obese Avg_Charges_Obese Obese_Category_Median
Not Obese 21363.22 20167.34
Obese 41557.99 40904.2