Urban Greenspace and Depression Prevalence in Denver¶

Cover image: park over I-70 in Denver, courtesy of Planetizen and CDOT (Hammon 2023)

Overview¶

In this project I will focus on Denver, Colorado. I will use health outcome data from the CDC, specifically % depression prevalence (accessed through the Wayback Machine because the original data was taken down), as well as aerial multispectral data for the City and County of Denver. Together these will help answer the question of whether vegetation-related factors can predict % depression prevalence by census tract.

Denver, CO was chosen because I hope to use it as the site for the final project. Working with Denver here gives me practice with the spectral data I will need and lets me see whether this analysis, or pieces of it, can carry over to that project. Additionally, I chose Philadelphia for the previous urban greenspace project in the fall course and wanted a different city this time. I am most familiar with Denver and can use my existing knowledge of the city to guide any research and analyses.

The health outcome of % depression prevalence was chosen because I have previously read articles on the positive effect that greenspace has on mental health in urban planning, and I was curious whether a predictive model such as OLS regression, evaluated by computing error, would show a predictive relationship between the two (Reklaitiene et al. 2014).

Denver, like many other cities, has uneven access to urban greenspace that is rooted in racial discrimination and other injustices (Rigolon and Németh 2018). Redlining during the New Deal era had lasting effects on the distribution of urban greenspace (Chen, Chavez-Norgaard, and University of Richmond 2025). One of these lasting effects is the 'inverted L', discussed further in the 'Site Description' section below; in short, the inverted L exemplifies a geographic divide in Denver that is tied to racial and socioeconomic factors (Sachs 2018). This context for the distribution of urban greenspace in Denver will inform the conclusions of the predictive model: whether vegetation-related factors can predict % depression prevalence by census tract, and if so, whether that pattern follows the 'inverted L'.

Citations:¶

  • Chen, Victor, Stefan Chavez-Norgaard, and University of Richmond. 2025. “Mapping Inequality: Denver.” Mapping Inequality: Redlining in New Deal America. University of Richmond. 2025. https://dsl.richmond.edu/panorama/redlining/map/CO/Denver/context#loc=12/39.6994/-104.9581.

  • Hammon, Mary. 2023. “Opening of Denver’s New Freeway Cap Park Triggers Gentrification Fears.” Planetizen.com. 2023. https://www.planetizen.com/news/2023/12/126716-opening-denvers-new-freeway-cap-park-triggers-gentrification-fears. (Source of cover photo.)

  • Reklaitiene, Regina, Regina Grazuleviciene, Audrius Dedele, Dalia Virviciute, Jone Vensloviene, Abdonas Tamosiunas, Migle Baceviciene, et al. 2014. “The Relationship of Green Space, Depressive Symptoms and Perceived General Health in Urban Population.” Scandinavian Journal of Public Health 42 (7): 669–76. https://doi.org/10.1177/1403494814544494.

  • Rigolon, Alessandro, and Jeremy Németh. 2018. “What Shapes Uneven Access to Urban Amenities? Thick Injustice and the Legacy of Racial Discrimination in Denver’s Parks.” Journal of Planning Education and Research 41 (3): 0739456X1878925. https://doi.org/10.1177/0739456x18789251.

  • Sachs, David. 2018. “This Shape Explains Denver’s Past, Present and Likely Its Future.” Denverite. December 21, 2018. https://denverite.com/2018/12/21/denver-socioeconomic-map-shape/.

Site Description¶

Redlining map of Denver, CO, courtesy of Mapping Inequality (Nelson and Winling 2023)

The map above is the redlining map of Denver. For basic orientation, it shows the grid Denver is based on, with two axes: Broadway running north to south and Colfax running west to east (Nelson and Winling 2023). At the time the redlining maps were made (1935-1940), I-70 and I-25 had not yet been constructed, but the corridors these highways would follow are clearly visible on the redlining map (Nelson and Winling 2023). I-25 runs north to south with a bend at its southern end; on the redlining map, I-25 would later begin in the southeast where there were already railroad tracks, run northwest to meet the South Platte River, then continue north alongside the river. I-70 would later cut through the Globeville, Elyria, and Swansea areas, which had received the 'hazardous' redlining grade, then run east along an existing road and railroad tracks and west along 48th Ave. The areas along the western portion of I-70 had been graded 'hazardous' and 'definitely declining', and the highway cut through pre-existing neighborhoods that were majority immigrant populated (Chen, Chavez-Norgaard, and University of Richmond 2025).

Series of 'inverted L' maps of Denver, CO, courtesy of Denverite (Sources: Denver Office of Economic Development, Denver Parks and Recreation, United States Census Bureau) (Sachs 2018)

Knowing the orientation of Denver, and a short context for how and why it is oriented that way, is important for understanding the uneven distribution of urban greenspace and the significance behind it. Denver has uneven access to urban greenspace that is rooted in racial discrimination and other injustices from redlining, gerrymandering, segregation, and racially restrictive covenants (Rigolon and Németh 2018). Because parks were a large push of the City Beautiful Movement, which was about increasing the city's prestige, the majority of the parks created were in upscale white neighborhoods that paid higher property taxes (Rigolon and Németh 2018). Some of the major or flagship parks in Denver are Cheesman Park (just north of Colfax and just east of Broadway), Washington Park (the slender rectangle south of Colfax), and City Park (north of Colfax, east of Broadway). All of those parks are inside the 'inverted L', which can be seen on the redlining map as well as the graphic above. Recent research shows that the areas inside the 'inverted L' (east of I-25 and south of I-70) are predominantly white, not vulnerable to displacement, more educated, and have more trees (Sachs 2018). Since the series of maps above were made, the trees planted by the city have gone in outside of the 'inverted L' (Sachs 2021). While there has been work to undo the inequitable effects of redlining policies and, hopefully, to someday create equal access to and use of urban greenspace, to this day those flagship parks predominantly serve the city's most affluent groups, who are mostly White (Cernansky 2019).

For this project, because the focus is on the relationship between a health outcome (% depression prevalence) and urban greenspace, the idea behind the 'inverted L' will guide further analyses: first, to see whether vegetation-related variables and % depression prevalence have a predictive relationship, and second, if they do, whether it follows the 'inverted L' pattern seen in socioeconomic variables. I hypothesize that % depression prevalence will be higher outside of the 'inverted L' because there is less greenspace there.

Citations:¶

  • Cernansky, Rachel. 2019. “Unequal Access to Parks in Denver Has Roots in History.” Collective Colorado. July 16, 2019. https://collective.coloradotrust.org/stories/unequal-access-to-parks-in-denver-has-roots-in-history/.

  • Chen, Victor, Stefan Chavez-Norgaard, and University of Richmond. 2025. “Mapping Inequality: Denver.” Mapping Inequality: Redlining in New Deal America. University of Richmond. 2025. https://dsl.richmond.edu/panorama/redlining/map/CO/Denver/context#loc=12/39.6994/-104.9581.

  • Nelson, Robert K, and LaDale Winling. 2023. “Mapping Inequality: Redlining in New Deal America.” In American Panorama: An Atlas of United States History, edited by Robert K Nelson and Edward L. Ayers. https://dsl.richmond.edu/panorama/redlining.

  • Rigolon, Alessandro, and Jeremy Németh. 2018. “What Shapes Uneven Access to Urban Amenities? Thick Injustice and the Legacy of Racial Discrimination in Denver’s Parks.” Journal of Planning Education and Research 41 (3): 0739456X1878925. https://doi.org/10.1177/0739456x18789251.

  • Sachs, David. 2018. “This Shape Explains Denver’s Past, Present and Likely Its Future.” Denverite. December 21, 2018. https://denverite.com/2018/12/21/denver-socioeconomic-map-shape/.

  • Sachs, David. 2021. “How Denver Is Chipping Away at the Inverted L: Housing and Trees Edition.” Denverite. April 7, 2021. https://denverite.com/2021/04/07/how-denver-is-chipping-away-at-the-inverted-l-housing-and-trees-edition/.

Data Description¶

* CDC Places Data¶

Internet Archive logo (left) and screenshot of the CDC datasets page (right), courtesy of the Internet Archive (Internet Archive 2014 and Centers for Disease Control and Prevention via Internet Archive 2025)

CDC Places Data (prior to it being taken down)¶

Prior to 2020, CDC PLACES was known as 500 Cities; instead of having full coverage across the US, including rural areas, it covered only 500 cities (CDC 2021, "About the Places Project"). In that phase, the datasets also only included chronic diseases (CDC 2021, "About the Places Project"). Since 2020 the project has expanded the measures included in the datasets, and the measures continue to increase each year new datasets are released (CDC 2021, "Measure Definitions"). It appears new datasets are released every year rather than previous years' datasets being updated (CDC 2021, "About the Places Project").

If you are curious about the methodology CDC PLACES uses and how the data is validated, please visit the CDC PLACES methodology page. (Since writing this, that page has been taken down; the reference is kept here for purposes of retaining information.)

Because the data is tied to the Census (which is conducted only every 10 years), research is needed to determine which Census a given year of CDC PLACES data is using (CDC 2021, "About the Places Project"). For example, the 2024 CDC PLACES data uses the 2020 Census, not the 2010 Census used by prior years of PLACES data (CDC 2021, "About the Places Project"). The CDC datasets also differ by geography type or administrative boundary (CDC 2021, "About the Places Project"). While this project uses census tracts as the boundary, there are other options available such as U.S. counties, census designated places, and ZIP Code Tabulation Areas (ZCTAs) (CDC 2021, "PLACES"). The goal of a project would likely determine which geography to use. CDC PLACES data includes all areas of the United States with 50 or more adult residents (CDC 2021, "About the Places Project").

Prior to the CDC data being taken down, it was free to access, which contributed to open data science efforts. The data was considered public domain and, as of 2/2/2025, did not require or have a specific license to use or manipulate.

(Prior to 2/2/2025) The data itself could be accessed in multiple formats, including JSON, GeoJSON (which is what I used), and CSV when retrieved via the API. If downloading instead, it could be accessed in even more formats beyond those listed (CDC 2021, "PLACES"). This is worth exploring and considering depending on the project the data is being used for and how it will be used. As of 2/20/2025 I was able to call the CDC API to get the census tracts associated with the CDC PLACES data without issue; however, when I got to downloading the health outcome data (asthma, depression, stroke, etc.), I was no longer able to access it via the same API call. Instead, I used the Internet Archive to access data that was previously available on the CDC website (CDC 2025, Internet Archive - CDC Datasets).
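
For reference, a query against the original PLACES endpoint looked roughly like the minimal sketch below. The endpoint has since been taken down, and the filter field names here are assumptions based on the dataset's column names, so they may need adjusting.

import pandas as pd

# Hypothetical query against the original CDC PLACES Socrata-style CSV endpoint
# (taken down after 2/2/2025); field names are assumptions and may differ
cdc_api_url = (
    "https://data.cdc.gov/resource/cwsq-ngmh.csv"
    "?stateabbr=CO"
    "&countyname=Denver"
    "&measureid=DEPRESSION"
    "&$limit=1500"
)
den_depression_df = pd.read_csv(cdc_api_url)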

CDC Places Data (via the Internet Archive)¶

The Internet Archive is a non-profit that has been archiving websites and other digital assets since 1996 (Internet Archive 2009). What they archive is quite vast, including webpages, books/texts, audio recordings, videos (including TV news), images, software, etc. (Internet Archive 2009). They are funded by many different organizations, including the American Library Association and the Biodiversity Heritage Library (Internet Archive 2009). The archive is free to use and access; their goal is to make knowledge publicly accessible, with one exception: books published after 1928 can only be borrowed (Internet Archive 2009). So this also contributes to open science efforts, and no particular license is required to access the information the Internet Archive holds (Internet Archive 2009).

Anything can be searched via the 'Wayback Machine', the archive's search bar: a website, a book, or any of the formats listed previously. Searches return many results that take time to sift through to find what you are looking for (Internet Archive 2014). For this project the focus is the CDC datasets, which were taken off the CDC website around January 31st. I was able to access the CDC dataset on 2/2/2025 via API call, but was unable to call the API after that for the CSV data about health outcomes. This led me to the Internet Archive to access the data that was taken down.

The CDC datasets saved on the Internet Archive are set up differently and are organized by type of CDC dataset, so I navigated to the comma-separated values (CSV) files because I needed a CSV to merge the depression data with the geometry of the census tracts. There are a large number (2000+) of different CSV datasets available there: everything from COVID-19 data, impaired driving deaths, and Medicaid coverage to the PLACES data (which is what I needed) and so on (CDC 2025, Internet Archive - CDC Datasets). I used the file "PLACES_Local_Data_for_Better_Health_Census_Tract_Data_2022_release.csv" because I needed the 'PLACES - local data for better health' for census tracts, and I chose 2022 because it is only 512M, versus 617M for 2023 and 726M for 2024 (CDC 2025, Internet Archive - CDC Datasets). Even though I ended up using an API call, I knew I would need to download the entire dataset to see what I was working with in order to set up the API call with the correct spelling/capitalization/etc., and I was more comfortable downloading 512M than the larger files. There was also a metadata CSV available to download on the Internet Archive; however, after downloading it, it did not tell me the names of columns or anything else that would help with the API call (CDC 2025, Internet Archive - CDC Datasets).

In looking at the downloaded dataset, the data was actually from a mix of years, mainly 2020, but also a few rows from 2021 and 2022; it also had different capitalization and column names than the original CDC dataset. All of this should be kept in mind if you are using the Internet Archive for a dataset you previously accessed through the original source: the datasets might be set up differently, and while the data is there, it may not be exactly what it was when it was queried or called through the original source. Despite these complications, I am very grateful that this dataset, as well as all the other CDC datasets, is still accessible via the Internet Archive. There are other entities, mostly private or non-profit, that have also secured those CDC datasets, so it is possible to find the data another way, but that would likely take more digging and possibly require requesting the data instead of the anytime access the Internet Archive provides.

CDC Places Citations:¶

  • CDC Places data accessed previously - https://data.cdc.gov/resource/cwsq-ngmh.geojson (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2020. “PLACES Methodology.” Centers for Disease Control and Prevention. December 8, 2020. https://www.cdc.gov/places/methodology/index.html. (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2021. “About the PLACES Project.” Centers for Disease Control and Prevention. October 18, 2021. https://www.cdc.gov/places/about/index.html. (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2021. “Measure Definitions.” Centers for Disease Control and Prevention. October 18, 2021. https://www.cdc.gov/places/measure-definitions/index.html. (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2021. “PLACES: Local Data for Better Health.” Centers for Disease Control and Prevention. March 8, 2021. https://www.cdc.gov/PLACES. Data initially Accessed via API [2/2/2025] (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2024. “Data | Centers for Disease Control and Prevention.” Data.CDC.gov. 2024. https://data.cdc.gov/browse?category=500+Cities+%26+Places&q=2024&sortBy=relevance&tags=places. (Note: this link takes you to all the localities and versions available, not the specific one used here.) (Since writing this, this page has been taken down, keeping for purposes of retaining information).

  • Centers for Disease Control and Prevention (CDC). 2025. “CDC Datasets Uploaded before January 28th, 2025 : Centers for Disease Control and Prevention : Free Download, Borrow, and Streaming : Internet Archive.” Internet Archive - CDC Datasets. January 28, 2025. https://archive.org/details/20250128-cdc-datasets.

  • Internet Archive. 2009. “About the Internet Archive.” Archive.org. 2009. https://archive.org/about/.

  • Internet Archive. 2014. “Internet Archive.” Archive.org. 2014. https://archive.org.

* NAIP via Microsoft Planetary Computer STAC API - Multispectral Data¶

Graphic of the Microsoft Planetary Computer (GeoNext and Halder 2024) (top); 'Series of High Resolution Orthoimagery (2008, 2010, and 2012) of the Hoover Dam Bypass Project' (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center 2018) (bottom)

The data source that will be used for the urban greenspace element of this project is NAIP (National Agriculture Imagery Program) (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). NAIP data can be used for many things beyond what this case study aims to do; in fact it is usually used for agriculture, and it is worth exploring whether it is applicable or useful in other projects, agriculture related and beyond (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). NAIP is aerial imagery with 1 m resolution and coverage of the continental U.S., with either red, green, and blue bands or near-infrared, red, green, and blue bands (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). Similar to satellite imagery, the devices used to capture the images have multiple sensors, each for a different reflective spectral band of the electromagnetic spectrum (Science Education through Earth Observation for High Schools (SEOS) and European Association of Remote Sensing Laboratories n.d.). A band alone cannot convey much, but the relationship between two or more bands can; these relationships are normalized spectral indices such as NDMI, NDVI, and others (United States Geological Survey (USGS) n.d.). More detailed information about remote sensing can be found here (Science Education through Earth Observation for High Schools (SEOS) and European Association of Remote Sensing Laboratories n.d.). NDVI will be used in this project to calculate the vegetation statistics, which will in turn be used to see whether those variables can predict % depression prevalence in Denver.
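
As a reminder of how NDVI relates the bands, here is a minimal sketch; the file path is hypothetical, and the band order (red, green, blue, near-infrared) is an assumption that should be checked against the NAIP asset actually being used.

import rioxarray as rxr

# Open a NAIP tile (hypothetical path/URL) as a masked DataArray
naip_da = rxr.open_rasterio('naip_tile_example.tif', masked=True)

# Band order assumed to be red, green, blue, near-infrared (bands 1-4)
red = naip_da.sel(band=1)
nir = naip_da.sel(band=4)

# NDVI = (NIR - Red) / (NIR + Red); values near 1 indicate dense, healthy vegetation
ndvi_da = (nir - red) / (nir + red)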

To access the NAIP data, the Microsoft Planetary Computer STAC (SpatioTemporal Asset Catalog) API was used. The Microsoft Planetary Computer is made up of three main components: the data catalog (this project uses the STAC), APIs (which this project uses to search for the data needed), and applications (Microsoft, n.d.). The NAIP data is freely accessible via the STAC, and an account is not needed; the data and API can be used anonymously (Microsoft, n.d.). To interact with the Microsoft Planetary Computer, the 'pystac_client' module needs to be imported (as shown in the NAIP Wrangle section below); depending on the Python environment being used, other imports may also be necessary (Microsoft, n.d.). The NAIP data comes from a 'professional' source (USGS) and is gathered through aerial imagery that is orthorectified, so there should be a degree of trust in the data, though that depends on how and for what it is being used (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.).

The NAIP data available can vary depending on the timeframe used: the data was collected every 5 years from 2003 to 2009, then moved to a 3-year interval, and within a collection year the imagery is only captured during the agricultural growing season (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). That is something to keep in mind when setting up the temporal parameter: if 2004, or December 31, 2003, were set as the 'datetime', no data would be returned, because no imagery was captured in 2004, and while 2003 was captured, that day and month are not within the agricultural growing season (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). Another element to keep in mind is cloud coverage, of which there can be as much as 10% per tile (United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center, n.d.). When working with imagery it is best to get 'clear' images with little to no cloud coverage for the best results. Fortunately, because this data is only captured during the growing season, when vegetation is healthiest and has the greatest percentage reflection, there should in theory be less cloud coverage than in the non-growing season, which should give better image results.

The reason NAIP was chosen as the data source is that other sources, like Harmonized Landsat Sentinel-2, have a lower resolution of 30 m, which would not capture the structure of the greenspace, such as edge density, mean patch size, and fragmentation (NASA 2023). These vegetation variables (edge density, mean patch size, and fragmentation) are statistics that attempt to quantify the connectivity and structure of greenspace. Edge density is the total length of all patch edges (perimeter) divided by the total area of the landscape (for this project, the total area of the census tract) (Ene and Mcgarigal 2023, “(L4) Edge Density”). Mean patch size is the mean of the patch areas across all patches in the landscape (for this project, all patches in a census tract) (Ene and Mcgarigal 2023, “Landscape Metrics”). Fragmentation is the number of patches or the edge density in a specified area (Fahrig 2023). The graphic below helps visualize what some of these terms mean; it is about habitat, so instead think of the landscape as the census tract and the patch as a vegetation pixel or group of vegetation pixels in that census tract. More patches within a census tract also means greater fragmentation and edge density (Fahrig 2023). A sketch of how these metrics could be computed from a vegetation mask follows the graphic below.

Graphic explaining landscape metrics: 'For a given total amount of habitat in a landscape (sum of green areas within the large squares), the total length of habitat edge (in darker green) increases with increasing habitat fragmentation.' (Fahrig 2023)
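
As a minimal sketch (not the exact FRAGSTATS definitions), the patch and edge metrics could be approximated from a binary vegetation mask as below. The mask, its shape, and the NDVI threshold are hypothetical, and edge length is approximated by counting edge pixels rather than measuring exact perimeters.

import numpy as np
from scipy.ndimage import convolve, label

# Hypothetical binary vegetation mask for one census tract
# (1 = vegetated pixel, 0 = not vegetated), e.g. NDVI > 0.3 at NAIP's 1 m resolution
veg_mask = (np.random.rand(200, 200) > 0.7).astype(int)

# Patches: groups of connected vegetated pixels
labeled_patches, num_patches = label(veg_mask)

# Mean patch size in pixels (~m^2 at 1 m resolution), skipping background label 0
patch_sizes = np.bincount(labeled_patches.ravel())[1:]
mean_patch_size = patch_sizes.mean() if num_patches > 0 else 0

# Edge pixels: vegetated pixels with at least one non-vegetated 4-neighbor
kernel = np.array([[0, 1, 0],
                   [1, 0, 1],
                   [0, 1, 0]])
neighbor_count = convolve(veg_mask, kernel, mode='constant', cval=0)
edge_pixels = int(((veg_mask == 1) & (neighbor_count < 4)).sum())

# Edge density: edge pixels per unit landscape (tract) area
edge_density = edge_pixels / veg_mask.size

print(num_patches, mean_patch_size, edge_density)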

NAIP Citations:¶

  • Ene, Eduard, and Kevin Mcgarigal. 2023. “Landscape Metrics.” Fragstats.org. 2023. https://fragstats.org/index.php/background/landscape-metrics.

  • Ene, Eduard, and Kevin Mcgarigal. 2023. “(L4) Edge Density.” Fragstats.org. 2023. https://fragstats.org/index.php/fragstats-metrics/patch-based-metrics/area-and-edge-metrics/l4-edge-density.

  • Fahrig, Lenore. 2023. “Patch‐Scale Edge Effects Do Not Indicate Landscape‐Scale Fragmentation Effects.” Conservation Letters 17 (1). https://doi.org/10.1111/conl.12992.

  • GeoNext, and Krishnagopal Halder. 2024. “Getting Started with Microsoft Planetary Computer STAC API.” Medium. March 19, 2024. https://medium.com/@geonextgis/getting-started-with-microsoft-planetary-computer-stac-api-67cbebe96e5e.

  • Microsoft. n.d. “About the Microsoft Planetary Computer.” Planetarycomputer.microsoft.com. https://planetarycomputer.microsoft.com/docs/overview/about/.

  • NASA. 2023. “L30 – Harmonized Landsat Sentinel-2: Products Description.” Nasa.gov. 2023. https://hls.gsfc.nasa.gov/products-description/l30/.

  • Science Education through Earth Observation for High Schools (SEOS), and European Association of Remote Sensing Laboratories. n.d. “Introduction to Remote Sensing.” Seos-Project.eu. Accessed February 22, 2025. https://seos-project.eu/remotesensing/remotesensing-c01-p06.html.

  • United States Geological Survey (USGS). n.d. “Landsat Surface Reflectance-Derived Spectral Indices | U.S. Geological Survey.” Www.usgs.gov. USGS. Accessed February 22, 2025. https://www.usgs.gov/landsat-missions/landsat-surface-reflectance-derived-spectral-indices.

  • United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center. 2018. “USGS EROS Archive - Aerial Photography - High Resolution Orthoimagery (HRO) | U.S. Geological Survey.” Www.usgs.gov. July 6, 2018. https://www.usgs.gov/centers/eros/science/usgs-eros-archive-aerial-photography-high-resolution-orthoimagery-hro.

  • United States Geological Survey (USGS) - Earth Resources Observation and Science (EROS) Center. n.d. “USGS EROS Archive - Aerial Photography - National Agriculture Imagery Program (NAIP) | U.S. Geological Survey.” Www.usgs.gov. https://www.usgs.gov/centers/eros/science/usgs-eros-archive-aerial-photography-national-agriculture-imagery-program-naip.

Methods Description¶

OLS regression will be used as the model to find out whether there is a statistically significant relationship between depression and greenspace, using % depression prevalence and vegetation-related variables. Models can be used for different purposes in earth data science, one of which is prediction. Here, the goal is to see whether the model can predict depression prevalence accurately. To evaluate the model, one could look at calculated error, or calculate R squared, which says what percent of the variation in depression prevalence can be explained by the model. For this project calculated error is chosen.

Important factors when choosing a model are the assumptions it makes about the data and whether the model is appropriate given the data. Some important assumptions of OLS regression are: linearity (there is a linear relationship between the variables), normally distributed error (the data shouldn't have long tails), independence (avoid co-linearity of tightly correlated variables), stationarity (the parameters of the model should not vary over time), and complete observations (there shouldn't be large amounts of no-data values).

The data can be manipulated or adjusted to be a better fit; for example (see the sketch after this list):

  • No data values can be dropped to account for complete observations using .dropna()
  • Log of variables can be taken to make the data more normally distributed and not have long tails using np.log.
  • Normalization/standardization of data to account for the different scales of the variables
  • Etc.
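
A minimal sketch of those adjustments is below, using a tiny hypothetical merged dataframe `model_df` with a vegetation fraction column 'frac_veg' and the 'depression' column; the column names and values are placeholders only.

import numpy as np
import pandas as pd

# Hypothetical merged tract data (placeholder values)
model_df = pd.DataFrame({
    'tract': [1, 2, 3, 4],
    'frac_veg': [0.25, 0.40, np.nan, 0.10],
    'depression': [21.5, 18.2, 20.0, 23.1],
})

# Complete observations: drop tracts with missing values
model_df = model_df.dropna(subset=['frac_veg', 'depression'])

# Reduce long tails with a log transform
model_df['log_depression'] = np.log(model_df['depression'])

# Put variables on a comparable scale (z-score standardization)
model_df['frac_veg_std'] = (
    (model_df['frac_veg'] - model_df['frac_veg'].mean())
    / model_df['frac_veg'].std()
)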

For this project, independence and stationarity are not major concerns, but there are no-data values, the data has tails, and the two variables have very different scales, so the data needs those fixes to fit the model. There is a delicate balance between fitting and overfitting. One potential issue with this model is that the chosen variables may not have a linear relationship, so the results may be muddy or may not show a relationship between the variables. Another potential issue is overfitting the model: making so many adjustments to the data to 'fit it' that the model ends up fitting the noise, producing very low error while saying little about a possible relationship between the variables. Because overfitting in particular is a worry here, a way to avoid it is cross validation.

Cross validation is when the data is split into a training and a testing set; for this project, the model is trained on 66% of the data and its performance is then tested on the remaining 33%. The split could be divided differently, for example 70% training and 30% testing. The process of splitting the data into training and testing sets can be done iteratively with randomly selected data, but that applies to a different type of model than the one used here (a random forest model does this iteratively; OLS regression does not).
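
A minimal sketch of that split and fit, reusing the hypothetical `model_df` and column names from the sketch above:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Predictor(s) and response (hypothetical column names)
X = model_df[['frac_veg_std']]
y = model_df['log_depression']

# Hold out roughly a third of the tracts for testing (the 66/33 split described above)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Fit OLS regression on the training set and predict on the held-out test set
ols_model = LinearRegression().fit(X_train, y_train)
y_pred = ols_model.predict(X_test)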

A way to evaluate the model's results is computing model error. For this project, error is calculated as (predicted % depression prevalence - measured % depression prevalence) for each census tract. This model error will be plotted with a diverging color scheme to see what can be noticed visually and whether there is any explanation for bias in the model results.
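
A minimal sketch of that error map, assuming a hypothetical GeoDataFrame `pred_gdf` of census tracts with 'depression_pred' (model prediction) and 'depression' (measured) columns:

import hvplot.pandas  # noqa: F401 - registers .hvplot on (Geo)DataFrames

# Error = predicted - measured, per census tract
pred_gdf['error'] = pred_gdf['depression_pred'] - pred_gdf['depression']

# Diverging colormap with limits symmetric about zero so over- and
# under-predictions are visually distinct
lim = float(pred_gdf['error'].abs().max())
error_plot = pred_gdf.hvplot.polygons(
    geo=True, c='error', cmap='RdBu_r', clim=(-lim, lim),
    title='Model Error: Predicted - Measured % Depression Prevalence')

# Display the plot
error_plot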

Personally, not having much experience in earth data science and statistics, I am unfamiliar with other possible models that could be used here. Others I know of are the decision tree model and the random forest model, but I would need to do more research to know whether other models would be more appropriate here.
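
For comparison only, here is a minimal sketch of how a random forest regressor could be swapped in for OLS, reusing the hypothetical training split from the sketch above; this is not part of the chosen method.

from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the same hypothetical training split as the OLS sketch
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)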

Set Up Analysis and Site Plot¶

In [1]:
# Set Up Analysis Part 1 of 2

# Import libraries to help with file paths, downloads, spatial data, and modeling

# Reproducible file paths
import os # Reproducible file paths
from glob import glob # Find files by pattern
import pathlib # Find the home folder
import time # formatting time
import warnings # Filter warning messages
import zipfile # Work with zip files
from io import BytesIO # Stream binary (zip) files

# Work with arrays, plots, and HTTP requests
import numpy as np # Adjust images
import matplotlib.pyplot as plt # Overlay pandas and xarray plots; overlay raster and vector data
import requests # Request data over HTTP

# Work with tabular, vector, and raster data
import cartopy.crs as ccrs # CRSs (Coordinate Reference Systems)
import geopandas as gpd # work with vector data
import geoviews as gv # holoviews extension for data visualization
import hvplot.pandas # Interactive tabular and vector data
import hvplot.xarray # Interactive raster
import pandas as pd # Group and aggregate
import pystac_client # Search STAC catalogs/APIs
import shapely # Perform geometric operations on spatial data
import xarray as xr # Adjust images
import rioxarray as rxr # Work with geospatial raster data
from rioxarray.merge import merge_arrays # Merge rasters

# Processing and regression related
from scipy.ndimage import convolve # Image and signal processing
from sklearn.model_selection import KFold # Cross validation
from scipy.ndimage import label # Labels connected features in an array
from sklearn.linear_model import LinearRegression # Work with linear regression models
from sklearn.model_selection import train_test_split # Split data into subsets - evaluate model
from tqdm.notebook import tqdm # Visualize progress of iterative operations

# import to be able to save plots
import holoviews as hv # be able to save hvplots

# Suppress third party warnings - 'ignore'
warnings.simplefilter('ignore')

# Prevent GDAL from quitting due to momentary disruptions
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"
In [2]:
# Set Up Analysis Part 2 of 2

# Set up census tract path
# Define and create the project data directory
den_census_tracts_data_dir = os.path.join(
    pathlib.Path.home(),
    'documents',
    'earth-analytics',
    'urban_greenspace_denver'
)
os.makedirs(den_census_tracts_data_dir, exist_ok=True)

# Call the data dir to confirm location
den_census_tracts_data_dir
Out[2]:
'/Users/briannagleason/documents/earth-analytics/urban_greenspace_denver'
In [3]:
# Download the census tracts from CDC (only once) Part 1 of 1

# Define info for census tract download
den_census_tracts_dir = os.path.join(den_census_tracts_data_dir, 'denver-tract')
os.makedirs(den_census_tracts_dir, exist_ok=True)
den_census_tracts_path = os.path.join(den_census_tracts_dir, '*.shp')

# Only download once (conditional statement)
if not os.path.exists(den_census_tracts_path):
    den_census_tracts_url = (
    'https://data.cdc.gov/download/x7zy-2xmx/application%2Fzip'
    )
    den_census_tracts_gdf = gpd.read_file(den_census_tracts_url)
    denver_tracts_gdf = den_census_tracts_gdf[den_census_tracts_gdf.PlaceName=='Denver']
    denver_tracts_gdf.to_file(den_census_tracts_path, index=False)

# Load in the census tract data
denver_tracts_gdf = gpd.read_file(den_census_tracts_path)

# Call the Denver tracts gdf to see it
denver_tracts_gdf.head()
Out[3]:
place2010 tract2010 ST PlaceName plctract10 PlcTrPop10 geometry
0 0820000 08031000102 08 Denver 0820000-08031000102 3109 POLYGON ((-11691351.798 4834636.885, -11691351...
1 0820000 08031000201 08 Denver 0820000-08031000201 3874 POLYGON ((-11688301.532 4835632.272, -11688302...
2 0820000 08031000202 08 Denver 0820000-08031000202 3916 POLYGON ((-11688362.201 4834372.228, -11688360...
3 0820000 08031000301 08 Denver 0820000-08031000301 5003 POLYGON ((-11691355.36 4833538.467, -11691357....
4 0820000 08031000302 08 Denver 0820000-08031000302 4036 POLYGON ((-11692926.635 4832494.047, -11692925...
In [4]:
# Download the census tracts for state of CO (only once) Part 1 of 1

# Define info for census tract download
den_tiger_tracts_dir = os.path.join(den_census_tracts_data_dir, 'colorado-tracts')
os.makedirs(den_tiger_tracts_dir, exist_ok=True)
den_tiger_tracts_path = os.path.join(den_tiger_tracts_dir, '*.shp')

# Only download once (conditional statement)
if not os.path.exists(den_tiger_tracts_path):
    co_tiger_tracts_url = (
    'https://www2.census.gov/geo/tiger/TIGER2024/TRACT/tl_2024_08_tract.zip'
    )
    co_tiger_tracts_gdf = gpd.read_file(co_tiger_tracts_url)
    # COUNTYFP 031 is Denver County which in this case is also the City of Denver
    # It's a City and a County
    den_tiger_tracts_gdf = co_tiger_tracts_gdf[co_tiger_tracts_gdf.COUNTYFP=='031']
    den_tiger_tracts_gdf.to_file(den_tiger_tracts_path, index=False)

# Load in the census tract data
den_tiger_tracts_gdf = gpd.read_file(den_tiger_tracts_path)

# Call the Denver tracts gdf to see it
den_tiger_tracts_gdf.head()
Out[4]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 0 +39.7365094 -104.8940558 POLYGON ((-104.90346 39.73899, -104.90346 39.7...
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 0 +39.7443925 -104.8948130 POLYGON ((-104.90346 39.74561, -104.90346 39.7...
2 08 031 003701 08031003701 1400000US08031003701 37.01 Census Tract 37.01 G5020 S 1873267 120383 +39.7444161 -104.9509827 POLYGON ((-104.95979 39.7452, -104.95978 39.74...
3 08 031 003702 08031003702 1400000US08031003702 37.02 Census Tract 37.02 G5020 S 706200 0 +39.7365218 -104.9543646 POLYGON ((-104.95979 39.73504, -104.95979 39.7...
4 08 031 003703 08031003703 1400000US08031003703 37.03 Census Tract 37.03 G5020 S 681502 0 +39.7361181 -104.9450408 POLYGON ((-104.94949 39.73242, -104.94949 39.7...
In [5]:
# Perform a spatial join for census tracts at least partially 
# within City of Denver boundary

# This new gdf needs to be joined to the previous one from CDC, which
# is already clipped to the city boundary, so there is no need to download a
# separate city boundary shapefile, which reduces the amount of data
# being downloaded

# Define new variable for the joined gdf
joined_den_tracts_gdf = (
    gpd.sjoin(
        # TIGER tracts gdf - only need tracts that intersect with..
        den_tiger_tracts_gdf.to_crs(ccrs.Mercator()),
        # CDC tracts gdf - which are already clipped to the Denver city boundary
        denver_tracts_gdf.to_crs(ccrs.Mercator()), 
        # Specify type of join ("inner", "left", "right")
        how="inner", 
        # Specify the spatial relationship ("intersects", "within", "contains")
        predicate="intersects"
        )
)

# Explore the result
joined_den_tracts_gdf.head()
Out[5]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND ... INTPTLAT INTPTLON geometry index_right place2010 tract2010 ST PlaceName plctract10 PlcTrPop10
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... +39.7365094 -104.8940558 POLYGON ((-11677799.972 4800763.868, -11677799... 90 0820000 08031004405 08 Denver 0820000-08031004405 7316
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... +39.7365094 -104.8940558 POLYGON ((-11677799.972 4800763.868, -11677799... 89 0820000 08031004404 08 Denver 0820000-08031004404 5978
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... +39.7443925 -104.8948130 POLYGON ((-11677800.084 4801719.178, -11677799... 89 0820000 08031004404 08 Denver 0820000-08031004404 5978
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... +39.7443925 -104.8948130 POLYGON ((-11677800.084 4801719.178, -11677799... 88 0820000 08031004403 08 Denver 0820000-08031004403 4213
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... +39.7443925 -104.8948130 POLYGON ((-11677800.084 4801719.178, -11677799... 80 0820000 08031004107 08 Denver 0820000-08031004107 3810

5 rows × 21 columns

In [6]:
# Try to see how many rows there are because I think there's duplicates
num_rows = joined_den_tracts_gdf.shape[0]
print("Number of rows:", num_rows)
Number of rows: 637
In [7]:
# Drop duplicate geometries

# Normalize the geometry column to ensure consistent representation
joined_den_tracts_gdf['geometry'] = joined_den_tracts_gdf.geometry.normalize()

# Drop duplicate rows based on the geometry column
dropped_joined_den_tracts_gdf = joined_den_tracts_gdf.drop_duplicates(subset='geometry')

# Call the gdf to see it
dropped_joined_den_tracts_gdf.head()
Out[7]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND ... INTPTLAT INTPTLON geometry index_right place2010 tract2010 ST PlaceName plctract10 PlcTrPop10
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... +39.7365094 -104.8940558 POLYGON ((-11677800.195 4800722.197, -11677799... 90 0820000 08031004405 08 Denver 0820000-08031004405 7316
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... +39.7443925 -104.8948130 POLYGON ((-11677800.084 4801719.178, -11677799... 89 0820000 08031004404 08 Denver 0820000-08031004404 5978
2 08 031 003701 08031003701 1400000US08031003701 37.01 Census Tract 37.01 G5020 S 1873267 ... +39.7444161 -104.9509827 POLYGON ((-11684070.377 4801596.893, -11684070... 83 0820000 08031004301 08 Denver 0820000-08031004301 4469
3 08 031 003702 08031003702 1400000US08031003702 37.02 Census Tract 37.02 G5020 S 706200 ... +39.7365218 -104.9543646 POLYGON ((-11684070.933 4799965.66, -11684070.... 57 0820000 08031003300 08 Denver 0820000-08031003300 3099
4 08 031 003703 08031003703 1400000US08031003703 37.03 Census Tract 37.03 G5020 S 681502 ... +39.7361181 -104.9450408 POLYGON ((-11682924.12 4799771.02, -11682924.0... 57 0820000 08031003300 08 Denver 0820000-08031003300 3099

5 rows × 21 columns

In [8]:
# Site plot -- Census tracts with satellite imagery in the background

# Create new variable for plot in order to save it later
joined_denver_tracts_plot = dropped_joined_den_tracts_gdf.to_crs(
# Use hvplot to plot and set parameters
ccrs.Mercator()).hvplot(
    geo=True, crs=ccrs.Mercator(),
    tiles='EsriImagery',
    title='City and County of Denver - Site Plot of Census Tracts',
    fill_color=None, line_color='darkorange', 
    line_width=3, #frame_width=600
    width=700 , height=500
)

# Save the plot as html to be able to display online
hv.save(joined_denver_tracts_plot, 'joined_den_site_plot_using_tiger_and_cdc.html')  

# Display the plot
joined_denver_tracts_plot
Out[8]:

Site Plot of Census Tracts for Denver: dispersed tracts that also vary in size¶

The City and County of Denver census tracts show that the northeast has much larger tracts than the rest of the city. The largest tract, in the northeast corner, contains the airport. It may be worth considering removing the airport census tract if there isn't housing there, but because this is an initial look at a project with Denver data I want to see how it plays out as is. Because of the airport tract, the city spans roughly 1 degree of latitude and longitude. Visually, the greenspace in the city appears mostly inside the 'inverted L', as well as in the southwest corner where there is a small lake (Marston Lake). As mentioned in the site description, the 'inverted L' roughly traces I-25 going north to south and I-70 going west to east (Sachs 2018). Those highways are highly visible even with the census tract overlay: the highways look grey, with highly developed areas around them. The 'inverted L' will continue to play a role in the analysis of plots and data.

The location of greenspace is also tied to the effects of redlining, as mentioned in the site description (Chen, Chavez-Norgaard, and University of Richmond 2025). So there may be connections between socioeconomic data, depression prevalence, and urban greenspace, which leaves room for further study.

Citations:¶

  • Chen, Victor, Stefan Chavez-Norgaard, and University of Richmond. 2025. “Mapping Inequality: Denver.” Mapping Inequality: Redlining in New Deal America. University of Richmond. 2025. https://dsl.richmond.edu/panorama/redlining/map/CO/Denver/context#loc=12/39.6994/-104.9581.

  • Sachs, David. 2018. “This Shape Explains Denver’s Past, Present and Likely Its Future.” Denverite. December 21, 2018. https://denverite.com/2018/12/21/denver-socioeconomic-map-shape/.

Depression Data Wrangle¶

Links to the PLACES health outcome data (used here for depression) - for reference (both original and archived)¶

"https://archive.org/download/20250128-cdc-datasets/PLACES_Local_Data_for_Better_Health_Census_Tract_Data_2022_release.csv"

"https://data.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"

"https://data.cdc.gov/resource/cwsq-ngmh.csv"

In [9]:
# Set up a path for the depression data
# (note: the file is named 'asthma.csv' as a leftover, but it stores the depression download)
den_cdc_depression_path = os.path.join(den_census_tracts_data_dir, 'asthma.csv')

# Download depression data (only once)
if not os.path.exists(den_cdc_depression_path):
    # Define new variable for url
    cdc_places_tracts_url = (
        
        "https://archive.org/download/20250128-cdc-datasets/PLACES_Local_Data_for_Better_Health_Census_Tract_Data_2022_release.csv"
        #"&StateAbbr=CO"
        #"&CountyName=Denver"
        #"&Measureid=DEPRESSION"
        #"&$limit=1500"
    )

    # Make a request to the URL and show progress with tqdm
    print("Downloading data...")
    response = requests.get(cdc_places_tracts_url, stream=True)

    # Check for successful request (status code 200)
    if response.status_code == 200:
        # Total size of the response in bytes for tqdm
        total_size = int(response.headers.get('content-length', 0))
        
        # Download and save the file in chunks, updating the progress bar
        with open(den_cdc_depression_path, 'wb') as f, tqdm(
            total=total_size, unit='B', unit_scale=True, desc="Downloading depression data"
        ) as pbar:
            for data in response.iter_content(chunk_size=1024):
                f.write(data)
                pbar.update(len(data))  # Update the progress bar with the chunk size
    else:
        print(f"Failed to download data. HTTP Status code: {response.status_code}")
    
    # After download, process the CSV and load into a DataFrame
    print("Processing the downloaded data...")

    # Define new variable for dataframe
    cdc_df = (
        # Read a CSV file into a dataframe
        pd.read_csv(den_cdc_depression_path)
        # Replace column names as needed - 'old_name':'new_name'
        .rename(columns={
            'Data_Value': 'depression',
            'Low_Confidence_Limit': 'depression_ci_low',
            'High_Confidence_Limit': 'depression_ci_high',
            'LocationName': 'tract'})
        # Select specific columns needed/wanted with double brackets
        [[
            'MeasureId',
            'CountyName',
            'Year', 
            'tract', 
            'depression', 'depression_ci_low', 'depression_ci_high', 'Data_Value_Unit',
            'TotalPopulation'
        ]]
    )

    # Filter based on multiple conditions:
    den_cdc_depression_df = cdc_df[
        # Select the health outcome wanted from the measure id
        (cdc_df['MeasureId'] =='DEPRESSION') & 
        # Select the county name
        (cdc_df['CountyName'] =='Denver')
        # If the county name occurs in more than one state,
        # the StateAbbr would need to be chosen and included in this
        # selection and in the columns wanted above
    ]

    # Save dataframe to a CSV (tabular data) file
    den_cdc_depression_df.to_csv(
        den_cdc_depression_path, 
        # Prevent a new index column from being created
        index=False
        )

# Load in depression data
den_cdc_depression_df = pd.read_csv(den_cdc_depression_path)

# Preview depression data
den_cdc_depression_df
Out[9]:
MeasureId CountyName Year tract depression depression_ci_low depression_ci_high Data_Value_Unit TotalPopulation
0 DEPRESSION Denver 2020 8031001301 20.5 19.6 21.5 % 4972
1 DEPRESSION Denver 2020 8031000600 19.5 18.4 20.9 % 2552
2 DEPRESSION Denver 2020 8031001402 20.3 19.5 21.0 % 4070
3 DEPRESSION Denver 2020 8031001800 22.9 21.3 24.7 % 3209
4 DEPRESSION Denver 2020 8031000301 20.0 19.1 21.0 % 5003
... ... ... ... ... ... ... ... ... ...
138 DEPRESSION Denver 2020 8031002802 21.5 20.6 22.5 % 4213
139 DEPRESSION Denver 2020 8031003202 19.6 18.7 20.6 % 3001
140 DEPRESSION Denver 2020 8031002901 21.2 19.9 22.7 % 2602
141 DEPRESSION Denver 2020 8031008391 20.0 19.2 20.8 % 7020
142 DEPRESSION Denver 2020 8031004202 20.2 19.3 21.1 % 4109

143 rows × 9 columns

In [10]:
# Change tract identifier datatype for merging
dropped_joined_den_tracts_gdf.tract2010 = dropped_joined_den_tracts_gdf.tract2010.astype('int64')

# Merge census data with geometry
den_tract_cdc_gdf = (
    # Census tracts gdf
    dropped_joined_den_tracts_gdf
    # Use the .merge() method
    .merge(
        # Depression prevalence dataframe
        den_cdc_depression_df, 
        # Specify the column/index to merge on for Census tracts gdf
        left_on='tract2010', 
        # Specify the column/index to merge on for the depression dataframe
        right_on='tract', 
        # Specify type of join ("inner", "left", "right")
        how='inner'
        )
)

# Plot depression data as a choropleth
den_chloropleth_depression_by_tract = (
# Use EsriImagery tiles for the background
(   gv.tile_sources.EsriImagery
    * 
    gv.Polygons(
        # Change gdf CRS to mercator
        den_tract_cdc_gdf.to_crs(ccrs.Mercator()),
        # Set variables for the plot
        vdims=['depression', 'tract2010'],
        # Set CRS to Mercator
        crs=ccrs.Mercator()
    ).opts(
        # Add a colorbar and label
        color='depression', colorbar=True, 
        clabel='% of Depression Prevalence in Population',
        tools=['hover'])
).opts(
    # Plot size, title, and axes labels
    title= 'Denver - Depression Prevalence by Census Tract',
    # Set the width and the height
    width=800, height=500,
    # Drop the axes labels
    xaxis=None, yaxis=None,
    )
)

# Save the plot as html to be able to display online
hv.save(den_chloropleth_depression_by_tract, 'den_chloropleth_depression_by_tract.html')  

# Display the plot
den_chloropleth_depression_by_tract
Out[10]:
In [11]:
# Call the denver cdc gdf to see it
den_tract_cdc_gdf.head()
Out[11]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND ... PlcTrPop10 MeasureId CountyName Year tract depression depression_ci_low depression_ci_high Data_Value_Unit TotalPopulation
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... 7316 DEPRESSION Denver 2020 8031004405 20.7 19.6 22.0 % 7316
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... 5978 DEPRESSION Denver 2020 8031004404 21.6 20.7 22.6 % 5978
2 08 031 003701 08031003701 1400000US08031003701 37.01 Census Tract 37.01 G5020 S 1873267 ... 4469 DEPRESSION Denver 2020 8031004301 20.5 19.4 21.6 % 4469
3 08 031 003702 08031003702 1400000US08031003702 37.02 Census Tract 37.02 G5020 S 706200 ... 3099 DEPRESSION Denver 2020 8031003300 20.4 19.4 21.6 % 3099
4 08 031 003703 08031003703 1400000US08031003703 37.03 Census Tract 37.03 G5020 S 681502 ... 3099 DEPRESSION Denver 2020 8031003300 20.4 19.4 21.6 % 3099

5 rows × 30 columns

Depression Prevalence Plot Description: % of depression prevalence has a possible visual connection to the 'inverted L'¶

Visually, it appears that the west half of the city is in slightly darker shades of blue than the east half, meaning higher % depression prevalence, with one census tract in the northwest that appears to be an outlier. That census tract has the highest % depression prevalence, which is interesting and will be kept in mind for future analysis. The lighter shades of blue (lower % depression prevalence) are located mostly in the central east and central southeast, with a few larger census tracts in the central north that also have lower prevalence.

This somewhat lines up with the idea of the 'inverted L', with the lower % depression prevalence falling largely inside it (in the central east and southeast), but that is not 100% true, given that some tracts beyond the 'inverted L' also have lower prevalence rates (Sachs 2018). There are clear connections in the research between greenspace and depression, so it will be interesting to see whether there is a relationship between urban greenspace and % depression prevalence in further analyses (Reklaitiene et al. 2014). Additionally, the 'inverted L' will continue to be kept in mind in further analyses (Sachs 2018).

Citations:¶

  • Reklaitiene, Regina, Regina Grazuleviciene, Audrius Dedele, Dalia Virviciute, Jone Vensloviene, Abdonas Tamosiunas, Migle Baceviciene, et al. 2014. “The Relationship of Green Space, Depressive Symptoms and Perceived General Health in Urban Population.” Scandinavian Journal of Public Health 42 (7): 669–76. https://doi.org/10.1177/1403494814544494.

  • Sachs, David. 2018. “This Shape Explains Denver’s Past, Present and Likely Its Future.” Denverite. December 21, 2018. https://denverite.com/2018/12/21/denver-socioeconomic-map-shape/.

NAIP Wrangle¶

In [12]:
# Connect to the planetary computer catalog
e84_catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1"
)
e84_catalog.title
Out[12]:
'Microsoft Planetary Computer STAC API'

Get Data URLs¶

In [13]:
# Convert geometry to lat/lon for STAC
den_tract_latlon_gdf = den_tract_cdc_gdf.to_crs(4326)

# Define a path to save NDVI stats
den_ndvi_stats_path = os.path.join(den_census_tracts_data_dir, 'denver-ndvi-stats.csv')

# Check for existing data - do not access duplicate tracts

# Create a list to accumulate downloaded tracts
den_downloaded_tracts = []
# If/else statement to control specific blocks of code
# Code to execute if the condition is true
if os.path.exists(den_ndvi_stats_path):
    den_ndvi_stats_df = pd.read_csv(den_ndvi_stats_path)
    den_downloaded_tracts = den_ndvi_stats_df.tract.values
# Code to execute if the condition is false
else:
    print('No census tracts downloaded so far')


# Loop through each census tract

# Create list of dataframes, list needs to be outside of loop
den_scene_dfs = []
# Start for loop
for _, den_tract_values in tqdm(den_tract_latlon_gdf.iterrows(), ncols=100):
    den_tract = den_tract_values.tract
    # Check if statistics are already downloaded for this tract
    if not (den_tract in den_downloaded_tracts):
        # Retry up to 5 times in case of a momentary disruption
        i = 0
        retry_limit = 5
        # Loop for executing code block as long as specified condition is true
        while i < retry_limit:
            # Try accessing the STAC
            try:
                # Search for tiles
                naip_search = e84_catalog.search(
                    # In the NAIP collection
                    collections=["naip"],
                    # That intersect with geometry of census tracts
                    intersects=shapely.to_geojson(den_tract_values.geometry),
                    # In the year 2021
                    datetime="2021"
                )
                
                # Build dataframe with tracts and tile urls
                den_scene_dfs.append(pd.DataFrame(dict(
                    tract=den_tract,
                    # Convert datetime value to a pandas Timestamp, then extract just the date 
                    date=[pd.to_datetime(scene.datetime).date() 
                          # of the items in the NAIP search
                          for scene in naip_search.items()],
                    # Grab the URL (href) of the 'image' asset for each item in the NAIP search
                    rgbir_href=[scene.assets['image'].href for scene in naip_search.items()],
                )))
                # Add break to prevent long waits during debugging
                break
            # Try again in case of an APIError
            except pystac_client.exceptions.APIError:
                print(
                    f'Could not connect with STAC server. '
                    f'Retrying tract {den_tract}...')
                time.sleep(2)
                i += 1
                # Skip the rest of the current iteration and move on to the next one
                continue
    
# Concatenate the url dataframes
# Code to execute if the condition is true
if den_scene_dfs:
    den_scene_df = pd.concat(den_scene_dfs).reset_index(drop=True)
# Code to execute if the condition is false
else:
    den_scene_df = None

# Preview the URL DataFrame
den_scene_df.head()
No census tracts downloaded so far
0it [00:00, ?it/s]
Out[13]:
tract date rgbir_href
0 8031004405 2021-07-28 https://naipeuwest.blob.core.windows.net/naip/...
1 8031004404 2021-07-28 https://naipeuwest.blob.core.windows.net/naip/...
2 8031004404 2021-07-28 https://naipeuwest.blob.core.windows.net/naip/...
3 8031004301 2021-07-28 https://naipeuwest.blob.core.windows.net/naip/...
4 8031004301 2021-07-28 https://naipeuwest.blob.core.windows.net/naip/...
In [14]:
# See dropped_joined_den_tracts because the merge 
# below didn't work because of an index error
dropped_joined_den_tracts_gdf.head()
Out[14]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND ... INTPTLAT INTPTLON geometry index_right place2010 tract2010 ST PlaceName plctract10 PlcTrPop10
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... +39.7365094 -104.8940558 POLYGON ((-11677800.195 4800722.197, -11677799... 90 0820000 8031004405 08 Denver 0820000-08031004405 7316
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... +39.7443925 -104.8948130 POLYGON ((-11677800.084 4801719.178, -11677799... 89 0820000 8031004404 08 Denver 0820000-08031004404 5978
2 08 031 003701 08031003701 1400000US08031003701 37.01 Census Tract 37.01 G5020 S 1873267 ... +39.7444161 -104.9509827 POLYGON ((-11684070.377 4801596.893, -11684070... 83 0820000 8031004301 08 Denver 0820000-08031004301 4469
3 08 031 003702 08031003702 1400000US08031003702 37.02 Census Tract 37.02 G5020 S 706200 ... +39.7365218 -104.9543646 POLYGON ((-11684070.933 4799965.66, -11684070.... 57 0820000 8031003300 08 Denver 0820000-08031003300 3099
4 08 031 003703 08031003703 1400000US08031003703 37.03 Census Tract 37.03 G5020 S 681502 ... +39.7361181 -104.9450408 POLYGON ((-11682924.12 4799771.02, -11682924.0... 57 0820000 8031003300 08 Denver 0820000-08031003300 3099

5 rows × 21 columns

In [18]:
""" Uncomment the double comments (##) for code if the merge doesn't 
 work below - after I ran it once, it wouldn't let me go back and 
 re run it """

# dropped_joined_den_tracts_gdf has a '0' in front of all 
# the values for tract2010 (because the stateFP is 08). But...
# the NAIP automatically dropped the '0' in front of all the 
# values for the 'tract' column. 

# Create a copy of dropped_joined_den_tracts_gdf to perserve the original
## stripped_den_census_tracts_gdf = dropped_joined_den_tracts_gdf.copy()

# Remove/strip the leading '0' from the values of the 'tract2010'
# column of the gdf in order to index correctly
## stripped_den_census_tracts_gdf['tract2010'
    ##] = stripped_den_census_tracts_gdf['tract2010'].str.lstrip('0')

# Call the gdf to make sure that worked
##stripped_den_census_tracts_gdf

Compute NDVI Stats¶

In [16]:
# Only compute statistics if scene URLs were found
if den_scene_df is not None:
    # Create empty list outside of for loop to save results back to
    all_den_ndvi_dfs = []
    # Loop through the census tracts with URLs
    for tract, tract_date_gdf in tqdm(den_scene_df.groupby('tract')):
        # Open all images for tract
        all_den_tile_das = []
        # Create for loop, iterate over rows
        for _, href_s in tract_date_gdf.iterrows():
            # Open vsi connection to data
            all_den_tile_da = rxr.open_rasterio(
                # File path/url to the multispectral image
                href_s.rgbir_href, 
                # Create masked array, then remove any single-dimensional axes from it
                masked=True).squeeze()
            
            # Crop data to the bounding box of the census tract
            # Create the boundary
            all_den_boundary = (
                # Using the census tract gdf
                den_tract_cdc_gdf
                # Set the 'tract2010' as the index of the gdf
                .set_index('tract2010')
                # Select the tracts from the gdf
                .loc[[tract]]
                # Set to the same CRS as the images for the tracts
                .to_crs(all_den_tile_da.rio.crs)
                # Access the geometry of the tracts to perform further operations
                .geometry
            )
            # Crop the data to bounding box
            all_den_crop_da = all_den_tile_da.rio.clip_box(
                # Compute bounding box (min and max coordinates) of census tract geometry
                *all_den_boundary.envelope.total_bounds,
                # Expand bounding box slightly beyond its original extent to ensure full coverage
                auto_expand=True)
            
            # Clip data to the boundary of the census tract
            all_den_clip_da = all_den_crop_da.rio.clip(all_den_boundary, all_touched=True)

            # Compute NDVI ((NIR - Red)/(NIR + Red))
            all_den_ndvi_da = (
                (all_den_clip_da.sel(band=4) - all_den_clip_da.sel(band=1)) 
                / (all_den_clip_da.sel(band=4) + all_den_clip_da.sel(band=1))
            )
            
            # Accumulate result
            all_den_tile_das.append(all_den_ndvi_da)

        # Merge data
        all_den_scene_da = merge_arrays(all_den_tile_das)

        # Mask vegetation
        all_den_veg_mask = (all_den_scene_da>.3)

        # Calculate statistics and save data to file
        # Calculate total number of non-missing (valid) pixels in the merged raster
        total_pixels = all_den_scene_da.notnull().sum()
        # Calculates total number of pixels that are classified as vegetation
        veg_pixels = all_den_veg_mask.sum()

        # Calculate mean patch size
        # Label the connected areas
        all_labeled_patches, all_num_patches = label(all_den_veg_mask)
        # Count patch pixels, ignoring background at patch 0
        all_patch_sizes = np.bincount(all_labeled_patches.ravel())[1:] 
        # Get the mean 
        all_mean_patch_size = all_patch_sizes.mean()

        # Calculate edge density
        all_kernel = np.array([
            [1, 1, 1], 
            [1, -8, 1], 
            [1, 1, 1]])
        
        # Apply convolution to the vegetation mask
        all_den_edges = convolve(
            # Input array - the array to apply convolution on
            all_den_veg_mask, 
            # Kernel array - the smaller array that defines the filter to be applied
            all_kernel, 
            # Input array is extended beyond its boundaries by - 
            # filling all values beyond the edge with the same constant value
            mode='constant')

        # Calculate edge density = 
        # count of edge pixels present / total number of pixels in the veg_mask
        all_den_edge_density = np.sum(all_den_edges != 0) / all_den_veg_mask.size

        # Add a row to the statistics file for this tract
        pd.DataFrame(dict(
            # Unique identifier for a given tract
            tract=[tract],
            # Cast total number of pixels to an integer
            total_pixels=[int(total_pixels)],
            # Cast the fraction of pixels in the tract that are vegetation to a float
            frac_veg=[float(veg_pixels/total_pixels)],
            # Mean patch size of vegetation for this tract,
            all_mean_patch_size=[all_mean_patch_size],
            # Edge density of vegetation for this tract
            all_den_edge_density=[all_den_edge_density]
            # Write the df to a csv file
        )).to_csv(
            # The file path where the CSV will be saved to
            den_ndvi_stats_path, 
            # Ensure that the data is appended to the file rather than overwriting it
            mode='a', 
            # Prevent the index from being written to the CSV file
            index=False, 
            # Check if the file exists
            header=(not os.path.exists(den_ndvi_stats_path))
        )

# Re-load results from file
all_den_ndvi_stats_df = pd.read_csv(den_ndvi_stats_path)
# Call this df to see it
all_den_ndvi_stats_df
Out[16]:
tract total_pixels frac_veg all_mean_patch_size all_den_edge_density
0 8031000201 10846478 0.252190 216.457308 0.113350
1 8031000202 5419141 0.309361 208.308897 0.142652
2 8031000402 10770115 0.301096 163.730284 0.138094
3 8031000501 4012262 0.298093 158.855891 0.216446
4 8031000600 9657159 0.211728 132.754967 0.087855
... ... ... ... ... ...
93 8031011902 2924616 0.347838 207.780229 0.135499
94 8031012001 7430057 0.312909 217.344022 0.143229
95 8031012010 50762227 0.136467 218.000063 0.055945
96 8031015600 27883693 0.125721 184.786095 0.056048
97 8031015700 22520093 0.215354 122.720236 0.080804

98 rows × 5 columns

Plot¶

In [19]:
# Merge census data with geometry
den_ndvi_cdc_gdf = (
    # Choose gdf that contains geometry for Census tracts
    den_tract_cdc_gdf
    # Combine two df's/gdf's based on specified columns
    .merge(
        # Choose all_den_ndvi_stats_df - it contains the veg stats for each census tract
        all_den_ndvi_stats_df,
        # Specify the 'tract' column from den_tract_cdc_gdf as the key for merging
        left_on='tract', 
        # Specify the 'tract' column from all_den_ndvi_stats_df as the key for merging
        right_on='tract', 
        # Keep only the rows where there is a matching key in both df/gdf
        how='inner')
)

# Plot chloropleths with vegetation statistics
def plot_chloropleth(gdf, **opts):
    """Generate a chloropleth with the given color column"""
    # Plot polygons based on geometry in gdf
    return gv.Polygons(
        # Convert CRS of gdf to Mercator 
        gdf.to_crs(ccrs.Mercator()),
        # Define the CRS to use for the map - Mercator
        crs=ccrs.Mercator()
        # Customize the plot
    ).opts(
        # Remove the x and y axes from the plot
        xaxis=None, yaxis=None, 
        # Add a colorbar
        colorbar=True, 
        # Any additional options passed when the 
        # function is called are included
        **opts)

# Create new variable for plots in order to save them later
den_side_by_side_chlorpleths = (
(
    # First chloropleth for Depression Prevalence
    plot_chloropleth(
        # Using the NDVI CDC gdf
        den_ndvi_cdc_gdf,
        # Specify that the census tracts should be colored based on the depression column
        color='depression', 
        # Set label for colorbar
        clabel='% of Depression Prevalence in Population',
        # Specify the color map to use for coloring the tracts
        cmap='viridis',
        # Add a title
        title= 'Denver Census Tracts - Depression Prevalence',
        #Set width and height
        width=600, height=450
        )
    # Place the two plots side by side
    + 
    # Second chloropleth for Edge Density
    plot_chloropleth(
        # Using the NDVI CDC gdf
        den_ndvi_cdc_gdf, 
        # Specify that the census tracts should be colored based on the edge_density column
        color='all_den_edge_density', 
        # Set label for colorbar
        clabel='Edge Density',
        # Specify the color map to use for coloring the tracts
        cmap='Greens',
        # Add a title
        title= 'Denver Census Tracts - Edge Density ',
        # Set width and height
        width=600, height=450
        )
)
)
# Save the plot as html to be able to display online
hv.save(den_side_by_side_chlorpleths, 'den_side_by_side_chlorpleths.html') 

# Display the plots 
den_side_by_side_chlorpleths
Out[19]:
In [20]:
# Call the new gdf to see the table
den_ndvi_cdc_gdf
Out[20]:
STATEFP COUNTYFP TRACTCE GEOID GEOIDFQ NAME NAMELSAD MTFCC FUNCSTAT ALAND ... tract depression depression_ci_low depression_ci_high Data_Value_Unit TotalPopulation total_pixels frac_veg all_mean_patch_size all_den_edge_density
0 08 031 004404 08031004404 1400000US08031004404 44.04 Census Tract 44.04 G5020 S 1304208 ... 8031004405 20.7 19.6 22.0 % 7316 8347086 0.277682 180.544633 0.137591
1 08 031 004403 08031004403 1400000US08031004403 44.03 Census Tract 44.03 G5020 S 1465389 ... 8031004404 21.6 20.7 22.6 % 5978 4072000 0.305910 147.241844 0.169869
2 08 031 003701 08031003701 1400000US08031003701 37.01 Census Tract 37.01 G5020 S 1873267 ... 8031004301 20.5 19.4 21.6 % 4469 9244471 0.320448 340.033747 0.089356
3 08 031 003702 08031003702 1400000US08031003702 37.02 Census Tract 37.02 G5020 S 706200 ... 8031003300 20.4 19.4 21.6 % 3099 3856094 0.302618 209.991902 0.180499
4 08 031 003703 08031003703 1400000US08031003703 37.03 Census Tract 37.03 G5020 S 681502 ... 8031003300 20.4 19.4 21.6 % 3099 3856094 0.302618 209.991902 0.180499
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 08 031 008305 08031008305 1400000US08031008305 83.05 Census Tract 83.05 G5020 S 1026177 ... 8031008387 21.0 19.9 22.0 % 5961 6521586 0.226518 128.445700 0.108435
174 08 031 008306 08031008306 1400000US08031008306 83.06 Census Tract 83.06 G5020 S 1366939 ... 8031008312 19.8 18.9 20.9 % 6825 27620022 0.130439 117.318408 0.072533
175 08 031 011902 08031011902 1400000US08031011902 119.02 Census Tract 119.02 G5020 S 2446002 ... 8031012001 18.7 17.7 19.8 % 2032 7430057 0.312909 217.344022 0.143229
176 08 031 011903 08031011903 1400000US08031011903 119.03 Census Tract 119.03 G5020 S 1052078 ... 8031011902 20.5 19.6 21.3 % 6695 2924616 0.347838 207.780229 0.135499
177 08 031 012001 08031012001 1400000US08031012001 120.01 Census Tract 120.01 G5020 S 1963357 ... 8031012010 20.7 19.5 22.0 % 4995 50762227 0.136467 218.000063 0.055945

178 rows × 34 columns

Side by Side Plot Description: Some similarities visually - there is possibly a relationship between % depression prevalence and edge density¶

There are some similarities in the plots above - most of the tracts with the highest edge density are within the 'inverted L', which is roughly also where the lowest % of depression prevalence is, with some exceptions. In the Edge Density plot, a band just east of center running north to south has low edge density tracts - this is roughly where I-25 runs and corresponds to the long part of the 'inverted L' (Sachs 2018). Some of the lowest % of depression prevalence and highest edge density is in the center, slightly to the east. The outlier tract in the northwest that was previously mentioned in the Depression Prevalence plot is still visible here, and is actually two adjacent tracts rather than one. Looking at the same tracts on the Edge Density plot, they do have lower edge density, but not the lowest.

The areas with the higher edge density do line up with the redlining map from the site description (Nelson and Winling 2023). The 'inverted L' was very clearly seen in that redlining map and is still present in the Edge Density plot above (Sachs 2018).

Also, something that was not mentioned previously but is more noticeable in these side by side plots: a few areas and tracts that are completely white are not part of the City and County of Denver - they are entirely enclosed by it and belong to other counties (Hernandez 2018). An example is the City of Glendale, which belongs to Arapahoe County (Hernandez 2018). Data for these areas were not picked up, but I wanted to note what those areas are, as this arrangement is not typical of every county in the US.

The side by side plots visually indicate there may be a relationship between the % of depression prevalence and edge density, but further analysis beyond visual comparison would be beneficial. Cross validation will be used next to evaluate the OLS model that will be made; a complementary option would be zonal statistics, which summarize a raster such as NDVI within each tract polygon directly (a brief sketch follows below). Either way, the model needs to be evaluated to see whether the relationship between the two variables can be predicted with reasonable accuracy.
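
For reference, below is a minimal sketch of the zonal statistics option. It assumes an NDVI raster has already been written to disk at the hypothetical path 'den_ndvi.tif' (the notebook computes NDVI in memory instead, so it would first need to be saved, for example with all_den_scene_da.rio.to_raster('den_ndvi.tif')), and it uses the rasterstats package, which is not used elsewhere in this workflow.

# Sketch only: summarize an NDVI raster within each census tract polygon
import pandas as pd
from rasterstats import zonal_stats

den_zonal_stats = zonal_stats(
    # Census tract polygons (assumed to be reprojected to the raster's CRS)
    den_tract_cdc_gdf,
    # Hypothetical NDVI raster saved to disk
    'den_ndvi.tif',
    # Summary statistics to compute within each polygon
    stats=['mean', 'count'],
)
# Collect the per-tract summaries into a DataFrame keyed by tract ID
den_zonal_df = pd.DataFrame(den_zonal_stats, index=den_tract_cdc_gdf['tract2010'])
den_zonal_df.head()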

Citations:¶

  • Nelson, Robert K, and LaDale Winling. 2023. “Mapping Inequality: Redlining in New Deal America.” In American Panorama: An Atlas of United States History, edited by Robert K Nelson and Edward L. Ayers. https://dsl.richmond.edu/panorama/redlining.

  • Sachs, David. 2018. “This Shape Explains Denver’s Past, Present and Likely Its Future.” Denverite. December 21, 2018. https://denverite.com/2018/12/21/denver-socioeconomic-map-shape/.

  • Hernandez, Esteban L. 2018. “If You Think Denver’s Weirdly Shaped, Wait’ll You See the Islands of Not-Denver in Denver.” Denverite. December 16, 2018. https://denverite.com/2018/12/16/if-you-think-denvers-weirdly-shaped-waitll-you-see-the-islands-of-not-denver-in-denver/

Regression and Analysis¶

In [31]:
# Variable selection and transformation
# Create new variable for the model df
den_model_df = (
    # Using the den_ndvi_cdc_gdf
    den_ndvi_cdc_gdf
    # Create a copy to avoid modifying the original data
    .copy()
    # Select the subset of columns needed
    [['frac_veg', 'depression', 'all_mean_patch_size', 'all_den_edge_density', 'geometry']]
    # Remove any rows with NaN VALUES
    .dropna()
)
# Log transformation of depression data in the df
# This is to help handle skewed data in an effort to normalize its distribution
den_model_df['log_depression'] = np.log(den_model_df.depression)

# Plot scatter matrix to identify variables that need transformation
# Create new variable to save plots to
den_scatter_matrix = (

# Generate a scatter matrix (or pair plot)
hvplot.scatter_matrix(
    # Using model df
    den_model_df
    # Select columns to be plotted in the matrix
    [[ 
        'all_mean_patch_size',
        'all_den_edge_density',
        'log_depression',
    ]]
    ).opts(
        # Set the plot title and size
        title='Scatter Matrix',
        width=800,
        height=600,
    )
)

# Save the plot as html to be able to display online
hv.save(den_scatter_matrix, 'den_scatter_matrix.html')  

# Display the plots
den_scatter_matrix
Out[31]:

Data Transformation and Selection Process¶

As briefly described in the Methods Description section, certain fixes to the data needed to be done to 'fit' the data to the model.

  • Fitting the data to the model / data transformation:

    • To make sure there are complete observations, one of the assumptions of OLS regression, dropna() is used to drop no-data values.

    • To help with both variables' data having tails, the log transformation through np.log() will handle skewed data in an effort to normalize its distribution, another of the assumptions of OLS regression.

    • Using the combined geodataframe with CDC and NAIP data, select columns that will later become variables that would explain the largest amount of variability in the dependent variable.

These transformations were made in an effort to fit the data; however, there is still a possibility that the fit is poor despite them. Based on the plots above, I think that is the case here. The data are in blobs and still not exactly normally distributed despite the log transformation. To investigate further, cross validation will be done to see if the data were overfit. Then, model error will be computed to evaluate the model results.
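
As a quick check on that impression, the skewness of the depression variable could be compared before and after the log transform. This is a minimal sketch, assuming den_model_df from the cell above is in memory; this check was not part of the original workflow.

# Compare skewness of depression prevalence before and after the log
# transform (values closer to 0 indicate a more symmetric distribution)
import numpy as np
from scipy.stats import skew

raw_skew = skew(den_model_df['depression'])
log_skew = skew(np.log(den_model_df['depression']))
print(f'Skewness of depression:     {raw_skew:.3f}')
print(f'Skewness of log depression: {log_skew:.3f}')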

Fit and Predict¶

In [32]:
# Select predictor and outcome variables
# Define the predictor or independent variables
X = den_model_df[['all_den_edge_density', 'all_mean_patch_size']]
# Define the outcome variable or dependent variable
y = den_model_df[['log_depression']]

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(
    # Specify that 33% of the data will be used for testing
    X, y, test_size=0.33, 
    # Ensure that data is split randomly - the random split is reproducible
    random_state=42)

# Fit a linear regression
# Create an instance of the linear regression model
reg = LinearRegression()
# Fit the training data to the linear regression model
reg.fit(X_train, y_train)

# Predict depression values for the test dataset
y_test['pred_depression'] = np.exp(
    # Apply exponential function to predicted values to transform to original scale
    reg.predict(X_test))
# Back-transform the measured log values to the original scale
y_test['depression'] = np.exp(y_test.log_depression)

# Plot measured vs. predicted depression prevalence with a 1-to-1 line

# Find max value of depression prevalence in the test data to set the limits for the plot axes
y_max = y_test.depression.max()

# Create new variable to save plot to
den_measured_v_predicted_depression = (
(
# Create scatterplot 
 y_test.hvplot.scatter(
        # X axis is actual depression prevalence and Y axis is predicted depression prevalence
        x='depression', y='pred_depression',
        # Label x axis
        xlabel='Measured Depression Prevalence', 
        # Label y axis
        ylabel='Predicted Depression Prevalence',
        # Create title for plot
        title='Linear Regression Performance - Testing Data'
    ) 
    .opts(
        # Scale both axes the same
        aspect='equal', 
        # Set limits for the axes - scale according to range of actual depression values
        xlim=(0, y_max), ylim=(0, y_max), 
        # Set size of the plot
        height=500, width=500)
    # Add a slope line and set color of line
) * hv.Slope(slope=1, y_intercept=0).opts(color='black')
)
# Save the plot as html to be able to display online
hv.save(den_measured_v_predicted_depression, 'den_measured_v_predicted_depression.html') 

# Display the plot 
den_measured_v_predicted_depression
Out[32]:

Compute Error¶

In [33]:
# Compute model error for all census tracts
# Apply exponential function to predicted values to transform to original scale
den_model_df['pred_depression'] = np.exp(reg.predict(X))
# Calculate model error for each Census tract, store computed errors in a new column
den_model_df['err_depression'] = den_model_df['pred_depression'] - den_model_df['depression']

# Plot chloropleths with vegetation statistics
def plot_chloropleth(gdf, **opts):
    """Generate a chloropleth with the given color column"""
    # Plot polygons based on geometry in gdf
    return gv.Polygons(
        # Convert CRS of gdf to Mercator 
        gdf.to_crs(ccrs.Mercator()),
        # Define the CRS to use for the map - Mercator
        crs=ccrs.Mercator()
        # Customize the plot
    ).opts(
        # Remove the x and y axes from the plot
        xaxis=None, yaxis=None, 
        # Add a colorbar
        colorbar=True, 
        # Any additional options passed when the 
        # function is called are included
        **opts)

# Create new variable to save the plot to
den_model_error_chloropleth = (
# Plot error geographically as a chloropleth

    (
        # Color the chloropleth based on the model error
        plot_chloropleth(den_model_df, color='err_depression', cmap='RdBu')
        # Adjust the color scale/range for the model error
        .redim.range(err_depression=(-.3, .3))
        # Customize plot
        .opts(
            # Add a title
            title= 'City of Denver - Model Errors for Predicted Depression Prevalence',
            # Add a label for color bar
            clabel= 'Model Error', 
            # Ensure aspect ratio is equal (helps preserve the true shapes of census tracts)
            aspect='equal',
            # Set width and height
            width = 700, height = 400
            )
    )
)
# Save the plot as html to be able to display online
hv.save(den_model_error_chloropleth, 'den_model_error_chloropleth.html')  

# Display the plot
den_model_error_chloropleth
Out[33]:

Describe and Interpret Model Results and Plots: results do not indicate a predictive relationship, but it is possible that the model was a poor choice¶

The scatter matrix does not show normally distributed variables, which is something to keep in mind when looking at the other plots.

Visually, the linear regression on the test data does not show a relationship between predicted and measured depression prevalence - many data points fall far from the 1-to-1 line. This suggests the model rarely predicted depression prevalence accurately from the vegetation-related variables.

For the choropleth of model errors, errors close to 0 are more accurate and errors further from 0 are less accurate. Most of the census tracts are either < -0.2 or > 0.2; not many are close to 0. The tracts that are close to 0 for model error are somewhat centralized geographically, but there do not appear to be visually noticeable groupings. The 'inverted L' is also not seen here; if depression were accurately predicted from vegetation-related variables, I would have expected to see the 'inverted L' with low model error in the central and southeast areas, but that was not the case. That so few tracts have a model error close to 0 indicates the model's predictions were not very accurate. Visually, the areas that need further attention are clear. The tracts with model errors around -0.2 include a cluster in the southwest, but that area also had a mixed range of edge density values. So, the research on greenspace and how it is thought to help with depression does not exactly line up with what is being seen here.
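
To put numbers behind that visual impression, aggregate error metrics could be computed on the test set. This is a minimal sketch, assuming reg, X_test, and y_test from the Fit and Predict cell are still in memory; these metrics were not computed in the original workflow.

# Summarize model performance on the test set with standard metrics
from sklearn.metrics import mean_absolute_error, r2_score

measured = y_test['depression']
predicted = y_test['pred_depression']

# An R^2 near zero (or negative) and a large MAE would confirm the weak fit
print(f'R^2 on test set: {r2_score(measured, predicted):.3f}')
print(f'MAE on test set: {mean_absolute_error(measured, predicted):.3f} percentage points')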

That being said, even where there is more greenspace or higher edge density, someone with diagnosed depression may not actually use it or be able to see or interact with it (they could live in a basement, have a view of an alley, etc.). Depression can be genetic or situational, and greenspace may have nothing to do with the cause of depression or a depression diagnosis. It is within reason that vegetation-related variables and % of depression prevalence do not have a linear relationship.

Moving forward, other model types could be applied if their assumptions are also met, such as a decision tree or random forest model, though those are the only other models I know of; there may well be a model out there that is a better fit (a brief comparison sketch follows below). It is likely that the chosen variables simply do not have a linear relationship, in which case OLS regression should not be the model of choice.
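
Below is a minimal sketch of that kind of comparison, assuming X and y from the Fit and Predict cell are still in memory. The random forest settings are illustrative defaults rather than tuned values, and this comparison was not run in the original workflow.

# Compare OLS regression and a random forest with 5-fold cross-validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

models = [
    ('OLS regression', LinearRegression()),
    ('Random forest', RandomForestRegressor(n_estimators=200, random_state=42)),
]
for name, model in models:
    # Score each model on held-out folds using R^2
    scores = cross_val_score(model, X, y.values.ravel(), cv=5, scoring='r2')
    print(f'{name}: mean cross-validated R^2 = {scores.mean():.3f}')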

Besides choosing another model, further data transformations could be done, such as normalizing or standardizing the data to account for the very different scales of depression prevalence compared to the vegetation-related variables (see the sketch below). There are also other methods to transform data toward a normal distribution; I don't know much about them, but they could be researched to see whether another method is a better fit for these data.
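
A minimal sketch of the standardization idea is below, assuming X_train and y_train from the Fit and Predict cell are still in memory. Note that standardizing the predictors does not change the fit of plain OLS; it mainly makes the coefficients comparable across predictors with very different scales and matters more for regularized models.

# Standardize the predictors inside a pipeline, then fit OLS
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_ols = make_pipeline(StandardScaler(), LinearRegression())
scaled_ols.fit(X_train, y_train)

# Coefficients now give the change in log depression per one standard
# deviation change in each predictor, so their magnitudes can be compared
coefs = scaled_ols.named_steps['linearregression'].coef_.ravel()
for name, coef in zip(X_train.columns, coefs):
    print(f'{name}: {coef:.4f}')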

Additionally, on the data side, a different variable could be chosen to relate to depression, or vice versa. I would be interested in looking into a variable related to sunny days or something similar, because Denver, and much of Colorado, has many sunny days, which I think might also have a connection to % of depression prevalence. I wonder what a sun-related relationship would look like, or how edge density combined with a sunny-days variable would relate to depression prevalence.