distribution_fit_utils

   1from csv import QUOTE_ALL
   2import glob
   3import math
   4import os
   5import sys
   6import numpy as np
   7import pandas as pd
   8import json
   9import itertools
  10from fitter import Fitter, get_common_distributions
  11from datetime import timedelta
  12from utils import Utils
  13from des_parallel_process import parallelProcessJoblib, collateRunResults, removeExistingResults
  14from datetime import datetime
  15import visualisation._job_outcome_calculation as _job_outcome_calculation
  16
  17os.chdir(os.path.dirname(os.path.realpath(__file__)))
  18sys.path.append(os.path.dirname(os.path.realpath(__file__)))
  19
  20
class DistributionFitUtils():
    """
        # The DistributionFitUtils class

        This class will import a CSV, undertake some light
        wrangling and then determine the distributions and probabilities required
        for the Discrete Event Simulation.

        Example usage:
            my_data = DistributionFitUtils('data/my_data.csv')
            my_data.import_and_wrangle()

    """
  34
  35    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
  36
  37        self.file_path = file_path
  38        self.df = pd.DataFrame()
  39
        # The number of additional years of school holidays
        # that will be calculated beyond the maximum date in the provided dataset
  42        self.school_holidays_years = school_holidays_years
  43        self.calculate_school_holidays = calculate_school_holidays
  44
  45        self.times_to_fit = [
  46            {"hems_result": "Patient Treated but not conveyed by HEMS",
  47            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  48            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  49            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  50            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
  51            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  52        ]
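        # Times not listed for a given hems_result (e.g. time_on_scene for "Stand Down")
        # have no distribution fitted; the simulation is expected to treat those as 0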
  53
  54        self.sim_tools_distr_plus = [
  55            "poisson",
  56            "bernoulli",
  57            "triang",
  58            "erlang",
  59            "weibull_min",
  60            "expon_weib",
  61            "betabinom",
  62            "pearson3",
  63            "cauchy",
  64            "chi2",
  65            "expon",
  66            "exponpow",
  67            "gamma",
  68            "lognorm",
  69            "norm",
  70            "powerlaw",
  71            "rayleigh",
  72            "uniform",
  73            "neg_binomial",
  74            "zip"
  75        ]
  76        # SR 16-04-2025 Have hardcoded the common distributions
  77        # to make setup for random number generation more robust
  78        #+ get_common_distributions()
  79
    def removeExistingResults(self, folder: str) -> None:
        """
            Removes results from previous fitting
        """

        matching_files = glob.glob(os.path.join(folder, "*.*"))

        print(matching_files)

        for file in matching_files:
            os.remove(file)
  91
  92
    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
        """

            Convenience function for Fitter.
            Returns the model and parameters that are considered
            the 'best fit'.

            TODO: Determine how Fitter works this out

        """

        if q_times.size > 0:
            if len(distr) > 0:
                f = Fitter(q_times, timeout=60, distributions=distr)
            else:
                f = Fitter(q_times, timeout=60)
            f.fit()
            if show_summary:
                f.summary()
            return f.get_best()
        else:
            return {}
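    # Note: Fitter.get_best() returns a dict keyed by the name of the best-fitting
    # distribution; in recent versions of the fitter package the value is a dict of
    # named parameters (older releases returned a plain tuple), so code reading the
    # saved distribution files should allow for either form.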
 115
 116
    def import_and_wrangle(self):
        """

            Function to import a CSV, add additional columns that are required
            and then sequentially execute other class functions to generate
            the probabilities and distributions required for the DES.

            TODO: Additional logic is required to check the imported CSV
            for missing values, incorrect column names etc.

        """

        try:
            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
            self.df = df

            # Perhaps run some kind of checking function here.

        except FileNotFoundError:
            print(f"Cannot locate file {self.file_path}")
            return
 137
 138        # If everything is okay, crack on...
 139        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
        self.df['date_only'] = pd.to_datetime(self.df['inc_date'].dt.date)
 141        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
 142        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
 143        self.df['month'] = self.df['inc_date'].dt.month
 144        self.df['quarter'] = self.df['inc_date'].dt.quarter
 145        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
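        # Casting to 'datetime64[M]' truncates each timestamp to month precision,
        # i.e. the first day of the incident's month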
 146
        # Replacing an upper quartile limit on job cycle times and
        # instead using a manually specified time frame.
        # This has the advantage of allowing manual amendment of the values
        # on the front-end
 151        # self.max_values_df = self.upper_allowable_time_bounds()
 152        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
 153        #print(self.min_max_values_df)
 154
 155        #This will be needed for other datasets, but has already been computed for DAA
 156        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
 157
 158        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
 159        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
 160
        # Get proportions of AMPDS card by hour of day
        self.hour_by_ampds_card_probs()

        # Determine 'best' distributions for time-based data
        self.activity_time_distributions()

        # Calculate the probability that the patient will be female based on AMPDS card
        self.sex_by_ampds_card_probs()

        # Determine 'best' distributions for age ranges stratified by AMPDS card
        self.age_distributions()

        # Alternative approach to IA times. Start with the probability of a call at a given hour, stratified by quarter
        self.hourly_arrival_by_qtr_probs()

        # Fit distributions (and build weighted empirical samples) for the number of incidents per day, stratified by season and quarter
        self.incidents_per_day()
        self.incidents_per_day_samples()
 179
 180        # Calculate probability of enhanced or critical care being required based on AMPDS card
 181        self.enhanced_or_critical_care_by_ampds_card_probs()
 182
 183        # Calculate HEMS result
 184        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
 185
 186        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
 187        # self.callsign_group_by_ampds_card_and_hour_probs()
 188        # self.callsign_group_by_ampds_card_probs()
 189        # self.callsign_group_by_care_category()
 190        self.callsign_group()
 191
        # Calculate probability of a particular vehicle type based on callsign group and quarter of the year
 193        # self.vehicle_type_by_month_probs()
 194        self.vehicle_type_by_quarter_probs()
 195        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
 196
 197        # Calculate the patient outcome (conveyed, deceased, unknown)
 198        self.patient_outcome_by_care_category_and_quarter_probs()
 199
 200        # ============= ARCHIVED CODE ================= #
 201        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
 202        # self.inter_arrival_times()
 203        # ============= END ARCHIVED CODE ================= #
 204
 205
 206        # ============= ARCHIVED CODE ================= #
        # Calculate probability of patient outcome
 208        # Note - this still needs to be run to support another one?
 209        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
 210        # self.pt_outcome_by_hems_result_and_care_category_probs()
 211        # ============= END ARCHIVED CODE ================= #
 212
 213        # ============= ARCHIVED CODE ================= #
 214        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
 215        # ============= END ARCHIVED CODE ================= #
 216
 217        # ============= ARCHIVED CODE ================= #
        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
 219        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
 220        # ============= END ARCHIVED CODE ================= #
 221
 222        # ============= ARCHIVED CODE ================= #
 223        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
 224        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
 225        # ============= END ARCHIVED CODE ================= #
 226
 227
 228        # Calculate school holidays since servicing schedules typically avoid these dates
 229        if self.calculate_school_holidays:
 230            self.school_holidays()
 231
 232        # Calculate historical data
 233        self.historical_monthly_totals()
 234        self.historical_monthly_totals_by_callsign()
 235        self.historical_monthly_totals_by_day_of_week()
 236        self.historical_median_time_of_activities_by_month_and_resource_type()
 237        self.historical_monthly_totals_by_hour_of_day()
 238        self.historical_monthly_resource_utilisation()
 239        self.historical_monthly_totals_all_calls()
 240        self.historical_daily_calls_breakdown()
 241        self.historical_job_durations_breakdown()
 242        self.historical_missed_jobs()
 243        self.historical_jobs_per_day_per_callsign()
 244        self.historical_care_cat_counts()
 245
 246        # Calculate proportions of ad hoc unavailability
 247        try:
 248            # self.ad_hoc_unavailability()
 249            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
 250        except FileNotFoundError:
 251            print("Couldn't find ad-hoc unavailability file")
 252
 253    def hour_by_ampds_card_probs(self):
 254        """
 255
 256            Calculates the proportions of calls that are triaged with
 257            a specific AMPDS card. This is stratified by hour of day
 258
 259            TODO: Determine whether this should also be stratified by yearly quarter
 260
 261        """
 262        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
 263        total_counts = category_counts.groupby('hour')['count'].transform('sum')
 264        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
 265
 266        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
 267
 268        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")
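        # Output has one row per (hour, ampds_card) combination with 'count' and
        # 'proportion' columns; proportions sum to (approximately) 1 within each hour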
 269
 270
 271    def sex_by_ampds_card_probs(self):
 272        """
 273
 274            Calculates the probability that the patient will be female
 275            stratified by AMPDS card.
 276
 277        """
 278        age_df = self.df
 279        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
 280        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
 281        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
 282
 283        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
 284
 285    def activity_time_distributions(self):
 286        """
 287
 288            Determine the 'best' distribution for each phase of a call
 289            i.e. Allocation time, Mobilisation time, Time to scene
 290            Time on scene, Travel time to hospital and handover, Time to clear.
 291            Not all times will apply to all cases, so the class 'times_to_fit'
 292            variable is a list of dictionaries, which contains the times to fit
 293
            The data is currently stratified by the hems_result and vehicle_type fields.
 295
 296        """
 297
 298        vehicle_type = self.df['vehicle_type'].dropna().unique()
 299
 300        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
 301        # Probably easier than complicated logic to determine what times should be available based on hems_result
 302
 303        final_distr = []
 304
 305        for row in self.times_to_fit:
 306            #print(row)
 307            for ttf in row['times_to_fit']:
 308                for vt in vehicle_type:
 309                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
 310
 311                    # This line might not be required if data quality is determined when importing the data
 312                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
 313                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
 314
 315                    #print(f"Max time is {max_time} and Min time is {min_time}")
 316
 317                    if ttf == 'time_on_scene':
 318                        # There is virtually no data for HEMS_result other than patient conveyed
 319                        # which is causing issues with fitting. For time on scene, will
 320                        # use a simplified fitting ignoring hems_result as a category
 321                        fit_times = self.df[
 322                            (self.df.vehicle_type == vt) &
 323                            (self.df[ttf] >= min_time) &
 324                            (self.df[ttf] <= max_time)
 325                        ][ttf]
 326                    else:
 327                        fit_times = self.df[
 328                            (self.df.vehicle_type == vt) &
 329                            (self.df[ttf] >= min_time) &
 330                            (self.df[ttf] <= max_time) &
 331                            (self.df.hems_result == row['hems_result'])
 332                        ][ttf]
 333                    #print(fit_times[:10])
 334                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
 335                    #print(best_fit)
 336
 337                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
 338                    #print(return_dict)
 339                    final_distr.append(return_dict)
 340
        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(final_distr))
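        # Each entry in the saved JSON list has the form
        # {"vehicle_type": ..., "time_type": ..., "best_fit": {...}, "hems_result": ..., "n": ...}
        # where "best_fit" is the dictionary returned by getBestFit()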
 344
 345
 346    def age_distributions(self):
 347        """
 348
 349            Determine the 'best' distribution for age stratified by
 350            AMPDS card
 351
 352        """
 353
 354        age_distr = []
 355
 356        age_df = self.df[["age", "ampds_card"]].dropna()
 357        ampds_cards = age_df['ampds_card'].unique()
 358        print(ampds_cards)
 359
 360        for card in ampds_cards:
 361            fit_ages = age_df[age_df['ampds_card'] == card]['age']
 362            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
 363            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
 364            age_distr.append(return_dict)
 365
        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(age_distr))
 369
 370
 371    # def inter_arrival_times(self):
 372    #     """
 373
 374    #         Calculate the mean inter-arrival times for patients
 375    #         stratified by hour, and and yearly quarter
 376
 377    #     """
 378
 379    #     ia_df = self.df[['date_only', 'quarter', 'hour']].dropna()
 380
 381    #     count_df = ia_df.groupby(['hour', 'date_only', 'quarter']).size().reset_index(name='n')
 382
 383    #     ia_times_df = (
 384    #         count_df.groupby(['hour', 'quarter'])
 385    #         .agg(
 386    #             # max_arrivals_per_hour=('n', lambda x: round(60 / np.max(x), 3)),
 387    #             # min_arrivals_per_hour=('n', lambda x: round(60 / np.min(x),3)),
 388    #             mean_cases=('n', lambda x: round(x.mean(), 1)),
 389    #             # sd_cases=('n', lambda x: round(x.std(), 3)),
 390    #             mean_iat=('n', lambda x: 60 / x.mean())
 391    #             # n=('n', 'size')
 392    #         )
 393    #         .reset_index()
 394    #     )
 395    #     # Additional column for NSPPThinning
 396    #     ia_times_df['t'] = ia_times_df['hour']
 397    #     ia_times_df['arrival_rate'] = ia_times_df['mean_iat'].apply(lambda x: 1/x)
 398
 399    #     ia_times_df.to_csv('distribution_data/inter_arrival_times.csv', mode='w+')
 400
 401
    def incidents_per_day(self):
        """
        Fit distributions for the number of incidents per day using actual daily counts,
        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
        stratified by season and quarter.
        """
 411
 412        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
 413        inc_df['year'] = inc_df['date_only'].dt.year
 414
 415        # Daily incident counts
 416        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 417        inc_per_day['year'] = inc_per_day['date_only'].dt.year
 418
 419        # Merge quarter and season from self.df
 420        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
 421
 422        if 'season' not in self.df.columns:
 423            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 424        else:
 425            date_info = date_info.merge(
 426                self.df[['date_only', 'season']].drop_duplicates(),
 427                on='date_only',
 428                how='left'
 429            )
 430
 431        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
 432
        # Weight settings - a simple replication-based weighting rather than a weighted-mean approach
        year_weights = {
            2023: 1.0,
            2024: 4.0  # weight 2024 four times more heavily than 2023
        }
 438
 439        # ========== SEASONAL DISTRIBUTIONS ==========
 440        jpd_distr = []
 441
 442        for season in inc_per_day['season'].dropna().unique():
 443            filtered = inc_per_day[inc_per_day['season'] == season].copy()
 444            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 445
 446            # Repeat rows proportionally by weight
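            # (approximates weighted fitting by replication: each day's count is repeated
            # weight * 10 times, e.g. 10 copies for weight 1.0 and 40 for weight 4.0, so
            # more heavily weighted years contribute proportionally more observations)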
 447            replicated = filtered.loc[
 448                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 449            ]['jobs_per_day']
 450
 451            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 452
 453            jpd_distr.append({
 454                "season": season,
 455                "best_fit": best_fit,
 456                "min_n_per_day": int(replicated.min()),
 457                "max_n_per_day": int(replicated.max()),
 458                "mean_n_per_day": float(replicated.mean())
 459            })
 460
 461        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
 462            json.dump(jpd_distr, f)
 463
 464        # ========== QUARTERLY DISTRIBUTIONS ==========
 465        jpd_qtr_distr = []
 466
 467        for quarter in inc_per_day['quarter'].dropna().unique():
 468            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
 469            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 470
 471            replicated = filtered.loc[
 472                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 473            ]['jobs_per_day']
 474
 475            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 476
 477            jpd_qtr_distr.append({
 478                "quarter": int(quarter),
 479                "best_fit": best_fit,
 480                "min_n_per_day": int(replicated.min()),
 481                "max_n_per_day": int(replicated.max()),
 482                "mean_n_per_day": float(replicated.mean())
 483            })
 484
 485        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
 486            json.dump(jpd_qtr_distr, f)
 487
 488    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
 489        """
 490            Create weighted empirical samples of incidents per day by season and quarter.
 491        """
 492
 493        inc_df = self.df[['date_only', 'quarter']].copy()
 494        inc_df['year'] = inc_df['date_only'].dt.year
 495        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 496
 497        # Get raw counts per day
 498        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 499        daily_counts['year'] = daily_counts['date_only'].dt.year
 500
 501        # Merge back in season/quarter info
 502        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
 503        if 'season' in self.df.columns:
 504            meta_info = meta_info.merge(
 505                self.df[['date_only', 'season']].drop_duplicates(),
 506                on='date_only', how='left'
 507            )
 508        else:
 509            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 510
 511        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
 512
 513        # Year weight map
 514        if weight_map is None:
 515            weight_map = {2023: 1.0, 2024: 1.1}
 516
 517        # Compute weights
 518        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
 519
 520        # Storage
 521        empirical_samples = {}
 522
 523        # Season-based
 524        for season in daily_counts['season'].dropna().unique():
 525            filtered = daily_counts[daily_counts['season'] == season].copy()
 526            repeated = filtered.loc[
 527                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 528            ]['jobs_per_day'].tolist()
 529
 530            empirical_samples[season] = repeated
 531
 532        # Quarter-based
 533        for quarter in daily_counts['quarter'].dropna().unique():
 534            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
 535            repeated = filtered.loc[
 536                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 537            ]['jobs_per_day'].tolist()
 538
 539            empirical_samples[f"Q{int(quarter)}"] = repeated
 540
 541        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
 542            json.dump(empirical_samples, f)
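        # The resulting JSON maps each season ("winter"/"summer") and quarter ("Q1"-"Q4")
        # to a weighted list of daily job counts, e.g. {"winter": [...], "Q1": [...], ...}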
 543
 544
 545    def enhanced_or_critical_care_by_ampds_card_probs(self):
 546        """
 547
            Calculates the probability of an enhanced or critical care resource being required
 549            based on the AMPDS card
 550
 551        """
 552
 553        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
 554
 555        def assign_care_category(row):
 556            # There are some columns with both EC and CC benefit selected
 557            # this function will allocate to only 1
 558            if row['cc_benefit'] == 'y':
 559                return 'CC'
 560            elif row['ec_benefit'] == 'y':
 561                return 'EC'
 562            else:
 563                return 'REG'
 564
 565        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
 566
 567        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
 568        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
 569
 570        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
 571
 572        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)
 573
 574    def patient_outcome_by_care_category_and_quarter_probs(self):
 575        """
 576
            Calculates the probability of a patient outcome based on care category and yearly quarter
 578
 579        """
 580
 581        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
 582
 583        def assign_care_category(row):
 584            # There are some columns with both EC and CC benefit selected
 585            # this function will allocate to only 1
 586            if row['cc_benefit'] == 'y':
 587                return 'CC'
 588            elif row['ec_benefit'] == 'y':
 589                return 'EC'
 590            else:
 591                return 'REG'
 592
 593        # There are some values that are missing e.g. CC quarter 1 Deceased
 594        # I think we've had problems when trying to sample from this kind of thing before
 595        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 596        outcomes = po_df['pt_outcome'].unique()
 597        care_categories = ['CC', 'EC', 'REG']
 598        quarters = po_df['quarter'].unique()
 599
 600        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
 601                                    columns=['pt_outcome', 'care_category', 'quarter'])
 602
 603        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
 604
 605        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
 606
 607        merged = pd.merge(all_combinations, po_cat_counts,
 608                      on=['pt_outcome', 'care_category', 'quarter'],
 609                      how='left').fillna({'count': 0})
 610        merged['count'] = merged['count'].astype(int)
 611
 612        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
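        # replace(0, 1) guards against division by zero for combinations that never occur,
        # so those rows get a proportion of 0.0 rather than NaN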
 613        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 614
 615        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)
 616
 617    # def hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 618    #     """
 619
 620    #         Calculates the probabilty of a given HEMS result based on
 621    #         patient outcome, yearly quarter, vehicle type and callsign group
 622
 623    #     """
 624
 625    #     hr_df = self.df[['hems_result', 'quarter', 'pt_outcome', 'vehicle_type', 'callsign_group']].copy()
 626
 627    #     # There are some values that are missing e.g. CC quarter 1 Deceased
 628    #     # I think we've had problems when trying to sample from this kind of thing before
 629    #     # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 630    #     # hems_results = hr_df['hems_result'].unique()
 631    #     # outcomes = hr_df['pt_outcome'].unique()
 632    #     # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 633    #     # callsign_group_categories = hr_df['callsign_group'].unique()
 634    #     # quarters = hr_df['quarter'].unique()
 635
 636    #     # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 637    #     #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 638
 639    #     hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']).size().reset_index(name='count')
 640
 641    #     # merged = pd.merge(all_combinations, hr_cat_counts,
 642    #     #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 643    #     #               how='left').fillna({'count': 0})
 644    #     # merged['count'] = merged['count'].astype(int)
 645
 646    #     merged = hr_cat_counts
 647
 648    #     total_counts = merged.groupby(['pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])['count'].transform('sum')
 649    #     merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 650
 651    #     merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 652
 653    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 654        """
 655
            Calculates the probability of a given HEMS result based on
 657                - patient outcome
 658                - yearly quarter
 659                - time of day (7am - 6pm, 7pm - 6am)
 660                - vehicle type
 661                - and callsign group
 662
 663        """
 664        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 665        self.df['hour'] = self.df['inc_date'].dt.hour
 666        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
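        # 'day' covers hours 7-18 inclusive (07:00-18:59); all other hours are classed as 'night'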
 667
 668        hr_df = self.df[[
 669            'hems_result', 'quarter', 'pt_outcome',
 670            'vehicle_type', 'callsign_group', 'time_of_day'
 671            ]].copy()
 672
 673        # There are some values that are missing e.g. CC quarter 1 Deceased
 674        # I think we've had problems when trying to sample from this kind of thing before
 675        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 676        # hems_results = hr_df['hems_result'].unique()
 677        # outcomes = hr_df['pt_outcome'].unique()
 678        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 679        # callsign_group_categories = hr_df['callsign_group'].unique()
 680        # quarters = hr_df['quarter'].unique()
 681
 682        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 683        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 684
 685        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
 686                                       'vehicle_type', 'callsign_group',
 687                                       'quarter', 'time_of_day']).size().reset_index(name='count')
 688
 689        # merged = pd.merge(all_combinations, hr_cat_counts,
 690        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 691        #               how='left').fillna({'count': 0})
 692        # merged['count'] = merged['count'].astype(int)
 693
 694        merged = hr_cat_counts
 695
 696        total_counts = merged.groupby(['pt_outcome',
 697                                       'vehicle_type', 'callsign_group',
 698                                       'quarter', 'time_of_day'])['count'].transform('sum')
 699        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 700
 701        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 702
 703
 704
 705    def hourly_arrival_by_qtr_probs(self):
 706        """
 707
 708            Calculates the proportions of calls arriving in any given hour
 709            stratified by yearly quarter
 710
 711        """
 712
 713        ia_df = self.df[['quarter', 'hour']].dropna()
 714
 715        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
 716        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
 717        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
 718
 719        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")
 720
 721
 722    def callsign_group_by_ampds_card_and_hour_probs(self):
 723        """
 724
            Calculates the probability of a specific callsign group being allocated to
            a call based on the AMPDS card category and hour of day
 727
 728        """
 729        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
 730
 731        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
 732        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 733
 734        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)
 735
 736    def callsign_group_by_ampds_card_probs(self):
 737        """
 738
            Calculates the probability of a specific callsign group being allocated to
            a call based on the AMPDS card category
 741
 742        """
 743
 744        callsign_df = self.df[self.df['callsign_group'] != 'Other']
 745
 746        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
 747
 748        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
 749        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 750
 751        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)
 752
 753    def callsign_group(self):
 754        """
            Calculates the probability of a specific callsign group being allocated to
            a call
 757        """
 758        df = self.df.copy()
 759
 760        # Convert time fields to numeric
 761        time_fields = [
 762            "time_allocation", "time_mobile", "time_to_scene",
 763            "time_on_scene", "time_to_hospital", "time_to_clear"
 764        ]
 765        for col in time_fields:
 766            df[col] = pd.to_numeric(df[col], errors="coerce")
 767
 768        # Calculate total job duration in minutes
 769        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 770
 771        # Compute job start and end times
 772        df["start_time"] = df["inc_date"]
 773        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 774
 775        # Sort jobs by start time
 776        df = df.sort_values(by="start_time").reset_index(drop=True)
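        # Sorting by start time allows the overlap scan below to break early: once a later
        # job starts after the current job's end time, no subsequent job can overlap it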
 777
 778        # Set to hold indices of jobs that overlap (but only the later-starting ones)
 779        overlapping = set()
 780
 781        # Check for overlaps
 782        for i in range(len(df)):
 783            this_end = df.at[i, "end_time"]
 784
 785            # Compare only to later jobs
 786            for j in range(i + 1, len(df)):
 787                next_start = df.at[j, "start_time"]
 788                if next_start >= this_end:
 789                    break  # No more possible overlaps
 790                # If it starts before i's job ends, it's overlapping
 791                overlapping.add(j)
 792
 793        # Mark the overlaps in the dataframe
 794        df["overlaps"] = df.index.isin(overlapping)
 795
 796        # Filter out overlapping jobs
 797        df_no_overlap = df[~df["overlaps"]]
 798
 799        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 800        # the vehicles to be recorded as offline
 801
 802        data = df_no_overlap.copy()
 803
 804        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 805        # off for servicing if that data is available
 806        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 807        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 808
 809        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 810        data["vehicle"] = data["vehicle"].str.lower()
 811
 812        # Create a cross-join between data and ad_hoc
 813        data['key'] = 1
 814        ad_hoc['key'] = 1
 815        merged = data.merge(ad_hoc, on='key')
 816
 817        # Keep rows where inc_date falls within the offline period
 818        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 819
 820        # Filter out those rows from the original data
 821        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
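        # Note: this filter matches on timestamp only (not on which vehicle attended), so an
        # incident is excluded whenever it falls within any resource's recorded offline window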
 822
 823        callsign_df = (
 824            df_no_overlap
 825            .assign(
 826                helicopter_benefit=np.select(
 827                    [
 828                        df_no_overlap["cc_benefit"] == "y",
 829                        df_no_overlap["ec_benefit"] == "y",
 830                        df_no_overlap["hems_result"].isin([
 831                            "Stand Down En Route",
 832                            "Landed but no patient contact",
 833                            "Stand Down Before Mobile"
 834                        ])
 835                    ],
 836                    ["y", "y", "n"],
 837                    default=df_no_overlap["helicopter_benefit"]
 838                ),
 839                care_category=np.select(
 840                    [
 841                        df_no_overlap["cc_benefit"] == "y",
 842                        df_no_overlap["ec_benefit"] == "y"
 843                    ],
 844                    ["CC", "EC"],
 845                    default="REG"
 846                )
 847            )
 848        )
 849
 850        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 851
 852        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
 853
 854        total_counts = len(callsign_df)
 855        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 856
 857        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)
 858
 859
 860    # def callsign_group_by_care_category(self):
 861    #     """
 862
 863    #         Calculates the probabilty of a specific callsign being allocated to
 864    #         a call based on the care category
 865
 866    #     """
 867    #     df = self.df.copy()
 868
 869    #     # Convert time fields to numeric
 870    #     time_fields = [
 871    #         "time_allocation", "time_mobile", "time_to_scene",
 872    #         "time_on_scene", "time_to_hospital", "time_to_clear"
 873    #     ]
 874    #     for col in time_fields:
 875    #         df[col] = pd.to_numeric(df[col], errors="coerce")
 876
 877    #     # Calculate total job duration in minutes
 878    #     df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 879
 880    #     # Compute job start and end times
 881    #     df["start_time"] = df["inc_date"]
 882    #     df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 883
 884    #     # Sort jobs by start time
 885    #     df = df.sort_values(by="start_time").reset_index(drop=True)
 886
 887    #     # Set to hold indices of jobs that overlap (but only the later-starting ones)
 888    #     overlapping = set()
 889
 890    #     # Check for overlaps
 891    #     for i in range(len(df)):
 892    #         this_end = df.at[i, "end_time"]
 893
 894    #         # Compare only to later jobs
 895    #         for j in range(i + 1, len(df)):
 896    #             next_start = df.at[j, "start_time"]
 897    #             if next_start >= this_end:
 898    #                 break  # No more possible overlaps
 899    #             # If it starts before i's job ends, it's overlapping
 900    #             overlapping.add(j)
 901
 902    #     # Mark the overlaps in the dataframe
 903    #     df["overlaps"] = df.index.isin(overlapping)
 904
 905    #     # Filter out overlapping jobs
 906    #     df_no_overlap = df[~df["overlaps"]]
 907
 908
 909    #     callsign_df = (
 910    #         df_no_overlap
 911    #         .assign(
 912    #             helicopter_benefit=np.select(
 913    #                 [
 914    #                     df_no_overlap["cc_benefit"] == "y",
 915    #                     df_no_overlap["ec_benefit"] == "y",
 916    #                     df_no_overlap["hems_result"].isin([
 917    #                         "Stand Down En Route",
 918    #                         "Landed but no patient contact",
 919    #                         "Stand Down Before Mobile"
 920    #                     ])
 921    #                 ],
 922    #                 ["y", "y", "n"],
 923    #                 default=df_no_overlap["helicopter_benefit"]
 924    #             ),
 925    #             care_category=np.select(
 926    #                 [
 927    #                     df_no_overlap["cc_benefit"] == "y",
 928    #                     df_no_overlap["ec_benefit"] == "y"
 929    #                 ],
 930    #                 ["CC", "EC"],
 931    #                 default="REG"
 932    #             )
 933    #         )
 934    #     )
 935
 936    #     callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 937
 938    #     callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count')
 939
 940    #     total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum')
 941    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 942
 943    #     callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode = "w+", index=False)
 944
 945    #========== ARCHIVED CODE ============ #
 946    # def vehicle_type_by_month_probs(self):
 947    #     """
 948
 949    #         Calculates the probabilty of a car/helicopter being allocated to
 950    #         a call based on the callsign group and month of the year
 951
 952    #     """
 953    #     callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count')
 954
 955    #     total_counts = callsign_counts.groupby(['callsign_group', 'month'])['count'].transform('sum')
 956    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 957
 958    #     callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode = "w+")
 959    #========== END ARCHIVED CODE ============ #
    def vehicle_type_by_quarter_probs(self):
        """

            Calculates the probability of a car/helicopter being allocated to
            a call based on the callsign group and quarter of the year

            Quarter accounts for seasonal variation without being as affected by
            the small sample sizes that monthly stratification would produce.

        """
 969        data = self.df.copy()
 970
 971        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 972        # the vehicles to be recorded as offline
 973
 974        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 975        # off for servicing if that data is available
 976        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 977        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 978
 979        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 980        data["vehicle"] = data["vehicle"].str.lower()
 981
 982        # Create a cross-join between data and ad_hoc
 983        data['key'] = 1
 984        ad_hoc['key'] = 1
 985        merged = data.merge(ad_hoc, on='key')
 986
 987        # Keep rows where inc_date falls within the offline period
 988        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 989
 990        # Filter out those rows from the original data
 991        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 992
 993        # First, calculate overall props
 994        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 995
 996        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 997        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 998
 999        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1000
1001        # Then, redo by quarter
1002        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1003
1004        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1005        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1006
1007        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")
1008
1009    def vehicle_type_probs(self):
1010        """
1011
            Calculates the probability of a car/helicopter being allocated to
1013            a call based on the callsign group
1014
1015        """
1016
1017        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1018
1019        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1020        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1021
1022        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1023
1024    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1025        """
1026
            Calculates the probability of a specific HEMS result being allocated to
            a call based on the callsign group and vehicle type
1029
1030            TODO: These probability calculation functions could probably be refactored into a single
1031            function and just specify columns and output name
1032
1033        """
1034        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1035
1036        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1037        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1038
1039        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)
1040
1041
1042    #========== ARCHIVED CODE ============ #
1043    # def hems_result_by_care_cat_and_helicopter_benefit_probs(self):
1044    #     """
1045
1046    #         Calculates the probabilty of a specific HEMS result being allocated to
1047    #         a call based on the care category amd whether a helicopter is beneficial
1048
1049    #     """
1050
1051    #     # Wrangle the data...trying numpy for a change
1052
1053    #     hems_df = (
1054    #         self.df
1055    #         .assign(
1056    #             helicopter_benefit=np.select(
1057    #                 [
1058    #                     self.df["cc_benefit"] == "y",
1059    #                     self.df["ec_benefit"] == "y",
1060    #                     self.df["hems_result"].isin([
1061    #                         "Stand Down En Route",
1062    #                         "Landed but no patient contact",
1063    #                         "Stand Down Before Mobile"
1064    #                     ])
1065    #                 ],
1066    #                 ["y", "y", "n"],
1067    #                 default=self.df["helicopter_benefit"]
1068    #             ),
1069    #             care_cat=np.select(
1070    #                 [
1071    #                     self.df["cc_benefit"] == "y",
1072    #                     self.df["ec_benefit"] == "y"
1073    #                 ],
1074    #                 ["CC", "EC"],
1075    #                 default="REG"
1076    #             )
1077    #         )
1078    #     )
1079
1080    #     hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count')
1081
1082    #     hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum')
1083    #     hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4)
1084
1085    #     hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode = "w+", index=False)
1086    #========== END ARCHIVED CODE ============ #
1087
1088    #========== ARCHIVED CODE ============ #
1089    # def pt_outcome_by_hems_result_and_care_category_probs(self):
1090    #     """
1091
1092    #         Calculates the probabilty of a specific patient outcome based on HEMS result
1093
1094    #     """
1095
1096    #     hems_df = (
1097    #         self.df
1098    #         .assign(
1099    #             helicopter_benefit=np.select(
1100    #                 [
1101    #                     self.df["cc_benefit"] == "y",
1102    #                     self.df["ec_benefit"] == "y",
1103    #                     self.df["hems_result"].isin([
1104    #                         "Stand Down En Route",
1105    #                         "Landed but no patient contact",
1106    #                         "Stand Down Before Mobile"
1107    #                     ])
1108    #                 ],
1109    #                 ["y", "y", "n"],
1110    #                 default=self.df["helicopter_benefit"]
1111    #             ),
1112    #             care_category=np.select(
1113    #                 [
1114    #                     self.df["cc_benefit"] == "y",
1115    #                     self.df["ec_benefit"] == "y"
1116    #                 ],
1117    #                 ["CC", "EC"],
1118    #                 default="REG"
1119    #             )
1120    #         )
1121    #     )
1122
1123    #     po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count')
1124
1125    #     po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum')
1126    #     po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4)
1127
1128    #     po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode = "w+")
1129    #========== END ARCHIVED CODE ============ #
1130
    def school_holidays(self) -> None:
        """
            Function to generate a CSV file containing school holiday
            start and end dates for a given year. The year range is determined
            by the submitted data (plus the configured number of additional years
            beyond the end of the data, for good measure)
        """
1137
1138        min_date = self.df.inc_date.min()
1139        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1140
1141        u = Utils()
1142
1143        years_of_holidays_list = u.years_between(min_date, max_date)
1144
1145        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1146
1147        for i, year in enumerate(years_of_holidays_list):
1148            tmp = u.calculate_term_holidays(year)
1149
1150            if i == 0:
1151                sh = tmp
1152            else:
1153                sh = pd.concat([sh, tmp])
1154
1155        sh.to_csv('actual_data/school_holidays.csv', index = False)
1156
1157
    # These functions wrangle historical data to provide a comparison against the simulation outputs
1159
1160    def historical_jobs_per_day_per_callsign(self):
1161        df = self.df
1162
1163        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1164        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1165        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1166
1167        all_combinations = pd.DataFrame(
1168            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1169            columns=['date', 'callsign']
1170        ).dropna()
1171
1172        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1173        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
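        # Days on which a callsign attended no jobs are retained (filled with 0) so the
        # jobs-per-day distribution includes zero-job days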
1174
1175        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1176        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
1177
1178    def historical_care_cat_counts(self):
1179        """
1180        Process historical incident data to categorize care types and compute hourly counts.
1181
1182        This method performs the following steps:
1183        - Converts incident dates to datetime format.
1184        - Extracts month start and hour from the incident date.
1185        - Categorizes each incident into care categories based on benefit flags and attendance.
1186        - Counts the number of incidents by hour and care category.
1187        - Outputs these counts to a CSV file.
1188        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1189        (excluding those not attended by a DAA resource) to a text file.
1190
1191        Outputs:
1192        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1193        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1194        """
1195
1196        df_historical = self.df
1197
1198        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1199        # Extract the first day of the month and the hour of each incident
1200        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1201        df_historical['hour'] = df_historical.inc_date.dt.hour
1202
1203        conditions = [
1204            df_historical['cc_benefit'] == 'y',
1205            df_historical['ec_benefit'] == 'y',
1206            df_historical['helicopter_benefit'] == 'y',
1207            df_historical['callsign_group'] == 'Other'
1208        ]
1209
1210        choices = [
1211            'CC',
1212            'EC',
1213            'REG - helicopter benefit',
1214            'Unknown - DAA resource did not attend'
1215        ]
1216        # Assign care category to each record
1217        # If the case did not meet any of the criteria in 'conditions', it will default
1218        # to being labelled as a 'regular/REG' case (i.e there was no benefit recorded)
1219        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1220
1221        # Count occurrences grouped by hour and care category
1222        historical_value_counts_by_hour = (
1223            df_historical.value_counts(["hour", "care_category"])
1224            .reset_index(name="count")
1225            )
1226        # Output to CSV for use in tests and visualisations
1227        (historical_value_counts_by_hour
1228         .sort_values(['hour', 'care_category'])
1229         .to_csv("historical_data/historical_care_cat_counts.csv"))
1230
1231        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1232        # We assume these regular jobs follow different logic because they have an obvious expected patient
1233        # benefit from having a helicopter allocated, and we assume that benefit is apparent at the time
1234        # the call is placed (for example, the casualty being located in a remote location)
1235
1236        numerator = (historical_value_counts_by_hour[
1237            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1238            ]["count"].sum())
1239
1240        denominator = (historical_value_counts_by_hour[
1241            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1242            (historical_value_counts_by_hour["care_category"] == "REG")
1243            ]["count"].sum())
1244
1245        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1246            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1247
1248        # Count occurrences grouped by hour and care category
1249        historical_value_counts_by_hour_cc_ec = (
1250            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1251            .reset_index(name="count")
1252            )
1253
1254        # Output to CSV for use in tests and visualisations (currently disabled)
1255        # (historical_value_counts_by_hour_cc_ec
1256        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1257        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1258
1259
1260        numerator_cc = (
1261            historical_value_counts_by_hour_cc_ec[
1262                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1263                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1264                ]["count"].sum())
1265
1266        denominator_cc = (
1267            historical_value_counts_by_hour_cc_ec[
1268                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1269                ]["count"].sum())
1270
1271        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1272            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1273
1274        numerator_ec = (
1275            historical_value_counts_by_hour_cc_ec[
1276                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1277                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1278                ]["count"].sum())
1279
1280        denominator_ec = (
1281            historical_value_counts_by_hour_cc_ec[
1282                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1283                ]["count"].sum())
1284
1285        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1286            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))
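        # Illustrative sketch (not executed) of the np.select care-category assignment above, using
        # made-up rows: conditions are evaluated in order, so a job flagged with both cc_benefit and
        # ec_benefit is labelled 'CC', and a job matching no condition defaults to 'REG':
        #
        #   toy = pd.DataFrame({'cc_benefit':         ['y',  None, None],
        #                       'ec_benefit':         ['y',  'y',  None],
        #                       'helicopter_benefit': [None, None, None],
        #                       'callsign_group':     ['H70', 'H70', 'Other']})
        #   np.select([toy['cc_benefit'] == 'y', toy['ec_benefit'] == 'y',
        #              toy['helicopter_benefit'] == 'y', toy['callsign_group'] == 'Other'],
        #             ['CC', 'EC', 'REG - helicopter benefit', 'Unknown - DAA resource did not attend'],
        #             default='REG')
        #   # -> ['CC', 'EC', 'Unknown - DAA resource did not attend']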
1287
1288
1289
1290    def historical_monthly_totals(self):
1291        """
1292            Calculates monthly incident totals, including counts of car and helicopter stand-downs, from the provided dataset of historical data
1293        """
1294
1295        # Multiple resources can be sent to the same job.
1296        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1297            .drop_duplicates(subset="inc_date", keep="first").copy()
1298
1299        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down", na=False)
1300        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1301        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1302
1303        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1304                                stand_down_car=('stand_down_car', 'sum'),
1305                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1306                                total_jobs=('vehicle_type', 'size')
1307                            ).reset_index()
1308
1309        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)
1310
1311    def historical_monthly_totals_all_calls(self):
1312        """
1313            Calculates monthly totals of all calls from the provided dataset of historical data
1314        """
1315
1316        # Multiple resources can be sent to the same job.
1317        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1318
1319        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1320
1321        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)
1322
1323    def historical_monthly_totals_by_callsign(self):
1324        """
1325            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1326        """
1327
1328        # Multiple resources can be sent to the same job.
1329        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1330
1331        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1332
1333        #print(monthly_totals_df.head())
1334
1335        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1336
1337        #print(monthly_totals_pivot_df.head())
1338
1339        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
1340
1341    def historical_monthly_totals_by_hour_of_day(self):
1342        """
1343            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1344        """
1345
1346        # Multiple resources can be sent to the same job.
1347        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1348            .drop_duplicates(subset="inc_date", keep="first")
1349
1350        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1351
1352        #print(monthly_totals_df.head())
1353
1354        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1355
1356        #print(monthly_totals_pivot_df.head())
1357
1358        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)
1359
1360    def historical_monthly_totals_by_day_of_week(self):
1361        """
1362            Calculates number of incidents per month stratified by day of the week
1363        """
1364
1365        # Multiple resources can be sent to the same job.
1366        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1367            .drop_duplicates(subset="inc_date", keep="first")
1368
1369        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1370
1371        #print(monthly_totals_df.head())
1372
1373        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1374
1375        #print(monthly_totals_pivot_df.head())
1376
1377        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)
1378
1379    def historical_median_time_of_activities_by_month_and_resource_type(self):
1380        """
1381            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1382        """
1383
1384        median_df = self.df[['first_day_of_month', 'time_allocation',
1385                             'time_mobile', 'time_to_scene', 'time_on_scene',
1386                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1387
1388        median_df['total_job_time'] = median_df[[
1389            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1390            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1391
1392        # Optionally, zeros could be replaced with NaN to exclude them from the median calculation, since a job
1393        # stood down en route has a time_on_scene of zero, which would drag down the median. Currently disabled:
1394        # median_df.replace(0, np.nan, inplace=True)
1395
1396        # Grouping by month and resource_type, calculating medians
1397        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1398
1399        pivot_data = median_times.pivot_table(
1400            index='first_day_of_month',
1401            columns='vehicle_type',
1402            values=['time_allocation', 'time_mobile', 'time_to_scene',
1403                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1404        )
1405
1406        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
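        # pivot_table with a list of 'values' produces MultiIndex columns of the form
        # (measure, vehicle_type); the line above flattens e.g. ('time_on_scene', 'helicopter')
        # into 'median_helicopter_time_on_scene' before writing to CSV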
1407        pivot_data = pivot_data.reset_index()
1408
1409        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1410
1411    def historical_monthly_resource_utilisation(self):
1412        """
1413            Calculates number of, and time spent on, incidents per month stratified by callsign
1414        """
1415
1416        # Multiple resources can be sent to the same job.
1417        monthly_df = self.df[[
1418            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1419            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1420            'time_to_clear']].copy()
1421
1422        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1423
1424        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1425            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1426
1427        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1428
1429        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in  monthly_totals_pivot_df.columns]
1430        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1431
1432        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)
1433
1434    def historical_daily_calls_breakdown(self):
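        """
            Counts the number of calls received on each day of the historical data, writing the
            day-by-day breakdown to 'historical_data/historical_daily_calls_breakdown.csv' and the
            frequency of each daily call total to 'historical_data/historical_daily_calls.csv'
        """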
1435
1436        df = self.df
1437        # Convert inc_date to date only (remove time)
1438        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1439
1440        # Count number of calls per day
1441        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1442
1443        # Save the daily call counts with a 'day' index column
1444        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1445        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1446        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1447
1448        # Count how many days had the same number of calls
1449        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1450        calls_per_day_summary.columns = ['calls_in_day', 'days']
1451        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1452
1453    def historical_missed_jobs(self):
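        """
            Summarises historical jobs by whether a HEMS resource (helicopter or car) was available
            and sent, or no HEMS resource was available, stratified by month, by hour, and by quarter
            and hour, writing each summary to a CSV in 'historical_data/'
        """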
1454        df = self.df
1455        df["date"] = pd.to_datetime(df["inc_date"])
1456        df["hour"] = df["date"].dt.hour
1457        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1458        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1459        df["quarter"] = df["inc_date"].dt.quarter
1460
1461        # By month
1462        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1463        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1464
1465        # By hour
1466        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1467        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1468
1469        # By quarter and hour
1470        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1471        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1472
1473    def upper_allowable_time_bounds(self):
1474        """
1475            Calculates the maximum permissible time for each phase of an incident based on the supplied historical data.
1476            The intended cap is 1.5x the upper quartile of the data distribution; at present this method only prints the upper quartile.
1477        """
1478
1479        median_df = self.df[[
1480            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1481            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1482
1483        # Replacing zeros with NaN to exclude from median calculation
1484        # since if an HEMS result is Stood down en route, then time_on_scene
1485        # would be zero and affect the median
1486        median_df.replace(0, np.nan, inplace=True)
1487
1488        print(median_df.quantile(.75))
1489        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1490
1491    def historical_job_durations_breakdown(self):
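        """
            Produces a long-format breakdown of each job's activity times (allocation, mobilisation,
            travel to scene, on scene, travel to hospital, clear and total duration) by callsign and
            vehicle type, and writes it to 'historical_data/historical_job_durations_breakdown.csv'
        """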
1492
1493        df = self.df
1494
1495        cols = [
1496            'callsign', 'vehicle_type',
1497            'time_allocation', 'time_mobile',
1498            'time_to_scene', 'time_on_scene',
1499            'time_to_hospital', 'time_to_clear'
1500        ]
1501        df2 = df[cols].copy()
1502
1503        # 2. Add a 1-based row identifier
1504        df2['job_identifier'] = range(1, len(df2) + 1)
1505
1506        # 3. Compute total_duration as the row-wise sum of the time columns
1507        time_cols = [
1508            'time_allocation', 'time_mobile',
1509            'time_to_scene', 'time_on_scene',
1510            'time_to_hospital', 'time_to_clear'
1511        ]
1512        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1513
1514        #print(df2.head())
1515
1516        # 4. Pivot (melt) to long format
1517        df_long = df2.melt(
1518            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1519            value_vars=time_cols + ['total_duration'],
1520            var_name='name',
1521            value_name='value'
1522        )
1523
1524        #print(df_long[df_long.job_identifier == 1])
1525
1526        # 5. Drop any rows where callsign or vehicle_type is missing
1527        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1528        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1529
1530        # 6. Write out to CSV
1531        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
1532
1533    # ========== ARCHIVED CODE - v1 of calculate_availability_row ======================== #
1534    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end):
1535    #     """
1536    #     Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1537    #     Returns data tagged with bin, quarter, and downtime reason.
1538    #     """
1539
1540    #     registration = row['aircraft'].lower()
1541    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1542    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1543    #     reason = row.get('reason', None)
1544
1545    #     hour = downtime_start.hour
1546    #     if 0 <= hour <= 5:
1547    #         six_hour_bin = '00-05'
1548    #     elif 6 <= hour <= 11:
1549    #         six_hour_bin = '06-11'
1550    #     elif 12 <= hour <= 17:
1551    #         six_hour_bin = '12-17'
1552    #     else:
1553    #         six_hour_bin = '18-23'
1554
1555    #     quarter = downtime_start.quarter
1556
1557    #     # Match callsign
1558    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1559    #     if match.empty:
1560    #         return {
1561    #             'registration': registration,
1562    #             'offline': downtime_start,
1563    #             'online': downtime_end,
1564    #             'six_hour_bin': six_hour_bin,
1565    #             'quarter': quarter,
1566    #             'total_offline': None,
1567    #             'scheduled_minutes': None,
1568    #             'reason': reason,
1569    #             'proportion': None
1570    #         }
1571    #     callsign = match.iloc[0]['callsign']
1572
1573    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1574    #     if rota_rows.empty:
1575    #         return {
1576    #             'registration': registration,
1577    #             'offline': downtime_start,
1578    #             'online': downtime_end,
1579    #             'six_hour_bin': six_hour_bin,
1580    #             'quarter': quarter,
1581    #             'total_offline': None,
1582    #             'scheduled_minutes': None,
1583    #             'reason': reason,
1584    #             'proportion': None
1585    #         }
1586
1587    #     # Clip evaluation window to downtime period
1588    #     eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True))
1589    #     eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True))
1590
1591    #     total_scheduled_minutes = 0
1592    #     total_overlap_minutes = 0
1593
1594    #     current_day = eval_start
1595    #     while current_day <= eval_end:
1596    #         month = current_day.month
1597    #         season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1598
1599    #         for _, rota in rota_rows.iterrows():
1600    #             start_hour = rota[f'{season}_start']
1601    #             end_hour = rota[f'{season}_end']
1602
1603    #             rota_start = current_day + timedelta(hours=start_hour)
1604    #             rota_end = current_day + timedelta(hours=end_hour)
1605    #             if end_hour <= start_hour:
1606    #                 rota_end += timedelta(days=1)
1607
1608    #             # Count scheduled time regardless of overlap
1609    #             scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1610    #             total_scheduled_minutes += scheduled_minutes
1611
1612    #             # Count overlap only if intersecting with downtime
1613    #             overlap_start = max(downtime_start, rota_start)
1614    #             overlap_end = min(downtime_end, rota_end)
1615    #             if overlap_end > overlap_start:
1616    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1617    #                 total_overlap_minutes += overlap_minutes
1618
1619    #         current_day += timedelta(days=1)
1620
1621    #     if total_scheduled_minutes == 0:
1622    #         proportion = None
1623    #     else:
1624    #         proportion = total_overlap_minutes / total_scheduled_minutes
1625
1626    #     return {
1627    #         'registration': registration,
1628    #         'offline': downtime_start,
1629    #         'online': downtime_end,
1630    #         'six_hour_bin': six_hour_bin,
1631    #         'quarter': quarter,
1632    #         'total_offline': total_overlap_minutes,
1633    #         'scheduled_minutes': total_scheduled_minutes,
1634    #         'reason': reason,
1635    #         'proportion': proportion
1636    #     }
1637
1638
1639    # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= #
1640    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df):
1641    #     """
1642    #         Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1643    #         Returns data tagged with bin, quarter, and downtime reason.
1644    #     """
1645
1646    #     registration = row['aircraft'].lower()
1647    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1648    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1649    #     reason = row.get('reason', None)
1650
1651    #     hour = downtime_start.hour
1652    #     if 0 <= hour <= 5:
1653    #         six_hour_bin = '00-05'
1654    #     elif 6 <= hour <= 11:
1655    #         six_hour_bin = '06-11'
1656    #     elif 12 <= hour <= 17:
1657    #         six_hour_bin = '12-17'
1658    #     else:
1659    #         six_hour_bin = '18-23'
1660
1661    #     quarter = downtime_start.quarter
1662
1663    #     # Match callsign
1664    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1665    #     if match.empty:
1666    #         return {
1667    #             'registration': registration,
1668    #             'offline': downtime_start,
1669    #             'online': downtime_end,
1670    #             'six_hour_bin': six_hour_bin,
1671    #             'quarter': quarter,
1672    #             'total_offline': None,
1673    #             'scheduled_minutes': None,
1674    #             'reason': reason,
1675    #             'proportion': None
1676    #         }
1677    #     callsign = match.iloc[0]['callsign']
1678
1679    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1680    #     if rota_rows.empty:
1681    #         return {
1682    #             'registration': registration,
1683    #             'offline': downtime_start,
1684    #             'online': downtime_end,
1685    #             'six_hour_bin': six_hour_bin,
1686    #             'quarter': quarter,
1687    #             'total_offline': None,
1688    #             'scheduled_minutes': None,
1689    #             'reason': reason,
1690    #             'proportion': None
1691    #         }
1692
1693    #     month = downtime_start.month
1694    #     season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1695
1696    #     total_scheduled_minutes = 0
1697    #     total_overlap_minutes = 0
1698
1699    #     for _, rota in rota_rows.iterrows():
1700    #         start_hour = rota[f'{season}_start']
1701    #         end_hour = rota[f'{season}_end']
1702
1703    #         for base_day in [downtime_start.normalize() - timedelta(days=1),
1704    #                         downtime_start.normalize(),
1705    #                         downtime_start.normalize() + timedelta(days=1)]:
1706
1707    #             rota_start = base_day + timedelta(hours=start_hour)
1708    #             rota_end = base_day + timedelta(hours=end_hour)
1709    #             if end_hour <= start_hour:
1710    #                 rota_end += timedelta(days=1)
1711
1712    #             overlap_start = max(downtime_start, rota_start)
1713    #             overlap_end = min(downtime_end, rota_end)
1714
1715    #             if overlap_end > overlap_start:
1716    #                 scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1717    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1718
1719    #                 total_scheduled_minutes += scheduled_minutes
1720    #                 total_overlap_minutes += overlap_minutes
1721
1722    #     if total_scheduled_minutes == 0:
1723    #         proportion = None
1724    #     else:
1725    #         proportion = total_overlap_minutes / total_scheduled_minutes
1726
1727    #     return {
1728    #         'registration': registration,
1729    #         'offline': downtime_start,
1730    #         'online': downtime_end,
1731    #         'six_hour_bin': six_hour_bin,
1732    #         'quarter': quarter,
1733    #         'total_offline': total_overlap_minutes,
1734    #         'scheduled_minutes': total_scheduled_minutes,
1735    #         'reason': reason,
1736    #         'proportion': proportion
1737    #     }
1738
1739    # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ #
1740    # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False):
1741    #     """Process ad hoc unavailability records into a stratified probability table.
1742
1743    #     Calculates the probability of ad hoc unavailability and availability based
1744    #     on historical data. The data is stratified by aircraft registration,
1745    #     six-hour time bins, and calendar quarters. It ensures all standard
1746    #     reasons ('available', 'crew', 'weather', 'aircraft') are present for
1747    #     each combination. It filters out any registration/quarter/bin pairings
1748    #     with little scheduled time, marking their probabilities as blank. It also adds
1749    #     a count of the ad-hoc unavailability events considered for each probability calculation
1750    #     and the total scheduled time for the resource in that quarter/time period that was part of
1751    #     the calculations.
1752
1753    #     Returns
1754    #     -------
1755    #     None
1756    #         This function does not return a value but saves the calculated
1757    #         probabilities to 'distribution_data/ad_hoc_unavailability.csv'.
1758    #         The CSV file includes columns for registration, six_hour_bin,
1759    #         quarter, reason, probability, and count.
1760
1761    #     Notes
1762    #     -----
1763    #     This function relies on the following input files:
1764    #     - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records.
1765    #     - 'actual_data/HEMS_ROTA.csv': Contains rota information.
1766    #     - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations.
1767    #     It also depends on the 'calculate_availability_row' method within the
1768    #     same class. Ensure that the 'distribution_data' directory exists.
1769    #     """
1770    #     try:
1771    #         # Load data
1772    #         adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1773    #         adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1774    #         rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1775    #         callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1776
1777    #         # Process each ad hoc record
1778    #         results = adhoc_df.apply(
1779    #             lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end),
1780    #             axis=1
1781    #         )
1782    #         final_df = pd.DataFrame(results.tolist())
1783    #         final_df.to_csv("external_data/ad_hoc_intermediate.csv")
1784
1785    #         # Check if final_df is empty before proceeding
1786    #         if final_df.empty:
1787    #             print("No ad-hoc data processed, skipping file generation.")
1788    #             return
1789
1790    #         # Define the full set of reasons expected
1791    #         all_reasons = ['available', 'crew', 'weather', 'aircraft']
1792
1793    #         # --- Aggregate Data ---
1794    #         # Calculate job count per registration, quarter, AND bin
1795    #         unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts')
1796
1797    #         # Downtime by bin + quarter + reason (only for unavailability reasons)
1798    #         grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])]
1799    #         grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index()
1800
1801    #         # Scheduled time by bin + quarter
1802    #         scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index()
1803    #         scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'})
1804
1805    #         # Merge job counts into scheduled totals
1806    #         scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1807
1808    #         # Calculate total downtime per bin + quarter (for 'available' calculation)
1809    #         downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index()
1810    #         downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'})
1811
1812    #         # --- Create Full Grid ---
1813    #         # Get all unique combinations of registration, quarter, and bin
1814    #         unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates()
1815
1816    #         # Check for empty unique_bins
1817    #         if unique_bins.empty:
1818    #             print("No valid unique bins found, skipping file generation.")
1819    #             return
1820
1821    #         # Create the full grid by crossing unique bins with all reasons
1822    #         full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1)
1823
1824    #         # --- Merge Data into Full Grid ---
1825    #         full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1826    #         full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left')
1827    #         full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left')
1828
1829    #         # Fill NaNs created during merges
1830    #         full_grid['total_offline'] = full_grid['total_offline'].fillna(0)
1831    #         full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0)
1832    #         full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs
1833
1834    #         # --- Calculate Probabilities ---
1835    #         # Suppress division by zero warnings - we handle these next
1836    #         with np.errstate(divide='ignore', invalid='ignore'):
1837    #             prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled']
1838    #             prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled']
1839
1840    #             full_grid['probability'] = np.where(
1841    #                 full_grid['reason'] == 'available',
1842    #                 prob_avail,
1843    #                 prob_unavail
1844    #             )
1845
1846    #         # Handle NaN/Inf from division by zero, set them to 0.0 for now.
1847    #         full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0)
1848
1849    #         # --- Apply Threshold and Blanking ---
1850    #         # Condition for setting probability to blank
1851    #         condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude
1852
1853    #         # Convert probability to object to allow blanks, then apply the condition
1854    #         full_grid['probability'] = full_grid['probability'].astype(object)
1855    #         full_grid.loc[condition_for_blank, 'probability'] = ''
1856
1857    #         # --- Finalize and Save ---
1858    #         # Select and rename columns
1859    #         if include_debugging_cols:
1860    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']]
1861    #             final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"})
1862    #         else:
1863    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']]
1864    #         # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'})
1865
1866    #         # Sort and save
1867    #         final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True)
1868    #         final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False)
1869
1870    #         print("Ad-hoc unavailability probability table generated successfully.")
1871
1872    #     except FileNotFoundError:
1873    #         print("Couldn't generate ad-hoc unavailability due to missing file(s). "
1874    #             "Please ensure 'external_data/ad_hoc.csv', "
1875    #             "'actual_data/HEMS_ROTA.csv', and "
1876    #             "'actual_data/callsign_registration_lookup.csv' exist.")
1877    #         pass
1878    #     except Exception as e:
1879    #         print(f"An error occurred during ad-hoc unavailability processing: {e}")
1880    #         pass
1881
1882    def ad_hoc_unavailability(self, period_start, period_end):
1883        """
1884        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1885
1886        Args:
1887            period_start (str/datetime): Start date for analysis period
1888            period_end (str/datetime): End date for analysis period
1890
1891        Returns:
1892            None (saves results to CSV file)
1893        """
1894        # Load and prepare ad-hoc downtime data
1895        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1896        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1897        # Load rota and callsign lookup data
1898        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1899        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1900        # Merge rota with callsign lookup to get registration numbers to allow matching with
1901        # ad-hoc data, which uses registrations
1902        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1903
1904        # Define the hour bands mapping
1905        HOUR_BANDS = {
1906            '00-05': (0, 6),   # 00:00 to 05:59
1907            '06-11': (6, 12),  # 06:00 to 11:59
1908            '12-17': (12, 18), # 12:00 to 17:59
1909            '18-23': (18, 24)  # 18:00 to 23:59
1910        }
1911
1912        # Create list of 6-hour bins
1913        bins = ['00-05', '06-11', '12-17', '18-23']
1914
1915        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1916            """
1917            Determine if a date falls in summer months (April-September).
1918
1919            Args:
1920                date_obj: Date object to check
1921
1922            Returns:
1923                bool: True if date is in summer months
1924            """
1925            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1926
1927        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1928            """
1929            Determine whether a given month falls in the summer season (April-September by default).
1930
1931            Args:
1932                month: Integer month (1-12)
1933
1934            Returns:
1935                str: 'summer' if the month is in the summer months, 'winter' otherwise
1936            """
1937            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1938
1939        def get_band_for_hour(hour):
1940            """
1941            Return the 6-hour band name for a given hour (0-23).
1942
1943            Args:
1944                hour (int): Hour of day (0-23)
1945
1946            Returns:
1947                str or None: Band name or None if hour is invalid
1948            """
1949            for band_name, (start, end) in HOUR_BANDS.items():
1950                if start <= hour < end:
1951                    return band_name
1952            return None
1953
1954        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1955            """
1956            Calculate how many minutes of a shift fall within a specific time band.
1957
1958            Args:
1959                shift_start (float): Shift start time in hours (0-23.99)
1960                shift_end (float): Shift end time in hours (0-23.99)
1961                band_start (int): Band start hour
1962                band_end (int): Band end hour
1963
1964            Returns:
1965                int: Minutes of overlap between shift and band
1966            """
1967            # Find the overlap between shift and band
1968            overlap_start = max(shift_start, band_start)
1969            overlap_end = min(shift_end, band_end)
1970
1971            if overlap_start < overlap_end:
1972                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1973            return 0
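        # For example (hypothetical shift), calculate_minutes_in_band(7, 19, 6, 12) returns 300:
        # a 07:00-19:00 shift overlaps the 06:00-11:59 band by five hours (300 minutes)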
1974
1975        # Create date range for analysis period
1976        date_range = pd.date_range(start=period_start,
1977                            end=period_end,
1978                            freq='D')
1979        daily_df = pd.DataFrame({'date': date_range})
1980        daily_df['date'] = pd.to_datetime(daily_df['date'])
1981
1982        # Create MultiIndex from date and bins combinations to get all date/time combinations
1983        multi_index = pd.MultiIndex.from_product(
1984            [daily_df['date'], bins],
1985            names=['date', 'six_hour_bin']
1986            )
1987
1988        # Convert MultiIndex to DataFrame and reset index
1989        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1990        # Add quarter information for seasonal analysis
1991        daily_df["quarter"] = daily_df.date.dt.quarter
1992
1993        # Initialize availability columns for each aircraft registration
1994        # Each column will store minutes of scheduled time per time band
1995        for registration in full_rota_df['registration'].unique():
1996            daily_df[registration] = 0 # Initialize with 0 minutes
1997
1998        # Calculate scheduled availability for each date/time band combination
1999        for row_idx, row in daily_df.iterrows():
2000            current_date = row['date']
2001            current_band = row['six_hour_bin']
2002            band_start, band_end = HOUR_BANDS[current_band]
2003
2004            is_current_date_summer = is_summer(current_date)
2005
2006            # row_idx (the index label from iterrows) identifies this date/band row for the updates below
2009
2010            # Iterate through each resource's rota entry
2011            for _, rota_entry in full_rota_df.iterrows():
2012                registration = rota_entry['registration']
2013                # Select appropriate start/end times based on season
2014                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2015                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2016                start_hour = rota_entry[start_hour_col]
2017                end_hour = rota_entry[end_hour_col]
2018
2019                total_minutes_for_band = 0
2020
2021                if start_hour < end_hour:
2022                    # Shift within same day
2023                    total_minutes_for_band = calculate_minutes_in_band(
2024                        start_hour, end_hour, band_start, band_end
2025                    )
2026                elif start_hour > end_hour:
2027                    # Shift spans midnight - check both parts
2028
2029                    # Part 1: Today from start_hour to midnight
2030                    if band_end <= 24:  # This band is today
2031                        total_minutes_for_band += calculate_minutes_in_band(
2032                            start_hour, 24, band_start, band_end
2033                        )
2034
2035                    # Part 2: Tomorrow from midnight to end_hour
2036                    # Need to check if this band is for tomorrow
2037                    tomorrow = current_date + pd.Timedelta(days=1)
2038                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2039
2040                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2041                        total_minutes_for_band += calculate_minutes_in_band(
2042                            0, end_hour, band_start, band_end
2043                        )
2044
2045                # Update the scheduled time for this aircraft in this time band
2046                daily_df.loc[row_idx, registration] += total_minutes_for_band
2047
2048        # Aggregate scheduled availability by quarter, time band, and registration
2049        available_time = (
2050            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2051            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2052            .sum().reset_index()
2053            )
2054
2055        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2056            """
2057            Calculate downtime overlap with scheduled rota for a single downtime event.
2058
2059            Args:
2060                row: DataFrame row containing downtime information
2061                rota_df: DataFrame with rota schedules
2062                callsign_lookup_df: DataFrame mapping callsigns to registrations
2063
2064            Returns:
2065                dict: Dictionary containing processed availability data
2066            """
2067            # Extract downtime information
2068            registration = row['aircraft'].lower()
2069            downtime_start = pd.to_datetime(row['offline'], utc=True)
2070            downtime_end = pd.to_datetime(row['online'], utc=True)
2071            reason = row.get('reason', None)
2072
2073            # Determine which 6-hour bin this downtime starts in
2074            hour = downtime_start.hour
2075            if 0 <= hour <= 5:
2076                six_hour_bin = '00-05'
2077            elif 6 <= hour <= 11:
2078                six_hour_bin = '06-11'
2079            elif 12 <= hour <= 17:
2080                six_hour_bin = '12-17'
2081            else:
2082                six_hour_bin = '18-23'
2083
2084            quarter = downtime_start.quarter
2085
2086            # Find the callsign for this registration
2087            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2088
2089            # No matching callsign found
2090            if match.empty:
2091                return {
2092                    'registration': registration,
2093                    'offline': downtime_start,
2094                    'online': downtime_end,
2095                    'six_hour_bin': six_hour_bin,
2096                    'quarter': quarter,
2097                    'total_offline': None,
2098                    'reason': reason
2099                }
2100
2101            callsign = match.iloc[0]['callsign']
2102            # Find rota entries for this callsign
2103            rota_rows = rota_df[rota_df['callsign'] == callsign]
2104            if rota_rows.empty:
2105                return {
2106                    'registration': registration,
2107                    'offline': downtime_start,
2108                    'online': downtime_end,
2109                    'six_hour_bin': six_hour_bin,
2110                    'quarter': quarter,
2111                    'total_offline': None,
2112                    'reason': reason
2113                }
2114
2115            # Determine season for appropriate rota times
2116            month = downtime_start.month
2117            season = check_month_is_summer(month)
2118
2119            total_overlap_minutes = 0
2120
2121            # Calculate overlap between downtime and scheduled rota times
2122            for _, rota in rota_rows.iterrows():
2123                start_hour = rota[f'{season}_start']
2124                end_hour = rota[f'{season}_end']
2125
2126                # Check overlap across multiple days (yesterday, today, tomorrow)
2127                # This handles shifts that span midnight
2128                for base_day in [downtime_start.normalize() - timedelta(days=1),
2129                                downtime_start.normalize(),
2130                                downtime_start.normalize() + timedelta(days=1)]:
2131
2132                    rota_start = base_day + timedelta(hours=start_hour)
2133                    rota_end = base_day + timedelta(hours=end_hour)
2134
2135                    # Handle shifts that cross midnight
2136                    if end_hour <= start_hour:
2137                        rota_end += timedelta(days=1)
2138                    # Calculate overlap between downtime and this rota period
2139                    overlap_start = max(downtime_start, rota_start)
2140                    overlap_end = min(downtime_end, rota_end)
2141
2142                    if overlap_end > overlap_start:
2143                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2144
2145                        total_overlap_minutes += overlap_minutes
2146
2147            return {
2148                'registration': registration,
2149                'offline': downtime_start,
2150                'online': downtime_end,
2151                'six_hour_bin': six_hour_bin,
2152                'quarter': quarter,
2153                'total_offline': total_overlap_minutes,
2154                'reason': reason
2155            }
2156
2157        # Process all ad-hoc downtime events
2158        results = adhoc_df.apply(
2159                        lambda row: calculate_availability_row(
2160                            row, rota_df, callsign_lookup_df),
2161                        axis=1
2162                    )
2163
2164        # Convert results to DataFrame and select relevant columns
2165        unavailability_minutes_df = (
2166            pd.DataFrame(results.tolist())
2167            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2168            )
2169
2170        # Aggregate offline time by registration, time band, quarter, and reason
2171        offline_durations = unavailability_minutes_df.groupby(
2172            ["registration", "six_hour_bin", "quarter", "reason"]
2173            )[["total_offline"]].sum().reset_index()
2174
2175        # Create complete combinations to ensure all possible categories are represented
2176        # This prevents missing data issues in the final output
2177        registrations = offline_durations['registration'].unique()
2178        six_hour_bins = offline_durations['six_hour_bin'].unique()
2179        quarters = offline_durations['quarter'].unique()
2180        reasons = offline_durations['reason'].unique()
2181
2182        # Generate all possible combinations
2183        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2184
2185        # Create complete dataframe with all combinations
2186        complete_df = pd.DataFrame(
2187            all_combinations,
2188            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2189            )
2190
2191        # Merge with original data to preserve existing values, fill missing with 0
2192        offline_durations = complete_df.merge(
2193            offline_durations,
2194            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2195            how='left'
2196            )
2197
2198        # Fill NaN values with 0 (no downtime for those combinations)
2199        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2200
2201        # Sort for better readability
2202        offline_durations = offline_durations.sort_values(
2203            ['registration', 'six_hour_bin', 'quarter', 'reason']
2204            ).reset_index(drop=True)
2205
2206        # Ensure consistent case for registration names
2207        available_time["registration"] = available_time["registration"].str.lower()
2208        offline_durations["registration"] = offline_durations["registration"].str.lower()
2209
2210        # Merge scheduled time with downtime data
2211        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2212        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2213
2214        # Calculate availability probability (1 - sum of all unavailability probabilities)
2215        available_prop_df = ad_hoc.groupby(
2216            ["quarter", "six_hour_bin", "registration", "rota_time"]
2217            )[["probability"]].sum().reset_index()
2218
2219        available_prop_df["reason"] = "available"
2220        available_prop_df["probability"] = 1 - available_prop_df["probability"]
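        # Worked example with hypothetical figures: if a registration is rotad for 10,000 minutes in a
        # given quarter/six-hour bin, with 500 minutes offline for 'weather' and 250 minutes for
        # 'aircraft', the unavailability probabilities are 0.05 and 0.025, and the 'available'
        # probability is 1 - (0.05 + 0.025) = 0.925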
2221
2222        # Combine unavailability and availability data
2223        final_ad_hoc_df = (
2224            pd.concat([ad_hoc, available_prop_df])
2225            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2226            )
2227
2228        # Handle cases where there's no scheduled time (set probability to NaN)
2229        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2230            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2231            axis=1
2232            )
2233
2234        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2235
2236        # Save results to CSV
2237        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2238            "distribution_data/ad_hoc_unavailability.csv", index=False
2239            )
2240
2241
2242    def run_sim_on_historical_params(self):
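        """
            Runs the simulation using the historic rota, callsign lookup and servicing files from
            'tests/rotas_historic/' (copied into 'actual_data/'), then summarises the results
            (care category sent, vehicle type sent and missed jobs) into CSVs under
            'historical_data/calculated/' for comparison against the historical data
        """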
2243        # Ensure all rotas are using default values
2244        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2245        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2246
2247        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2248        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2249
2250        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2251        service_history.to_csv("actual_data/service_history.csv", index=False)
2252
2253        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2254        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2255
2256        print("Generating simulation results...")
2257        removeExistingResults()
2258
2259        total_runs = 30
2260        sim_years = 2
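        # sim_duration is expressed in minutes: 60 minutes x 24 hours x 7 days x 52 weeks x sim_years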
2261        sim_duration = 60 * 24 * 7 * 52 * sim_years
2262
2263        parallelProcessJoblib(
2264            total_runs=total_runs,
2265            sim_duration=sim_duration,
2266            warm_up_time=0,
2267            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2268            amb_data=False,
2269            print_debug_messages=True
2270        )
2271
2272        collateRunResults()
2273
2274        try:
2275            results_all_runs = pd.read_csv("data/run_results.csv")
2276            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2277
2278            # save data of counts of suboptimal care category sent
2279            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2280
2281            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2282
2283            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2284
2285            # save data of counts of suboptimal vehicle type sent
2286            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2287
2288            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2289
2290            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2291
2292
2293            # # Also run the model to get some base-case outputs
2294            # resource_requests = (
2295            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2296            #     .copy()
2297            #     )
2298
2299            # resource_requests["care_cat"] = (
2300            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2301            #                             and x["care_cat"]=="REG" else x["care_cat"],
2302            #                             axis=1)
2303            #                             )
2304
2305            # missed_jobs_care_cat_summary = (
2306            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2307            #     .sort_values(["care_cat", "time_type"])
2308            #     .copy()
2309            #     )
2310
2311            # missed_jobs_care_cat_summary["jobs_average"] = (
2312            #     missed_jobs_care_cat_summary["jobs"]/
2313            #     total_runs
2314            #     )
2315
2316            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2317            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2318            #     ).round(0)
2319
2320            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2321                    results_all_runs=results_all_runs,
2322                    run_length_days = float(sim_years*365),
2323                    what="summary"
2324                    )
2325
2326            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2327
2328
2329            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2330                    results_all_runs=results_all_runs,
2331                    run_length_days = float(sim_years*365),
2332                    what="breakdown"
2333                    )
2334
2335            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2336
2337        except FileNotFoundError:
2338            pass
2339
2340if __name__ == "__main__":
2341    from distribution_fit_utils import DistributionFitUtils
2342    test = DistributionFitUtils('external_data/clean_daa_import_missing_2023_2024.csv',
2343                                calculate_school_holidays=True)
2344    #test = DistributionFitUtils('external_data/clean_daa_import.csv')
2345    test.import_and_wrangle()
2346    test.run_sim_on_historical_params()
2347
2348# Testing ----------
2349# python distribution_fit_utils.py
class DistributionFitUtils:
  22class DistributionFitUtils():
  23    """
  24        # The DistributionFitUtils classa
  25
  26        This class will import a CSV, undertake some light
  27        wrangling and then determine distributions and probabilities required
  28        for the Discrete Event Simulation
  29
  30        example usage:
  31            my_data = DistributionFitUtils('data/my_data.csv')
  32            my_data.import_and_wrangle()
  33
  34    """
  35
  36    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
  37
  38        self.file_path = file_path
  39        self.df = pd.DataFrame()
  40
  41        # The number of additional years of school holidays
  42        # that will be calculated beyond the maximum date in the provided dataset
  43        self.school_holidays_years = school_holidays_years
  44        self.calculate_school_holidays = calculate_school_holidays
  45
  46        self.times_to_fit = [
  47            {"hems_result": "Patient Treated but not conveyed by HEMS",
  48            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  49            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  50            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  51            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
  52            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  53        ]
  54
  55        self.sim_tools_distr_plus = [
  56            "poisson",
  57            "bernoulli",
  58            "triang",
  59            "erlang",
  60            "weibull_min",
  61            "expon_weib",
  62            "betabinom",
  63            "pearson3",
  64            "cauchy",
  65            "chi2",
  66            "expon",
  67            "exponpow",
  68            "gamma",
  69            "lognorm",
  70            "norm",
  71            "powerlaw",
  72            "rayleigh",
  73            "uniform",
  74            "neg_binomial",
  75            "zip"
  76        ]
  77        # SR 16-04-2025 Have hardcoded the common distributions
  78        # to make setup for random number generation more robust
  79        #+ get_common_distributions()
  80
  81    def removeExistingResults(self, folder: str) -> None:
  82            """
  83                Removes results from previous fitting
  84            """
  85
  86            matching_files = glob.glob(os.path.join(folder, "*.*"))
  87
  88            print(matching_files)
  89
  90            for file in matching_files:
  91                os.remove(file)
  92
  93
  94    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
  95        """
  96
  97            Convenience function for Fitter.
  98            Returns the model and parameters that are considered
  99            the 'best fit'.
 100
 101            TODO: Determine how Fitter works this out
 102
 103        """
 104
 105        if(q_times.size > 0):
 106            if(len(distr) > 0):
 107                f = Fitter(q_times, timeout=60, distributions=distr)
 108            else:
 109                f = Fitter(q_times, timeout=60)
 110            f.fit()
 111            if show_summary:
 112                f.summary()
 113            return f.get_best()
 114        else:
 115            return {}
 116
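    # --- Illustrative note (editorial addition, not part of the original source) ---
    # getBestFit() is a thin convenience wrapper around fitter.Fitter. In recent
    # versions of fitter, get_best() returns a one-entry dict mapping the top-ranked
    # distribution name to its fitted parameters, so a call might look like this
    # (values made up purely for illustration):
    #
    #   best = self.getBestFit(np.array([4.0, 6.5, 7.2, 9.1, 11.3]),
    #                          distr=["gamma", "norm"])
    #   # e.g. -> {"gamma": {"a": 2.1, "loc": 0.0, "scale": 3.4}}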
 117
 118    def import_and_wrangle(self):
 119        """
 120
 121            Function to import CSV, add additional columns that are required
 122            and then sequentially execute other class functions to generate
 123            the probabilities and distributions required for the DES.
 124
 125            TODO: Additional logic is required to check the imported CSV
 126            for missing values, incorrect column names, etc.
 127
 128        """
 129
 130        try:
 131            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
 132            self.df = df
 133
 134            # Perhaps run some kind of checking function here.
 135
 136        except FileNotFoundError:
 137            print(f"Cannot locate file: {self.file_path}")
 138            return
 139        # If everything is okay, crack on...
 140        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 141        self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date)
 142        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
 143        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
 144        self.df['month'] = self.df['inc_date'].dt.month
 145        self.df['quarter'] = self.df['inc_date'].dt.quarter
 146        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
 147
 148        # Replacing an upper quartile limit on job cycle times and
 149        # instead using a manually specified time frame.
 150        # This has the advantage of allowing manual amendment of the values
 151        # on the front-end
 152        # self.max_values_df = self.upper_allowable_time_bounds()
 153        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
 154        #print(self.min_max_values_df)
 155
 156        #This will be needed for other datasets, but has already been computed for DAA
 157        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
 158
 159        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
 160        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
 161
 162        #get proportions of AMPDS card by hour of day
 163        self.hour_by_ampds_card_probs()
 164
 165        # Determine 'best' distributions for time-based data
 166        self.activity_time_distributions()
 167
 168        # Calculate probability patient will be female based on AMPDS card
 169        self.sex_by_ampds_card_probs()
 170
 171        # Determine 'best' distributions for age ranges stratified by AMPDS card
 172        self.age_distributions()
 173
 174        # Alternative approach to IA times. Start with the probability of a call at a given hour, stratified by quarter
 175        self.hourly_arrival_by_qtr_probs()
 176
 177        # Calculates the mean and standard deviation of the number of incidents per day stratified by quarter
 178        self.incidents_per_day()
 179        self.incidents_per_day_samples()
 180
 181        # Calculate probability of enhanced or critical care being required based on AMPDS card
 182        self.enhanced_or_critical_care_by_ampds_card_probs()
 183
 184        # Calculate HEMS result
 185        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
 186
 187        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
 188        # self.callsign_group_by_ampds_card_and_hour_probs()
 189        # self.callsign_group_by_ampds_card_probs()
 190        # self.callsign_group_by_care_category()
 191        self.callsign_group()
 192
 193        # Calculate probability of a particular vehicle type based on callsign group and month of year
 194        # self.vehicle_type_by_month_probs()
 195        self.vehicle_type_by_quarter_probs()
 196        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
 197
 198        # Calculate the patient outcome (conveyed, deceased, unknown)
 199        self.patient_outcome_by_care_category_and_quarter_probs()
 200
 201        # ============= ARCHIVED CODE ================= #
 202        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
 203        # self.inter_arrival_times()
 204        # ============= END ARCHIVED CODE ================= #
 205
 206
 207        # ============= ARCHIVED CODE ================= #
 208        # Calculate probability of patient outcome
 209        # Note - this still needs to be run to support another one?
 210        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
 211        # self.pt_outcome_by_hems_result_and_care_category_probs()
 212        # ============= END ARCHIVED CODE ================= #
 213
 214        # ============= ARCHIVED CODE ================= #
 215        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
 216        # ============= END ARCHIVED CODE ================= #
 217
 218        # ============= ARCHIVED CODE ================= #
 219        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
 220        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
 221        # ============= END ARCHIVED CODE ================= #
 222
 223        # ============= ARCHIVED CODE ================= #
 224        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
 225        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
 226        # ============= END ARCHIVED CODE ================= #
 227
 228
 229        # Calculate school holidays since servicing schedules typically avoid these dates
 230        if self.calculate_school_holidays:
 231            self.school_holidays()
 232
 233        # Calculate historical data
 234        self.historical_monthly_totals()
 235        self.historical_monthly_totals_by_callsign()
 236        self.historical_monthly_totals_by_day_of_week()
 237        self.historical_median_time_of_activities_by_month_and_resource_type()
 238        self.historical_monthly_totals_by_hour_of_day()
 239        self.historical_monthly_resource_utilisation()
 240        self.historical_monthly_totals_all_calls()
 241        self.historical_daily_calls_breakdown()
 242        self.historical_job_durations_breakdown()
 243        self.historical_missed_jobs()
 244        self.historical_jobs_per_day_per_callsign()
 245        self.historical_care_cat_counts()
 246
 247        # Calculate proportions of ad hoc unavailability
 248        try:
 249            # self.ad_hoc_unavailability()
 250            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
 251        except FileNotFoundError:
 252            print("Couldn't find ad-hoc unavailability file")
 253
 254    def hour_by_ampds_card_probs(self):
 255        """
 256
 257            Calculates the proportions of calls that are triaged with
 258            a specific AMPDS card. This is stratified by hour of day
 259
 260            TODO: Determine whether this should also be stratified by yearly quarter
 261
 262        """
 263        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
 264        total_counts = category_counts.groupby('hour')['count'].transform('sum')
 265        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
 266
 267        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
 268
 269        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")
 270
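    # --- Illustrative sketch (editorial addition) ---
    # The proportion columns used throughout this class follow the same pattern:
    # count within the fine-grained grouping, then divide by a transform('sum')
    # over the coarser grouping. A tiny, hypothetical example:
    #
    #   tiny = pd.DataFrame({"hour": [0, 0, 0, 1],
    #                        "ampds_card": ["17", "17", "29", "17"]})
    #   counts = tiny.groupby(["hour", "ampds_card"]).size().reset_index(name="count")
    #   counts["proportion"] = counts["count"] / counts.groupby("hour")["count"].transform("sum")
    #   # hour 0 -> card 17: 0.667, card 29: 0.333; hour 1 -> card 17: 1.0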
 271
 272    def sex_by_ampds_card_probs(self):
 273        """
 274
 275            Calculates the probability that the patient will be female
 276            stratified by AMPDS card.
 277
 278        """
 279        age_df = self.df
 280        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
 281        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
 282        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
 283
 284        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
 285
 286    def activity_time_distributions(self):
 287        """
 288
 289            Determine the 'best' distribution for each phase of a call
 290            i.e. Allocation time, Mobilisation time, Time to scene
 291            Time on scene, Travel time to hospital and handover, Time to clear.
 292            Not all times will apply to all cases, so the class 'times_to_fit'
 293            variable is a list of dictionaries, which contains the times to fit
 294
 295            The data is currently stratified by the HEMS_result and vehicle type fields.
 296
 297        """
 298
 299        vehicle_type = self.df['vehicle_type'].dropna().unique()
 300
 301        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
 302        # Probably easier than complicated logic to determine what times should be available based on hems_result
 303
 304        final_distr = []
 305
 306        for row in self.times_to_fit:
 307            #print(row)
 308            for ttf in row['times_to_fit']:
 309                for vt in vehicle_type:
 310                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
 311
 312                    # This line might not be required if data quality is determined when importing the data
 313                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
 314                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
 315
 316                    #print(f"Max time is {max_time} and Min time is {min_time}")
 317
 318                    if ttf == 'time_on_scene':
 319                        # There is virtually no data for HEMS_result other than patient conveyed
 320                        # which is causing issues with fitting. For time on scene, will
 321                        # use a simplified fitting ignoring hems_result as a category
 322                        fit_times = self.df[
 323                            (self.df.vehicle_type == vt) &
 324                            (self.df[ttf] >= min_time) &
 325                            (self.df[ttf] <= max_time)
 326                        ][ttf]
 327                    else:
 328                        fit_times = self.df[
 329                            (self.df.vehicle_type == vt) &
 330                            (self.df[ttf] >= min_time) &
 331                            (self.df[ttf] <= max_time) &
 332                            (self.df.hems_result == row['hems_result'])
 333                        ][ttf]
 334                    #print(fit_times[:10])
 335                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
 336                    #print(best_fit)
 337
 338                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
 339                    #print(return_dict)
 340                    final_distr.append(return_dict)
 341
 342        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
 343            convert_file.write(json.dumps(final_distr))
 345
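    # --- Illustrative note (editorial addition) ---
    # The file written above is a JSON list of dicts, one per
    # (hems_result, time_type, vehicle_type) combination, each shaped like the
    # return_dict built in the loop (parameter values below are invented):
    #
    #   {"vehicle_type": "helicopter", "time_type": "time_to_scene",
    #    "best_fit": {"gamma": {"a": 1.8, "loc": 0.0, "scale": 6.2}},
    #    "hems_result": "Patient Conveyed by HEMS", "n": 412}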
 346
 347    def age_distributions(self):
 348        """
 349
 350            Determine the 'best' distribution for age stratified by
 351            AMPDS card
 352
 353        """
 354
 355        age_distr = []
 356
 357        age_df = self.df[["age", "ampds_card"]].dropna()
 358        ampds_cards = age_df['ampds_card'].unique()
 359        print(ampds_cards)
 360
 361        for card in ampds_cards:
 362            fit_ages = age_df[age_df['ampds_card'] == card]['age']
 363            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
 364            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
 365            age_distr.append(return_dict)
 366
 367        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
 368            convert_file.write(json.dumps(age_distr))
 370
 371
 372    # def inter_arrival_times(self):
 373    #     """
 374
 375    #         Calculate the mean inter-arrival times for patients
 376    #         stratified by hour, and and yearly quarter
 377
 378    #     """
 379
 380    #     ia_df = self.df[['date_only', 'quarter', 'hour']].dropna()
 381
 382    #     count_df = ia_df.groupby(['hour', 'date_only', 'quarter']).size().reset_index(name='n')
 383
 384    #     ia_times_df = (
 385    #         count_df.groupby(['hour', 'quarter'])
 386    #         .agg(
 387    #             # max_arrivals_per_hour=('n', lambda x: round(60 / np.max(x), 3)),
 388    #             # min_arrivals_per_hour=('n', lambda x: round(60 / np.min(x),3)),
 389    #             mean_cases=('n', lambda x: round(x.mean(), 1)),
 390    #             # sd_cases=('n', lambda x: round(x.std(), 3)),
 391    #             mean_iat=('n', lambda x: 60 / x.mean())
 392    #             # n=('n', 'size')
 393    #         )
 394    #         .reset_index()
 395    #     )
 396    #     # Additional column for NSPPThinning
 397    #     ia_times_df['t'] = ia_times_df['hour']
 398    #     ia_times_df['arrival_rate'] = ia_times_df['mean_iat'].apply(lambda x: 1/x)
 399
 400    #     ia_times_df.to_csv('distribution_data/inter_arrival_times.csv', mode='w+')
 401
 402
 403    def incidents_per_day(self):
 404        """
 405        Fit distributions for number of incidents per day using actual daily counts,
 406        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
 407        stratified by season and quarter.
 408        """
 412
 413        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
 414        inc_df['year'] = inc_df['date_only'].dt.year
 415
 416        # Daily incident counts
 417        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 418        inc_per_day['year'] = inc_per_day['date_only'].dt.year
 419
 420        # Merge quarter and season from self.df
 421        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
 422
 423        if 'season' not in self.df.columns:
 424            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 425        else:
 426            date_info = date_info.merge(
 427                self.df[['date_only', 'season']].drop_duplicates(),
 428                on='date_only',
 429                how='left'
 430            )
 431
 432        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
 433
 434        # Weight settings - simple row-replication weighting rather than a weighted-mean approach
 435        year_weights = {
 436            2023: 1.0,
 437            2024: 4.0  # weight 2024 four times as heavily as 2023
 438        }
 439
 440        # ========== SEASONAL DISTRIBUTIONS ==========
 441        jpd_distr = []
 442
 443        for season in inc_per_day['season'].dropna().unique():
 444            filtered = inc_per_day[inc_per_day['season'] == season].copy()
 445            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 446
 447            # Repeat rows proportionally by weight
 448            replicated = filtered.loc[
 449                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 450            ]['jobs_per_day']
 451
 452            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 453
 454            jpd_distr.append({
 455                "season": season,
 456                "best_fit": best_fit,
 457                "min_n_per_day": int(replicated.min()),
 458                "max_n_per_day": int(replicated.max()),
 459                "mean_n_per_day": float(replicated.mean())
 460            })
 461
 462        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
 463            json.dump(jpd_distr, f)
 464
 465        # ========== QUARTERLY DISTRIBUTIONS ==========
 466        jpd_qtr_distr = []
 467
 468        for quarter in inc_per_day['quarter'].dropna().unique():
 469            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
 470            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 471
 472            replicated = filtered.loc[
 473                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 474            ]['jobs_per_day']
 475
 476            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 477
 478            jpd_qtr_distr.append({
 479                "quarter": int(quarter),
 480                "best_fit": best_fit,
 481                "min_n_per_day": int(replicated.min()),
 482                "max_n_per_day": int(replicated.max()),
 483                "mean_n_per_day": float(replicated.mean())
 484            })
 485
 486        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
 487            json.dump(jpd_qtr_distr, f)
 488
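    # --- Illustrative sketch (editorial addition) ---
    # The year weighting above works by replicating each day's row in proportion
    # to its weight before fitting: with year_weights = {2023: 1.0, 2024: 4.0}
    # and the x10 scale factor, a 2023 day contributes 10 copies of its count and
    # a 2024 day contributes 40, so 2024 dominates the fitted distribution.
    #
    #   filtered["weight"] = filtered["year"].map(year_weights).fillna(1.0)
    #   replicated = filtered.loc[
    #       filtered.index.repeat((filtered["weight"] * 10).round().astype(int))
    #   ]["jobs_per_day"]
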
 489    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
 490        """
 491            Create weighted empirical samples of incidents per day by season and quarter.
 492        """
 493
 494        inc_df = self.df[['date_only', 'quarter']].copy()
 495        inc_df['year'] = inc_df['date_only'].dt.year
 496        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 497
 498        # Get raw counts per day
 499        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 500        daily_counts['year'] = daily_counts['date_only'].dt.year
 501
 502        # Merge back in season/quarter info
 503        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
 504        if 'season' in self.df.columns:
 505            meta_info = meta_info.merge(
 506                self.df[['date_only', 'season']].drop_duplicates(),
 507                on='date_only', how='left'
 508            )
 509        else:
 510            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 511
 512        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
 513
 514        # Year weight map
 515        if weight_map is None:
 516            weight_map = {2023: 1.0, 2024: 1.1}
 517
 518        # Compute weights
 519        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
 520
 521        # Storage
 522        empirical_samples = {}
 523
 524        # Season-based
 525        for season in daily_counts['season'].dropna().unique():
 526            filtered = daily_counts[daily_counts['season'] == season].copy()
 527            repeated = filtered.loc[
 528                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 529            ]['jobs_per_day'].tolist()
 530
 531            empirical_samples[season] = repeated
 532
 533        # Quarter-based
 534        for quarter in daily_counts['quarter'].dropna().unique():
 535            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
 536            repeated = filtered.loc[
 537                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 538            ]['jobs_per_day'].tolist()
 539
 540            empirical_samples[f"Q{int(quarter)}"] = repeated
 541
 542        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
 543            json.dump(empirical_samples, f)
 544
 545
 546    def enhanced_or_critical_care_by_ampds_card_probs(self):
 547        """
 548
 549            Calculates the probability of an enhanced or critical care resource being required
 550            based on the AMPDS card
 551
 552        """
 553
 554        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
 555
 556        def assign_care_category(row):
 557            # There are some columns with both EC and CC benefit selected
 558            # this function will allocate to only 1
 559            if row['cc_benefit'] == 'y':
 560                return 'CC'
 561            elif row['ec_benefit'] == 'y':
 562                return 'EC'
 563            else:
 564                return 'REG'
 565
 566        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
 567
 568        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
 569        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
 570
 571        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
 572
 573        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)
 574
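    # --- Illustrative note (editorial addition) ---
    # assign_care_category() gives CC precedence over EC, so a record flagged with
    # both benefits is counted once, as CC. Hypothetical rows:
    #
    #   {"cc_benefit": "y", "ec_benefit": "y"}   -> "CC"
    #   {"cc_benefit": None, "ec_benefit": "y"}  -> "EC"
    #   {"cc_benefit": None, "ec_benefit": None} -> "REG"
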
 575    def patient_outcome_by_care_category_and_quarter_probs(self):
 576        """
 577
 578            Calculates the probability of a patient outcome based on care category and yearly quarter
 579
 580        """
 581
 582        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
 583
 584        def assign_care_category(row):
 585            # There are some columns with both EC and CC benefit selected
 586            # this function will allocate to only 1
 587            if row['cc_benefit'] == 'y':
 588                return 'CC'
 589            elif row['ec_benefit'] == 'y':
 590                return 'EC'
 591            else:
 592                return 'REG'
 593
 594        # There are some values that are missing e.g. CC quarter 1 Deceased
 595        # I think we've had problems when trying to sample from this kind of thing before
 596        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 597        outcomes = po_df['pt_outcome'].unique()
 598        care_categories = ['CC', 'EC', 'REG']
 599        quarters = po_df['quarter'].unique()
 600
 601        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
 602                                    columns=['pt_outcome', 'care_category', 'quarter'])
 603
 604        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
 605
 606        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
 607
 608        merged = pd.merge(all_combinations, po_cat_counts,
 609                      on=['pt_outcome', 'care_category', 'quarter'],
 610                      how='left').fillna({'count': 0})
 611        merged['count'] = merged['count'].astype(int)
 612
 613        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
 614        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 615
 616        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)
 617
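    # --- Illustrative sketch (editorial addition) ---
    # Building every (pt_outcome, care_category, quarter) combination up front and
    # left-joining the observed counts guarantees a row (count 0, proportion 0.0)
    # for combinations that never occur historically, which keeps downstream
    # sampling from failing on missing keys. A minimal, hypothetical example:
    #
    #   all_combos = pd.DataFrame(
    #       list(itertools.product(["Deceased"], ["CC"], [1])),
    #       columns=["pt_outcome", "care_category", "quarter"])
    #   # merge(all_combos, observed_counts, how="left").fillna({"count": 0})
    #   # then yields count == 0 for "CC" / quarter 1 / "Deceased" if unseen.
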
 618    # def hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 619    #     """
 620
 621    #         Calculates the probabilty of a given HEMS result based on
 622    #         patient outcome, yearly quarter, vehicle type and callsign group
 623
 624    #     """
 625
 626    #     hr_df = self.df[['hems_result', 'quarter', 'pt_outcome', 'vehicle_type', 'callsign_group']].copy()
 627
 628    #     # There are some values that are missing e.g. CC quarter 1 Deceased
 629    #     # I think we've had problems when trying to sample from this kind of thing before
 630    #     # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 631    #     # hems_results = hr_df['hems_result'].unique()
 632    #     # outcomes = hr_df['pt_outcome'].unique()
 633    #     # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 634    #     # callsign_group_categories = hr_df['callsign_group'].unique()
 635    #     # quarters = hr_df['quarter'].unique()
 636
 637    #     # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 638    #     #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 639
 640    #     hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']).size().reset_index(name='count')
 641
 642    #     # merged = pd.merge(all_combinations, hr_cat_counts,
 643    #     #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 644    #     #               how='left').fillna({'count': 0})
 645    #     # merged['count'] = merged['count'].astype(int)
 646
 647    #     merged = hr_cat_counts
 648
 649    #     total_counts = merged.groupby(['pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])['count'].transform('sum')
 650    #     merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 651
 652    #     merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 653
 654    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 655        """
 656
 657            Calculates the probability of a given HEMS result based on
 658                - patient outcome
 659                - yearly quarter
 660                - time of day (7am - 6pm, 7pm - 6am)
 661                - vehicle type
 662                - and callsign group
 663
 664        """
 665        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 666        self.df['hour'] = self.df['inc_date'].dt.hour
 667        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
 668
 669        hr_df = self.df[[
 670            'hems_result', 'quarter', 'pt_outcome',
 671            'vehicle_type', 'callsign_group', 'time_of_day'
 672            ]].copy()
 673
 674        # There are some values that are missing e.g. CC quarter 1 Deceased
 675        # I think we've had problems when trying to sample from this kind of thing before
 676        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 677        # hems_results = hr_df['hems_result'].unique()
 678        # outcomes = hr_df['pt_outcome'].unique()
 679        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 680        # callsign_group_categories = hr_df['callsign_group'].unique()
 681        # quarters = hr_df['quarter'].unique()
 682
 683        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 684        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 685
 686        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
 687                                       'vehicle_type', 'callsign_group',
 688                                       'quarter', 'time_of_day']).size().reset_index(name='count')
 689
 690        # merged = pd.merge(all_combinations, hr_cat_counts,
 691        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 692        #               how='left').fillna({'count': 0})
 693        # merged['count'] = merged['count'].astype(int)
 694
 695        merged = hr_cat_counts
 696
 697        total_counts = merged.groupby(['pt_outcome',
 698                                       'vehicle_type', 'callsign_group',
 699                                       'quarter', 'time_of_day'])['count'].transform('sum')
 700        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 701
 702        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 703
 704
 705
 706    def hourly_arrival_by_qtr_probs(self):
 707        """
 708
 709            Calculates the proportions of calls arriving in any given hour
 710            stratified by yearly quarter
 711
 712        """
 713
 714        ia_df = self.df[['quarter', 'hour']].dropna()
 715
 716        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
 717        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
 718        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
 719
 720        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")
 721
 722
 723    def callsign_group_by_ampds_card_and_hour_probs(self):
 724        """
 725
 726            Calculates the probability of a specific callsign group being allocated to
 727            a call based on the AMPDS card category and hour of day
 728
 729        """
 730        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
 731
 732        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
 733        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 734
 735        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)
 736
 737    def callsign_group_by_ampds_card_probs(self):
 738        """
 739
 740            Calculates the probability of a specific callsign group being allocated to
 741            a call based on the AMPDS card category
 742
 743        """
 744
 745        callsign_df = self.df[self.df['callsign_group'] != 'Other']
 746
 747        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
 748
 749        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
 750        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 751
 752        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)
 753
 754    def callsign_group(self):
 755        """
 756            Calculates the probability of a specific callsign group being allocated to
 757            a call
 758        """
 759        df = self.df.copy()
 760
 761        # Convert time fields to numeric
 762        time_fields = [
 763            "time_allocation", "time_mobile", "time_to_scene",
 764            "time_on_scene", "time_to_hospital", "time_to_clear"
 765        ]
 766        for col in time_fields:
 767            df[col] = pd.to_numeric(df[col], errors="coerce")
 768
 769        # Calculate total job duration in minutes
 770        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 771
 772        # Compute job start and end times
 773        df["start_time"] = df["inc_date"]
 774        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 775
 776        # Sort jobs by start time
 777        df = df.sort_values(by="start_time").reset_index(drop=True)
 778
 779        # Set to hold indices of jobs that overlap (but only the later-starting ones)
 780        overlapping = set()
 781
 782        # Check for overlaps
 783        for i in range(len(df)):
 784            this_end = df.at[i, "end_time"]
 785
 786            # Compare only to later jobs
 787            for j in range(i + 1, len(df)):
 788                next_start = df.at[j, "start_time"]
 789                if next_start >= this_end:
 790                    break  # No more possible overlaps
 791                # If it starts before i's job ends, it's overlapping
 792                overlapping.add(j)
 793
 794        # Mark the overlaps in the dataframe
 795        df["overlaps"] = df.index.isin(overlapping)
 796
 797        # Filter out overlapping jobs
 798        df_no_overlap = df[~df["overlaps"]]
 799
 800        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 801        # the vehicles to be recorded as offline
 802
 803        data = df_no_overlap.copy()
 804
 805        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 806        # off for servicing if that data is available
 807        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 808        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 809
 810        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 811        data["vehicle"] = data["vehicle"].str.lower()
 812
 813        # Create a cross-join between data and ad_hoc
 814        data['key'] = 1
 815        ad_hoc['key'] = 1
 816        merged = data.merge(ad_hoc, on='key')
 817
 818        # Keep rows where inc_date falls within the offline period
 819        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 820
 821        # Filter out those rows from the original data
 822        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 823
 824        callsign_df = (
 825            df_no_overlap
 826            .assign(
 827                helicopter_benefit=np.select(
 828                    [
 829                        df_no_overlap["cc_benefit"] == "y",
 830                        df_no_overlap["ec_benefit"] == "y",
 831                        df_no_overlap["hems_result"].isin([
 832                            "Stand Down En Route",
 833                            "Landed but no patient contact",
 834                            "Stand Down Before Mobile"
 835                        ])
 836                    ],
 837                    ["y", "y", "n"],
 838                    default=df_no_overlap["helicopter_benefit"]
 839                ),
 840                care_category=np.select(
 841                    [
 842                        df_no_overlap["cc_benefit"] == "y",
 843                        df_no_overlap["ec_benefit"] == "y"
 844                    ],
 845                    ["CC", "EC"],
 846                    default="REG"
 847                )
 848            )
 849        )
 850
 851        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 852
 853        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
 854
 855        total_counts = len(callsign_df)
 856        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 857
 858        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)
 859
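    # --- Illustrative note (editorial addition) ---
    # The overlap scan above sorts jobs by start time and, for each job, only walks
    # forward until it reaches a job that starts after the current job ends, flagging
    # the later-starting member of each overlapping pair. With made-up times:
    #
    #   A: 09:00-10:00, B: 09:30-09:50, C: 10:10-10:40
    #   -> B is flagged (it starts before A ends); C is not, and the inner loop
    #      breaks as soon as C's start time is >= A's end time.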
 860
 861    # def callsign_group_by_care_category(self):
 862    #     """
 863
 864    #         Calculates the probabilty of a specific callsign being allocated to
 865    #         a call based on the care category
 866
 867    #     """
 868    #     df = self.df.copy()
 869
 870    #     # Convert time fields to numeric
 871    #     time_fields = [
 872    #         "time_allocation", "time_mobile", "time_to_scene",
 873    #         "time_on_scene", "time_to_hospital", "time_to_clear"
 874    #     ]
 875    #     for col in time_fields:
 876    #         df[col] = pd.to_numeric(df[col], errors="coerce")
 877
 878    #     # Calculate total job duration in minutes
 879    #     df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 880
 881    #     # Compute job start and end times
 882    #     df["start_time"] = df["inc_date"]
 883    #     df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 884
 885    #     # Sort jobs by start time
 886    #     df = df.sort_values(by="start_time").reset_index(drop=True)
 887
 888    #     # Set to hold indices of jobs that overlap (but only the later-starting ones)
 889    #     overlapping = set()
 890
 891    #     # Check for overlaps
 892    #     for i in range(len(df)):
 893    #         this_end = df.at[i, "end_time"]
 894
 895    #         # Compare only to later jobs
 896    #         for j in range(i + 1, len(df)):
 897    #             next_start = df.at[j, "start_time"]
 898    #             if next_start >= this_end:
 899    #                 break  # No more possible overlaps
 900    #             # If it starts before i's job ends, it's overlapping
 901    #             overlapping.add(j)
 902
 903    #     # Mark the overlaps in the dataframe
 904    #     df["overlaps"] = df.index.isin(overlapping)
 905
 906    #     # Filter out overlapping jobs
 907    #     df_no_overlap = df[~df["overlaps"]]
 908
 909
 910    #     callsign_df = (
 911    #         df_no_overlap
 912    #         .assign(
 913    #             helicopter_benefit=np.select(
 914    #                 [
 915    #                     df_no_overlap["cc_benefit"] == "y",
 916    #                     df_no_overlap["ec_benefit"] == "y",
 917    #                     df_no_overlap["hems_result"].isin([
 918    #                         "Stand Down En Route",
 919    #                         "Landed but no patient contact",
 920    #                         "Stand Down Before Mobile"
 921    #                     ])
 922    #                 ],
 923    #                 ["y", "y", "n"],
 924    #                 default=df_no_overlap["helicopter_benefit"]
 925    #             ),
 926    #             care_category=np.select(
 927    #                 [
 928    #                     df_no_overlap["cc_benefit"] == "y",
 929    #                     df_no_overlap["ec_benefit"] == "y"
 930    #                 ],
 931    #                 ["CC", "EC"],
 932    #                 default="REG"
 933    #             )
 934    #         )
 935    #     )
 936
 937    #     callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 938
 939    #     callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count')
 940
 941    #     total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum')
 942    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 943
 944    #     callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode = "w+", index=False)
 945
 946    #========== ARCHIVED CODE ============ #
 947    # def vehicle_type_by_month_probs(self):
 948    #     """
 949
 950    #         Calculates the probabilty of a car/helicopter being allocated to
 951    #         a call based on the callsign group and month of the year
 952
 953    #     """
 954    #     callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count')
 955
 956    #     total_counts = callsign_counts.groupby(['callsign_group', 'month'])['count'].transform('sum')
 957    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 958
 959    #     callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode = "w+")
 960    #========== END ARCHIVED CODE ============ #
 961    def vehicle_type_by_quarter_probs(self):
 962        """
 963
 964            Calculates the probability of a car/helicopter being allocated to
 965            a call based on the callsign group and quarter of the year
 966
 967            Quarter accounts for seasonal variation without being as affected by
 968            month-to-month noise and small sample sizes as a monthly stratification
 969        """
 970        data = self.df.copy()
 971
 972        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 973        # the vehicles to be recorded as offline
 974
 975        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 976        # off for servicing if that data is available
 977        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 978        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 979
 980        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 981        data["vehicle"] = data["vehicle"].str.lower()
 982
 983        # Create a cross-join between data and ad_hoc
 984        data['key'] = 1
 985        ad_hoc['key'] = 1
 986        merged = data.merge(ad_hoc, on='key')
 987
 988        # Keep rows where inc_date falls within the offline period
 989        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 990
 991        # Filter out those rows from the original data
 992        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 993
 994        # First, calculate overall props
 995        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 996
 997        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 998        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 999
1000        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1001
1002        # Then, redo by quarter
1003        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1004
1005        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1006        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1007
1008        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")
1009
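    # --- Illustrative note (editorial addition) ---
    # The ad-hoc filtering above uses a cross join (the constant 'key' column) so
    # that every incident is compared against every offline window; an incident
    # whose inc_date falls inside any [offline, online] interval is dropped. For
    # example, an incident at 2023-06-01 10:00 matched against a window of
    # 2023-06-01 08:00 to 12:00 ends up in `overlap` and is excluded from
    # `filtered_data`. (On pandas >= 1.2 the same cross join could be written
    # as merge(how="cross").)
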
1010    def vehicle_type_probs(self):
1011        """
1012
1013            Calculates the probability of a car/helicopter being allocated to
1014            a call based on the callsign group
1015
1016        """
1017
1018        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1019
1020        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1021        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1022
1023        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1024
1025    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1026        """
1027
1028            Calculates the probability of a specific HEMS result being allocated to
1029            a call based on the callsign group and vehicle type
1030
1031            TODO: These probability calculation functions could probably be refactored into a single
1032            function and just specify columns and output name
1033
1034        """
1035        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1036
1037        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1038        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1039
1040        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)
1041
1042
1043    #========== ARCHIVED CODE ============ #
1044    # def hems_result_by_care_cat_and_helicopter_benefit_probs(self):
1045    #     """
1046
1047    #         Calculates the probabilty of a specific HEMS result being allocated to
1048    #         a call based on the care category amd whether a helicopter is beneficial
1049
1050    #     """
1051
1052    #     # Wrangle the data...trying numpy for a change
1053
1054    #     hems_df = (
1055    #         self.df
1056    #         .assign(
1057    #             helicopter_benefit=np.select(
1058    #                 [
1059    #                     self.df["cc_benefit"] == "y",
1060    #                     self.df["ec_benefit"] == "y",
1061    #                     self.df["hems_result"].isin([
1062    #                         "Stand Down En Route",
1063    #                         "Landed but no patient contact",
1064    #                         "Stand Down Before Mobile"
1065    #                     ])
1066    #                 ],
1067    #                 ["y", "y", "n"],
1068    #                 default=self.df["helicopter_benefit"]
1069    #             ),
1070    #             care_cat=np.select(
1071    #                 [
1072    #                     self.df["cc_benefit"] == "y",
1073    #                     self.df["ec_benefit"] == "y"
1074    #                 ],
1075    #                 ["CC", "EC"],
1076    #                 default="REG"
1077    #             )
1078    #         )
1079    #     )
1080
1081    #     hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count')
1082
1083    #     hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum')
1084    #     hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4)
1085
1086    #     hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode = "w+", index=False)
1087    #========== END ARCHIVED CODE ============ #
1088
1089    #========== ARCHIVED CODE ============ #
1090    # def pt_outcome_by_hems_result_and_care_category_probs(self):
1091    #     """
1092
1093    #         Calculates the probabilty of a specific patient outcome based on HEMS result
1094
1095    #     """
1096
1097    #     hems_df = (
1098    #         self.df
1099    #         .assign(
1100    #             helicopter_benefit=np.select(
1101    #                 [
1102    #                     self.df["cc_benefit"] == "y",
1103    #                     self.df["ec_benefit"] == "y",
1104    #                     self.df["hems_result"].isin([
1105    #                         "Stand Down En Route",
1106    #                         "Landed but no patient contact",
1107    #                         "Stand Down Before Mobile"
1108    #                     ])
1109    #                 ],
1110    #                 ["y", "y", "n"],
1111    #                 default=self.df["helicopter_benefit"]
1112    #             ),
1113    #             care_category=np.select(
1114    #                 [
1115    #                     self.df["cc_benefit"] == "y",
1116    #                     self.df["ec_benefit"] == "y"
1117    #                 ],
1118    #                 ["CC", "EC"],
1119    #                 default="REG"
1120    #             )
1121    #         )
1122    #     )
1123
1124    #     po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count')
1125
1126    #     po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum')
1127    #     po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4)
1128
1129    #     po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode = "w+")
1130    #========== END ARCHIVED CODE ============ #
1131
1132    def school_holidays(self) -> None:
1133        """
1134            Function to generate a CSV file containing school holiday
1135            start and end dates for a given year. The year range is determined
1136            by the submitted data (plus self.school_holidays_years additional years for good measure)
1137        """
1138
1139        min_date = self.df.inc_date.min()
1140        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1141
1142        u = Utils()
1143
1144        years_of_holidays_list = u.years_between(min_date, max_date)
1145
1146        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1147
1148        for i, year in enumerate(years_of_holidays_list):
1149            tmp = u.calculate_term_holidays(year)
1150
1151            if i == 0:
1152                sh = tmp
1153            else:
1154                sh = pd.concat([sh, tmp])
1155
1156        sh.to_csv('actual_data/school_holidays.csv', index = False)
1157
1158
1159# The functions below wrangle historical data to provide a comparison against the simulation outputs
1160
1161    def historical_jobs_per_day_per_callsign(self):
1162        df = self.df
1163
1164        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1165        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1166        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1167
1168        all_combinations = pd.DataFrame(
1169            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1170            columns=['date', 'callsign']
1171        ).dropna()
1172
1173        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1174        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
1175
1176        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1177        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
1178
1179    def historical_care_cat_counts(self):
1180        """
1181        Process historical incident data to categorize care types and compute hourly counts.
1182
1183        This method performs the following steps:
1184        - Converts incident dates to datetime format.
1185        - Extracts month start and hour from the incident date.
1186        - Categorizes each incident into care categories based on benefit flags and attendance.
1187        - Counts the number of incidents by hour and care category.
1188        - Outputs these counts to a CSV file.
1189        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1190        (excluding those not attended by a DAA resource) to a text file.
1191
1192        Outputs:
1193        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1194        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1195        """
1196
1197        df_historical = self.df
1198
1199        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1200        # Extract the first day of the month and the hour of each incident
1201        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1202        df_historical['hour'] = df_historical.inc_date.dt.hour
1203
1204        conditions = [
1205            df_historical['cc_benefit'] == 'y',
1206            df_historical['ec_benefit'] == 'y',
1207            df_historical['helicopter_benefit'] == 'y',
1208            df_historical['callsign_group'] == 'Other'
1209        ]
1210
1211        choices = [
1212            'CC',
1213            'EC',
1214            'REG - helicopter benefit',
1215            'Unknown - DAA resource did not attend'
1216        ]
1217        # Assign care category to each record
1218        # If the case did not meet any of the criteria in 'conditions', it will default
1219        # to being labelled as a 'regular/REG' case (i.e. there was no benefit recorded)
1220        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1221
1222        # Count occurrences grouped by hour and care category
1223        historical_value_counts_by_hour = (
1224            df_historical.value_counts(["hour", "care_category"])
1225            .reset_index(name="count")
1226            )
1227        # Output to CSV for use in tests and visualisations
1228        (historical_value_counts_by_hour
1229         .sort_values(['hour', 'care_category'])
1230         .to_csv("historical_data/historical_care_cat_counts.csv"))
1231
1232        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1233        # These are the regular jobs we assume follow different logic because there is an obvious expected
1234        # patient benefit to having a helicopter allocated, which we have to assume is apparent at the time
1235        # the call is placed (such as the casualty being located in a remote location)
1236
1237        numerator = (historical_value_counts_by_hour[
1238            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1239            ]["count"].sum())
1240
1241        denominator = (historical_value_counts_by_hour[
1242            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1243            (historical_value_counts_by_hour["care_category"] == "REG")
1244            ]["count"].sum())
1245
1246        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1247            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1248
1249        # Count occurrences grouped by hour, care category and helicopter benefit
1250        historical_value_counts_by_hour_cc_ec = (
1251            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1252            .reset_index(name="count")
1253            )
1254
1255        # Output to CSV for use in tests and visualisations (currently disabled)
1256        # (historical_value_counts_by_hour_cc_ec
1257        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1258        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1259
1260
1261        numerator_cc = (
1262            historical_value_counts_by_hour_cc_ec[
1263                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1264                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1265                ]["count"].sum())
1266
1267        denominator_cc = (
1268            historical_value_counts_by_hour_cc_ec[
1269                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1270                ]["count"].sum())
1271
1272        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1273            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1274
1275        numerator_ec = (
1276            historical_value_counts_by_hour_cc_ec[
1277                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1278                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1279                ]["count"].sum())
1280
1281        denominator_ec = (
1282            historical_value_counts_by_hour_cc_ec[
1283                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1284                ]["count"].sum())
1285
1286        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1287            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))
1288
1289
1290
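        # Illustrative sketch (toy data, not part of the pipeline) of the precedence relied on in
        # historical_care_cat_counts above: np.select applies the first matching condition, so a job
        # flagged for both cc_benefit and helicopter_benefit is categorised as 'CC', e.g.
        # (the callsign groups below are placeholders):
        #   toy = pd.DataFrame({"cc_benefit": ["y", None, None],
        #                       "ec_benefit": [None, "y", None],
        #                       "helicopter_benefit": ["y", None, "y"],
        #                       "callsign_group": ["H70", "CC70", "CC71"]})
        #   np.select([toy.cc_benefit == "y", toy.ec_benefit == "y",
        #              toy.helicopter_benefit == "y", toy.callsign_group == "Other"],
        #             ['CC', 'EC', 'REG - helicopter benefit', 'Unknown - DAA resource did not attend'],
        #             default='REG')
        #   # -> ['CC', 'EC', 'REG - helicopter benefit']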
1291    def historical_monthly_totals(self):
1292        """
1293            Calculates monthly incident totals from provided dataset of historical data
1294        """
1295
1296        # Multiple resources can be sent to the same job.
1297        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1298            .drop_duplicates(subset="inc_date", keep="first")
1299
1300        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
1301        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1302        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1303
1304        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1305                                stand_down_car=('stand_down_car', 'sum'),
1306                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1307                                total_jobs=('vehicle_type', 'size')
1308                            ).reset_index()
1309
1310        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)
1311
1312    def historical_monthly_totals_all_calls(self):
1313        """
1314            Calculates monthly totals of all calls from the provided dataset of historical data
1315        """
1316
1317        # Multiple resources can be sent to the same job.
1318        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1319
1320        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1321
1322        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)
1323
1324    def historical_monthly_totals_by_callsign(self):
1325        """
1326            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1327        """
1328
1329        # Multiple resources can be sent to the same job.
1330        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1331
1332        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1333
1334        #print(monthly_totals_df.head())
1335
1336        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1337
1338        #print(monthly_totals_pivot_df.head())
1339
1340        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
1341
1342    def historical_monthly_totals_by_hour_of_day(self):
1343        """
1344            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1345        """
1346
1347        # Multiple resources can be sent to the same job.
1348        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1349            .drop_duplicates(subset="inc_date", keep="first")
1350
1351        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1352
1353        #print(monthly_totals_df.head())
1354
1355        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1356
1357        #print(monthly_totals_pivot_df.head())
1358
1359        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)
1360
1361    def historical_monthly_totals_by_day_of_week(self):
1362        """
1363            Calculates number of incidents per month stratified by day of the week
1364        """
1365
1366        # Multiple resources can be sent to the same job.
1367        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1368            .drop_duplicates(subset="inc_date", keep="first")
1369
1370        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1371
1372        #print(monthly_totals_df.head())
1373
1374        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1375
1376        #print(monthly_totals_pivot_df.head())
1377
1378        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)
1379
1380    def historical_median_time_of_activities_by_month_and_resource_type(self):
1381        """
1382            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1383        """
1384
1385        median_df = self.df[['first_day_of_month', 'time_allocation',
1386                             'time_mobile', 'time_to_scene', 'time_on_scene',
1387                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1388
1389        median_df['total_job_time'] = median_df[[
1390            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1391            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1392
1393        # Optionally replace zeros with NaN to exclude them from the median calculation, since if a HEMS
1394        # result is Stood Down en route, time_on_scene would be zero and skew the median (currently disabled)
1395        # median_df.replace(0, np.nan, inplace=True)
1396
1397        # Grouping by month and resource_type, calculating medians
1398        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1399
1400        pivot_data = median_times.pivot_table(
1401            index='first_day_of_month',
1402            columns='vehicle_type',
1403            values=['time_allocation', 'time_mobile', 'time_to_scene',
1404                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1405        )
1406
1407        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
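            # e.g. the MultiIndex column ('time_mobile', 'car') becomes 'median_car_time_mobile' and
            # ('total_job_time', 'helicopter') becomes 'median_helicopter_total_job_time'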
1408        pivot_data = pivot_data.reset_index()
1409
1410        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1411
1412    def historical_monthly_resource_utilisation(self):
1413        """
1414            Calculates number of, and time spent on, incidents per month stratified by callsign
1415        """
1416
1417        # Multiple resources can be sent to the same job.
1418        monthly_df = self.df[[
1419            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1420            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1421            'time_to_clear']].copy()
1422
1423        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1424
1425        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1426            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1427
1428        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1429
1430        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in  monthly_totals_pivot_df.columns]
1431        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1432
1433        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)
1434
1435    def historical_daily_calls_breakdown(self):
1436
1437        df = self.df
1438        # Convert inc_date to date only (remove time)
1439        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1440
1441        # Count number of calls per day
1442        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1443
1444        # Save the daily call counts with a 'day' index column
1445        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1446        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1447        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1448
1449        # Count how many days had the same number of calls
1450        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1451        calls_per_day_summary.columns = ['calls_in_day', 'days']
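            # e.g. a row of (12, 31) would mean that 31 separate days in the historical data each had exactly 12 calls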
1452        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1453
1454    def historical_missed_jobs(self):
1455        df = self.df
1456        df["date"] = pd.to_datetime(df["inc_date"])
1457        df["hour"] = df["date"].dt.hour
1458        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1459        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1460        df["quarter"] = df["inc_date"].dt.quarter
1461
1462        # By month
1463        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1464        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1465
1466        # By hour
1467        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1468        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1469
1470        # By quarter and hour
1471        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1472        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1473
1474    def upper_allowable_time_bounds(self):
1475        """
1476            Calculates the maximum permissible time for each phase of an incident based on supplied historical data.
1477            This is intended to be 1.5x the upper quartile of the data distribution (the function currently only prints the upper quartile)
1478        """
1479
1480        median_df = self.df[[
1481            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1482            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1483
1484        # Replacing zeros with NaN to exclude them from the quantile calculation
1485        # since if a HEMS result is Stood Down en route, then time_on_scene
1486        # would be zero and affect the upper quartile
1487        median_df.replace(0, np.nan, inplace=True)
1488
1489        print(median_df.quantile(.75))
1490        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1491
1492    def historical_job_durations_breakdown(self):
1493
1494        df = self.df
1495
1496        cols = [
1497            'callsign', 'vehicle_type',
1498            'time_allocation', 'time_mobile',
1499            'time_to_scene', 'time_on_scene',
1500            'time_to_hospital', 'time_to_clear'
1501        ]
1502        df2 = df[cols].copy()
1503
1504        # 2. Add a 1-based row identifier
1505        df2['job_identifier'] = range(1, len(df2) + 1)
1506
1507        # 3. Compute total_duration as the row-wise sum of the time columns
1508        time_cols = [
1509            'time_allocation', 'time_mobile',
1510            'time_to_scene', 'time_on_scene',
1511            'time_to_hospital', 'time_to_clear'
1512        ]
1513        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1514
1515        #print(df2.head())
1516
1517        # 4. Pivot (melt) to long format
1518        df_long = df2.melt(
1519            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1520            value_vars=time_cols + ['total_duration'],
1521            var_name='name',
1522            value_name='value'
1523        )
1524
1525        #print(df_long[df_long.job_identifier == 1])
1526
1527        # 5. Drop any rows where callsign or vehicle_type is missing
1528        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1529        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1530
1531        # 6. Write out to CSV
1532        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
1533
1534    # ========== ARCHIVED CODE - v1 of calculate_availability_row ======================== #
1535    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end):
1536    #     """
1537    #     Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1538    #     Returns data tagged with bin, quarter, and downtime reason.
1539    #     """
1540
1541    #     registration = row['aircraft'].lower()
1542    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1543    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1544    #     reason = row.get('reason', None)
1545
1546    #     hour = downtime_start.hour
1547    #     if 0 <= hour <= 5:
1548    #         six_hour_bin = '00-05'
1549    #     elif 6 <= hour <= 11:
1550    #         six_hour_bin = '06-11'
1551    #     elif 12 <= hour <= 17:
1552    #         six_hour_bin = '12-17'
1553    #     else:
1554    #         six_hour_bin = '18-23'
1555
1556    #     quarter = downtime_start.quarter
1557
1558    #     # Match callsign
1559    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1560    #     if match.empty:
1561    #         return {
1562    #             'registration': registration,
1563    #             'offline': downtime_start,
1564    #             'online': downtime_end,
1565    #             'six_hour_bin': six_hour_bin,
1566    #             'quarter': quarter,
1567    #             'total_offline': None,
1568    #             'scheduled_minutes': None,
1569    #             'reason': reason,
1570    #             'proportion': None
1571    #         }
1572    #     callsign = match.iloc[0]['callsign']
1573
1574    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1575    #     if rota_rows.empty:
1576    #         return {
1577    #             'registration': registration,
1578    #             'offline': downtime_start,
1579    #             'online': downtime_end,
1580    #             'six_hour_bin': six_hour_bin,
1581    #             'quarter': quarter,
1582    #             'total_offline': None,
1583    #             'scheduled_minutes': None,
1584    #             'reason': reason,
1585    #             'proportion': None
1586    #         }
1587
1588    #     # Clip evaluation window to downtime period
1589    #     eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True))
1590    #     eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True))
1591
1592    #     total_scheduled_minutes = 0
1593    #     total_overlap_minutes = 0
1594
1595    #     current_day = eval_start
1596    #     while current_day <= eval_end:
1597    #         month = current_day.month
1598    #         season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1599
1600    #         for _, rota in rota_rows.iterrows():
1601    #             start_hour = rota[f'{season}_start']
1602    #             end_hour = rota[f'{season}_end']
1603
1604    #             rota_start = current_day + timedelta(hours=start_hour)
1605    #             rota_end = current_day + timedelta(hours=end_hour)
1606    #             if end_hour <= start_hour:
1607    #                 rota_end += timedelta(days=1)
1608
1609    #             # Count scheduled time regardless of overlap
1610    #             scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1611    #             total_scheduled_minutes += scheduled_minutes
1612
1613    #             # Count overlap only if intersecting with downtime
1614    #             overlap_start = max(downtime_start, rota_start)
1615    #             overlap_end = min(downtime_end, rota_end)
1616    #             if overlap_end > overlap_start:
1617    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1618    #                 total_overlap_minutes += overlap_minutes
1619
1620    #         current_day += timedelta(days=1)
1621
1622    #     if total_scheduled_minutes == 0:
1623    #         proportion = None
1624    #     else:
1625    #         proportion = total_overlap_minutes / total_scheduled_minutes
1626
1627    #     return {
1628    #         'registration': registration,
1629    #         'offline': downtime_start,
1630    #         'online': downtime_end,
1631    #         'six_hour_bin': six_hour_bin,
1632    #         'quarter': quarter,
1633    #         'total_offline': total_overlap_minutes,
1634    #         'scheduled_minutes': total_scheduled_minutes,
1635    #         'reason': reason,
1636    #         'proportion': proportion
1637    #     }
1638
1639
1640    # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= #
1641    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df):
1642    #     """
1643    #         Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1644    #         Returns data tagged with bin, quarter, and downtime reason.
1645    #     """
1646
1647    #     registration = row['aircraft'].lower()
1648    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1649    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1650    #     reason = row.get('reason', None)
1651
1652    #     hour = downtime_start.hour
1653    #     if 0 <= hour <= 5:
1654    #         six_hour_bin = '00-05'
1655    #     elif 6 <= hour <= 11:
1656    #         six_hour_bin = '06-11'
1657    #     elif 12 <= hour <= 17:
1658    #         six_hour_bin = '12-17'
1659    #     else:
1660    #         six_hour_bin = '18-23'
1661
1662    #     quarter = downtime_start.quarter
1663
1664    #     # Match callsign
1665    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1666    #     if match.empty:
1667    #         return {
1668    #             'registration': registration,
1669    #             'offline': downtime_start,
1670    #             'online': downtime_end,
1671    #             'six_hour_bin': six_hour_bin,
1672    #             'quarter': quarter,
1673    #             'total_offline': None,
1674    #             'scheduled_minutes': None,
1675    #             'reason': reason,
1676    #             'proportion': None
1677    #         }
1678    #     callsign = match.iloc[0]['callsign']
1679
1680    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1681    #     if rota_rows.empty:
1682    #         return {
1683    #             'registration': registration,
1684    #             'offline': downtime_start,
1685    #             'online': downtime_end,
1686    #             'six_hour_bin': six_hour_bin,
1687    #             'quarter': quarter,
1688    #             'total_offline': None,
1689    #             'scheduled_minutes': None,
1690    #             'reason': reason,
1691    #             'proportion': None
1692    #         }
1693
1694    #     month = downtime_start.month
1695    #     season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1696
1697    #     total_scheduled_minutes = 0
1698    #     total_overlap_minutes = 0
1699
1700    #     for _, rota in rota_rows.iterrows():
1701    #         start_hour = rota[f'{season}_start']
1702    #         end_hour = rota[f'{season}_end']
1703
1704    #         for base_day in [downtime_start.normalize() - timedelta(days=1),
1705    #                         downtime_start.normalize(),
1706    #                         downtime_start.normalize() + timedelta(days=1)]:
1707
1708    #             rota_start = base_day + timedelta(hours=start_hour)
1709    #             rota_end = base_day + timedelta(hours=end_hour)
1710    #             if end_hour <= start_hour:
1711    #                 rota_end += timedelta(days=1)
1712
1713    #             overlap_start = max(downtime_start, rota_start)
1714    #             overlap_end = min(downtime_end, rota_end)
1715
1716    #             if overlap_end > overlap_start:
1717    #                 scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1718    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1719
1720    #                 total_scheduled_minutes += scheduled_minutes
1721    #                 total_overlap_minutes += overlap_minutes
1722
1723    #     if total_scheduled_minutes == 0:
1724    #         proportion = None
1725    #     else:
1726    #         proportion = total_overlap_minutes / total_scheduled_minutes
1727
1728    #     return {
1729    #         'registration': registration,
1730    #         'offline': downtime_start,
1731    #         'online': downtime_end,
1732    #         'six_hour_bin': six_hour_bin,
1733    #         'quarter': quarter,
1734    #         'total_offline': total_overlap_minutes,
1735    #         'scheduled_minutes': total_scheduled_minutes,
1736    #         'reason': reason,
1737    #         'proportion': proportion
1738    #     }
1739
1740    # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ #
1741    # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False):
1742    #     """Process ad hoc unavailability records into a stratified probability table.
1743
1744    #     Calculates the probability of ad hoc unavailability and availability based
1745    #     on historical data. The data is stratified by aircraft registration,
1746    #     six-hour time bins, and calendar quarters. It ensures all standard
1747    #     reasons ('available', 'crew', 'weather', 'aircraft') are present for
1748    #     each combination. It filters out any registration/quarter/bin pairings
1749    #     with little scheduled time, marking their probabilities as blank. It also adds
1750    #     a count of the ad-hoc unavailability events considered for each probability calculation
1751    #     and the total scheduled time for the resource in that quarter/time period that was part of
1752    #     the calculations.
1753
1754    #     Returns
1755    #     -------
1756    #     None
1757    #         This function does not return a value but saves the calculated
1758    #         probabilities to 'distribution_data/ad_hoc_unavailability.csv'.
1759    #         The CSV file includes columns for registration, six_hour_bin,
1760    #         quarter, reason, probability, and count.
1761
1762    #     Notes
1763    #     -----
1764    #     This function relies on the following input files:
1765    #     - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records.
1766    #     - 'actual_data/HEMS_ROTA.csv': Contains rota information.
1767    #     - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations.
1768    #     It also depends on the 'calculate_availability_row' method within the
1769    #     same class. Ensure that the 'distribution_data' directory exists.
1770    #     """
1771    #     try:
1772    #         # Load data
1773    #         adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1774    #         adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1775    #         rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1776    #         callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1777
1778    #         # Process each ad hoc record
1779    #         results = adhoc_df.apply(
1780    #             lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end),
1781    #             axis=1
1782    #         )
1783    #         final_df = pd.DataFrame(results.tolist())
1784    #         final_df.to_csv("external_data/ad_hoc_intermediate.csv")
1785
1786    #         # Check if final_df is empty before proceeding
1787    #         if final_df.empty:
1788    #             print("No ad-hoc data processed, skipping file generation.")
1789    #             return
1790
1791    #         # Define the full set of reasons expected
1792    #         all_reasons = ['available', 'crew', 'weather', 'aircraft']
1793
1794    #         # --- Aggregate Data ---
1795    #         # Calculate job count per registration, quarter, AND bin
1796    #         unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts')
1797
1798    #         # Downtime by bin + quarter + reason (only for unavailability reasons)
1799    #         grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])]
1800    #         grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index()
1801
1802    #         # Scheduled time by bin + quarter
1803    #         scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index()
1804    #         scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'})
1805
1806    #         # Merge job counts into scheduled totals
1807    #         scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1808
1809    #         # Calculate total downtime per bin + quarter (for 'available' calculation)
1810    #         downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index()
1811    #         downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'})
1812
1813    #         # --- Create Full Grid ---
1814    #         # Get all unique combinations of registration, quarter, and bin
1815    #         unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates()
1816
1817    #         # Check for empty unique_bins
1818    #         if unique_bins.empty:
1819    #             print("No valid unique bins found, skipping file generation.")
1820    #             return
1821
1822    #         # Create the full grid by crossing unique bins with all reasons
1823    #         full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1)
1824
1825    #         # --- Merge Data into Full Grid ---
1826    #         full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1827    #         full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left')
1828    #         full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left')
1829
1830    #         # Fill NaNs created during merges
1831    #         full_grid['total_offline'] = full_grid['total_offline'].fillna(0)
1832    #         full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0)
1833    #         full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs
1834
1835    #         # --- Calculate Probabilities ---
1836    #         # Suppress division by zero warnings - we handle these next
1837    #         with np.errstate(divide='ignore', invalid='ignore'):
1838    #             prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled']
1839    #             prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled']
1840
1841    #             full_grid['probability'] = np.where(
1842    #                 full_grid['reason'] == 'available',
1843    #                 prob_avail,
1844    #                 prob_unavail
1845    #             )
1846
1847    #         # Handle NaN/Inf from division by zero, set them to 0.0 for now.
1848    #         full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0)
1849
1850    #         # --- Apply Threshold and Blanking ---
1851    #         # Condition for setting probability to blank
1852    #         condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude
1853
1854    #         # Convert probability to object to allow blanks, then apply the condition
1855    #         full_grid['probability'] = full_grid['probability'].astype(object)
1856    #         full_grid.loc[condition_for_blank, 'probability'] = ''
1857
1858    #         # --- Finalize and Save ---
1859    #         # Select and rename columns
1860    #         if include_debugging_cols:
1861    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']]
1862    #             final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"})
1863    #         else:
1864    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']]
1865    #         # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'})
1866
1867    #         # Sort and save
1868    #         final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True)
1869    #         final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False)
1870
1871    #         print("Ad-hoc unavailability probability table generated successfully.")
1872
1873    #     except FileNotFoundError:
1874    #         print("Couldn't generate ad-hoc unavailability due to missing file(s). "
1875    #             "Please ensure 'external_data/ad_hoc.csv', "
1876    #             "'actual_data/HEMS_ROTA.csv', and "
1877    #             "'actual_data/callsign_registration_lookup.csv' exist.")
1878    #         pass
1879    #     except Exception as e:
1880    #         print(f"An error occurred during ad-hoc unavailability processing: {e}")
1881    #         pass
1882
1883    def ad_hoc_unavailability(self, period_start, period_end):
1884        """
1885        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1886
1887        Args:
1888            period_start (str/datetime): Start date for analysis period
1889            period_end (str/datetime): End date for analysis period
1890
1891        Returns:
1892            None. Results are saved to
1893            'distribution_data/ad_hoc_unavailability.csv'.
1894        """
1895        # Load and prepare ad-hoc downtime data
1896        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1897        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1898        # Load rota and callsign lookup data
1899        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1900        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1901        # Merge rota with callsign lookup to get registration numbers to allow matching with
1902        # ad-hoc data, which uses registrations
1903        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1904
1905        # Define the hour bands mapping
1906        HOUR_BANDS = {
1907            '00-05': (0, 6),   # 00:00 to 05:59
1908            '06-11': (6, 12),  # 06:00 to 11:59
1909            '12-17': (12, 18), # 12:00 to 17:59
1910            '18-23': (18, 24)  # 18:00 to 23:59
1911        }
1912
1913        # Create list of 6-hour bins
1914        bins = ['00-05', '06-11', '12-17', '18-23']
1915
1916        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1917            """
1918            Determine if a date falls in summer months (April-September).
1919
1920            Args:
1921                date_obj: Date object to check
1922
1923            Returns:
1924                bool: True if date is in summer months
1925            """
1926            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1927
1928        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1929            """
1930            Determine whether a month number falls in the summer period (April-September by default).
1931
1932            Args:
1933                month: Integer month
1934
1935            Returns:
1936                str: 'summer' if the month is a summer month, otherwise 'winter'
1937            """
1938            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1939
1940        def get_band_for_hour(hour):
1941            """
1942            Return the 6-hour band name for a given hour (0-23).
1943
1944            Args:
1945                hour (int): Hour of day (0-23)
1946
1947            Returns:
1948                str or None: Band name or None if hour is invalid
1949            """
1950            for band_name, (start, end) in HOUR_BANDS.items():
1951                if start <= hour < end:
1952                    return band_name
1953            return None
1954
1955        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1956            """
1957            Calculate how many minutes of a shift fall within a specific time band.
1958
1959            Args:
1960                shift_start (float): Shift start time in hours (0-23.99)
1961                shift_end (float): Shift end time in hours (0-23.99)
1962                band_start (int): Band start hour
1963                band_end (int): Band end hour
1964
1965            Returns:
1966                int: Minutes of overlap between shift and band
1967            """
1968            # Find the overlap between shift and band
1969            overlap_start = max(shift_start, band_start)
1970            overlap_end = min(shift_end, band_end)
1971
1972            if overlap_start < overlap_end:
1973                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1974            return 0
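            # Worked example (illustrative): a 07:00-19:00 shift checked against the '06-11' band
            # (band_start=6, band_end=12) overlaps from 07:00 to 12:00, i.e. 5 hours = 300 minutes;
            # against the '00-05' band (0, 6) there is no overlap, so 0 is returned.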
1975
1976        # Create date range for analysis period
1977        date_range = pd.date_range(start=period_start,
1978                            end=period_end,
1979                            freq='D')
1980        daily_df = pd.DataFrame({'date': date_range})
1981        daily_df['date'] = pd.to_datetime(daily_df['date'])
1982
1983        # Create MultiIndex from date and bin combinations to get all date/time-band combinations
1984        multi_index = pd.MultiIndex.from_product(
1985            [daily_df['date'], bins],
1986            names=['date', 'six_hour_bin']
1987            )
1988
1989        # Convert MultiIndex to DataFrame and reset index
1990        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1991        # Add quarter information for seasonal analysis
1992        daily_df["quarter"] = daily_df.date.dt.quarter
1993
1994        # Initialize availability columns for each aircraft registration
1995        # Each column will store minutes of scheduled time per time band
1996        for registration in full_rota_df['registration'].unique():
1997            daily_df[registration] = 0 # Initialize with 0 minutes
1998
1999        # Calculate scheduled availability for each date/time band combination
2000        for _, row in daily_df.iterrows():
2001            current_date = row['date']
2002            current_band = row['six_hour_bin']
2003            band_start, band_end = HOUR_BANDS[current_band]
2004
2005            is_current_date_summer = is_summer(current_date)
2006
2007            # Get the row index for updating the dataframe
2008            row_idx = daily_df[(daily_df['date'] == current_date) &
2009                            (daily_df['six_hour_bin'] == current_band)].index[0]
2010
2011            # Iterate through each resource's rota entry
2012            for _, rota_entry in full_rota_df.iterrows():
2013                registration = rota_entry['registration']
2014                # Select appropriate start/end times based on season
2015                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2016                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2017                start_hour = rota_entry[start_hour_col]
2018                end_hour = rota_entry[end_hour_col]
2019
2020                total_minutes_for_band = 0
2021
2022                if start_hour < end_hour:
2023                    # Shift within same day
2024                    total_minutes_for_band = calculate_minutes_in_band(
2025                        start_hour, end_hour, band_start, band_end
2026                    )
2027                elif start_hour > end_hour:
2028                    # Shift spans midnight - check both parts
2029
2030                    # Part 1: Today from start_hour to midnight
2031                    if band_end <= 24:  # This band is today
2032                        total_minutes_for_band += calculate_minutes_in_band(
2033                            start_hour, 24, band_start, band_end
2034                        )
2035
2036                    # Part 2: Tomorrow from midnight to end_hour
2037                    # Need to check if this band is for tomorrow
2038                    tomorrow = current_date + pd.Timedelta(days=1)
2039                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2040
2041                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2042                        total_minutes_for_band += calculate_minutes_in_band(
2043                            0, end_hour, band_start, band_end
2044                        )
2045
2046                # Update the scheduled time for this aircraft in this time band
2047                daily_df.loc[row_idx, registration] += total_minutes_for_band
2048
2049        # Aggregate scheduled availability by quarter, time band, and registration
2050        available_time = (
2051            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2052            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2053            .sum().reset_index()
2054            )
2055
2056        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2057            """
2058            Calculate downtime overlap with scheduled rota for a single downtime event.
2059
2060            Args:
2061                row: DataFrame row containing downtime information
2062                rota_df: DataFrame with rota schedules
2063                callsign_lookup_df: DataFrame mapping callsigns to registrations
2064
2065            Returns:
2066                dict: Dictionary containing processed availability data
2067            """
2068            # Extract downtime information
2069            registration = row['aircraft'].lower()
2070            downtime_start = pd.to_datetime(row['offline'], utc=True)
2071            downtime_end = pd.to_datetime(row['online'], utc=True)
2072            reason = row.get('reason', None)
2073
2074            # Determine which 6-hour bin this downtime starts in
2075            hour = downtime_start.hour
2076            if 0 <= hour <= 5:
2077                six_hour_bin = '00-05'
2078            elif 6 <= hour <= 11:
2079                six_hour_bin = '06-11'
2080            elif 12 <= hour <= 17:
2081                six_hour_bin = '12-17'
2082            else:
2083                six_hour_bin = '18-23'
2084
2085            quarter = downtime_start.quarter
2086
2087            # Find the callsign for this registration
2088            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2089
2090            # No matching callsign found
2091            if match.empty:
2092                return {
2093                    'registration': registration,
2094                    'offline': downtime_start,
2095                    'online': downtime_end,
2096                    'six_hour_bin': six_hour_bin,
2097                    'quarter': quarter,
2098                    'total_offline': None,
2099                    'reason': reason
2100                }
2101
2102            callsign = match.iloc[0]['callsign']
2103            # Find rota entries for this callsign
2104            rota_rows = rota_df[rota_df['callsign'] == callsign]
2105            if rota_rows.empty:
2106                return {
2107                    'registration': registration,
2108                    'offline': downtime_start,
2109                    'online': downtime_end,
2110                    'six_hour_bin': six_hour_bin,
2111                    'quarter': quarter,
2112                    'total_offline': None,
2113                    'reason': reason
2114                }
2115
2116            # Determine season for appropriate rota times
2117            month = downtime_start.month
2118            season = check_month_is_summer(month)
2119
2120            total_overlap_minutes = 0
2121
2122            # Calculate overlap between downtime and scheduled rota times
2123            for _, rota in rota_rows.iterrows():
2124                start_hour = rota[f'{season}_start']
2125                end_hour = rota[f'{season}_end']
2126
2127                # Check overlap across multiple days (yesterday, today, tomorrow)
2128                # This handles shifts that span midnight
2129                for base_day in [downtime_start.normalize() - timedelta(days=1),
2130                                downtime_start.normalize(),
2131                                downtime_start.normalize() + timedelta(days=1)]:
2132
2133                    rota_start = base_day + timedelta(hours=start_hour)
2134                    rota_end = base_day + timedelta(hours=end_hour)
2135
2136                    # Handle shifts that cross midnight
2137                    if end_hour <= start_hour:
2138                        rota_end += timedelta(days=1)
2139                    # Calculate overlap between downtime and this rota period
2140                    overlap_start = max(downtime_start, rota_start)
2141                    overlap_end = min(downtime_end, rota_end)
2142
2143                    if overlap_end > overlap_start:
2144                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2145
2146                        total_overlap_minutes += overlap_minutes
2147
2148            return {
2149                'registration': registration,
2150                'offline': downtime_start,
2151                'online': downtime_end,
2152                'six_hour_bin': six_hour_bin,
2153                'quarter': quarter,
2154                'total_offline': total_overlap_minutes,
2155                'reason': reason
2156            }
2157
2158        # Process all ad-hoc downtime events
2159        results = adhoc_df.apply(
2160                        lambda row: calculate_availability_row(
2161                            row, rota_df, callsign_lookup_df),
2162                        axis=1
2163                    )
2164
2165        # Convert results to DataFrame and select relevant columns
2166        unavailability_minutes_df = (
2167            pd.DataFrame(results.tolist())
2168            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2169            )
2170
2171        # Aggregate offline time by registration, time band, quarter, and reason
2172        offline_durations = unavailability_minutes_df.groupby(
2173            ["registration", "six_hour_bin", "quarter", "reason"]
2174            )[["total_offline"]].sum().reset_index()
2175
2176        # Create complete combinations to ensure all possible categories are represented
2177        # This prevents missing data issues in the final output
2178        registrations = offline_durations['registration'].unique()
2179        six_hour_bins = offline_durations['six_hour_bin'].unique()
2180        quarters = offline_durations['quarter'].unique()
2181        reasons = offline_durations['reason'].unique()
2182
2183        # Generate all possible combinations
2184        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2185
2186        # Create complete dataframe with all combinations
2187        complete_df = pd.DataFrame(
2188            all_combinations,
2189            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2190            )
2191
2192        # Merge with original data to preserve existing values, fill missing with 0
2193        offline_durations = complete_df.merge(
2194            offline_durations,
2195            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2196            how='left'
2197            )
2198
2199        # Fill NaN values with 0 (no downtime for those combinations)
2200        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2201
2202        # Sort for better readability
2203        offline_durations = offline_durations.sort_values(
2204            ['registration', 'six_hour_bin', 'quarter', 'reason']
2205            ).reset_index(drop=True)
2206
2207        # Ensure consistent case for registration names
2208        available_time["registration"] = available_time["registration"].str.lower()
2209        offline_durations["registration"] = offline_durations["registration"].str.lower()
2210
2211        # Merge scheduled time with downtime data
2212        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2213        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2214
2215        # Calculate availability probability (1 - sum of all unavailability probabilities)
2216        available_prop_df = ad_hoc.groupby(
2217            ["quarter", "six_hour_bin", "registration", "rota_time"]
2218            )[["probability"]].sum().reset_index()
2219
2220        available_prop_df["reason"] = "available"
2221        available_prop_df["probability"] = 1 - available_prop_df["probability"]
2222
2223        # Combine unavailability and availability data
2224        final_ad_hoc_df = (
2225            pd.concat([ad_hoc, available_prop_df])
2226            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2227            )
2228
2229        # Handle cases where there's no scheduled time (set probability to NaN)
2230        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2231            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2232            axis=1
2233            )
2234
2235        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2236
2237        # Save results to CSV
2238        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2239            "distribution_data/ad_hoc_unavailability.csv", index=False
2240            )
2241
2242
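        # Illustrative usage sketch (dates are placeholders; assumes the input CSVs read above exist):
        #   DistributionFitUtils('data/my_data.csv').ad_hoc_unavailability(
        #       period_start="2023-01-01", period_end="2024-12-31")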
2243    def run_sim_on_historical_params(self):
2244        # Ensure all rotas are using default values
2245        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2246        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2247
2248        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2249        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2250
2251        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2252        service_history.to_csv("actual_data/service_history.csv", index=False)
2253
2254        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2255        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2256
2257        print("Generating simulation results...")
2258        removeExistingResults()
2259
2260        total_runs = 30
2261        sim_years = 2
2262        sim_duration = 60 * 24 * 7 * 52 * sim_years
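            # i.e. 52 weeks x 7 days x 24 hours x 60 minutes per simulated year (524,160 minutes per year)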
2263
2264        parallelProcessJoblib(
2265            total_runs=total_runs,
2266            sim_duration=sim_duration,
2267            warm_up_time=0,
2268            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2269            amb_data=False,
2270            print_debug_messages=True
2271        )
2272
2273        collateRunResults()
2274
2275        try:
2276            results_all_runs = pd.read_csv("data/run_results.csv")
2277            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2278
2279            # save data of counts of suboptimal care category sent
2280            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2281
2282            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2283
2284            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2285
2286            # save data of counts of suboptimal vehicle type sent
2287            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2288
2289            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2290
2291            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2292
2293
2294            # # Also run the model to get some base-case outputs
2295            # resource_requests = (
2296            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2297            #     .copy()
2298            #     )
2299
2300            # resource_requests["care_cat"] = (
2301            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2302            #                             and x["care_cat"]=="REG" else x["care_cat"],
2303            #                             axis=1)
2304            #                             )
2305
2306            # missed_jobs_care_cat_summary = (
2307            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2308            #     .sort_values(["care_cat", "time_type"])
2309            #     .copy()
2310            #     )
2311
2312            # missed_jobs_care_cat_summary["jobs_average"] = (
2313            #     missed_jobs_care_cat_summary["jobs"]/
2314            #     total_runs
2315            #     )
2316
2317            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2318            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2319            #     ).round(0)
2320
2321            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2322                    results_all_runs=results_all_runs,
2323                    run_length_days = float(sim_years*365),
2324                    what="summary"
2325                    )
2326
2327            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2328
2329
2330            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2331                    results_all_runs=results_all_runs,
2332                    run_length_days = float(sim_years*365),
2333                    what="breakdown"
2334                    )
2335
2336            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2337
2338        except FileNotFoundError:
2339            pass

The DistributionFitUtils class

This class will import a CSV, undertake some light wrangling and then determine distributions and probabilities required for the Discrete Event Simulation

example usage: my_data = DistributionFitUtils('data/my_data.csv'); my_data.import_and_wrangle()

DistributionFitUtils( file_path: str, calculate_school_holidays=False, school_holidays_years=0)
36    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
37
38        self.file_path = file_path
39        self.df = pd.DataFrame()
40
41        # The number of additional years of school holidays
42        # that will be calculated over that maximum date in the provided dataset
43        self.school_holidays_years = school_holidays_years
44        self.calculate_school_holidays = calculate_school_holidays
45
46        self.times_to_fit = [
47            {"hems_result": "Patient Treated but not conveyed by HEMS",
48            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
49            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
50            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
51            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
52            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
53        ]
54
55        self.sim_tools_distr_plus = [
56            "poisson",
57            "bernoulli",
58            "triang",
59            "erlang",
60            "weibull_min",
61            "expon_weib",
62            "betabinom",
63            "pearson3",
64            "cauchy",
65            "chi2",
66            "expon",
67            "exponpow",
68            "gamma",
69            "lognorm",
70            "norm",
71            "powerlaw",
72            "rayleigh",
73            "uniform",
74            "neg_binomial",
75            "zip"
76        ]
77        # SR 16-04-2025 Have hardcoded the common distributions
78        # to make setup for random number generation more robust
79        #+ get_common_distributions()
file_path
df
school_holidays_years
calculate_school_holidays
times_to_fit
sim_tools_distr_plus
def removeExistingResults(self, folder: str) -> None:
81    def removeExistingResults(self, folder: str) -> None:
82            """
83                Removes results from previous fitting
84            """
85
86            matching_files = glob.glob(os.path.join(folder, "*.*"))
87
88            print(matching_files)
89
90            for file in matching_files:
91                os.remove(file)

Removes results from previous fitting

def getBestFit( self, q_times, distr=['cauchy', 'chi2', 'expon', 'exponpow', 'gamma', 'lognorm', 'norm', 'powerlaw', 'rayleigh', 'uniform'], show_summary=False):
 94    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
 95        """
 96
 97            Convenience function for Fitter.
 98            Returns the model and parameters that are considered
 99            the 'best fit'.
100
101            TODO: Determine how Fitter works this out
102
103        """
104
105        if(q_times.size > 0):
106            if(len(distr) > 0):
107                f = Fitter(q_times, timeout=60, distributions=distr)
108            else:
109                f = Fitter(q_times, timeout=60)
110            f.fit()
111            if show_summary == True:
112                f.summary()
113            return f.get_best()
114        else:
115            return {}

Convenience function for Fitter. Returns the model and parameters that are considered the 'best fit'.

TODO: Determine how Fitter works this out
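A note on the TODO above: as I understand recent releases of the fitter package, get_best() ranks the candidate distributions by the sum of squared errors between each fitted PDF and the data histogram, and other criteria can be requested via its method argument. The snippet below is a minimal sketch under that assumption, not part of this module, and the data are made up for illustration.

    import numpy as np
    from fitter import Fitter

    # Illustrative data only; in this module q_times comes from the wrangled CSV
    q_times = np.random.default_rng(42).gamma(shape=2.0, scale=15.0, size=500)

    f = Fitter(q_times, distributions=["gamma", "lognorm", "expon"], timeout=60)
    f.fit()
    print(f.get_best(method="sumsquare_error"))  # default ranking criterion (assumed)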

def import_and_wrangle(self):
118    def import_and_wrangle(self):
119        """
120
121            Function to import CSV, add additional columns that are required
122            and then sequentially execute other class functions to generate
123            the probabilities and distributions required for the DES.
124
125            TODO: Additional logic is required to check the imported CSV
126            for missing values, incorrect column names etc.
127
128        """
129
130        try:
131            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
132            self.df = df
133
134            # Perhaps run some kind of checking function here.
135
136        except FileNotFoundError:
137            print(f"Cannot locate file: {self.file_path}")
138
139        # If everything is okay, crack on...
140        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
141        self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date)
142        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
143        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
144        self.df['month'] = self.df['inc_date'].dt.month
145        self.df['quarter'] = self.df['inc_date'].dt.quarter
146        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
147
148        # Replacing an upper quartile limit on job cycle times and
149        # instead using a manually specified time frame.
150        # This has the advantage of allowing manual amendment of the values
151        # on the front-end
152        # self.max_values_df = self.upper_allowable_time_bounds()
153        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
154        #print(self.min_max_values_df)
155
156        #This will be needed for other datasets, but has already been computed for DAA
157        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
158
159        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
160        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
161
162        #get proportions of AMPDS card by hour of day
163        self.hour_by_ampds_card_probs()
164
165        # Determine 'best' distributions for time-based data
166        self.activity_time_distributions()
167
168        # Calculate probability patient will be female based on AMPDS card
169        self.sex_by_ampds_card_probs()
170
171        # Determine 'best' distributions for age ranges stratified by AMPDS card
172        self.age_distributions()
173
174        # Alternative approach to IA times. Start with probability of call at given hour stratified by quarter
175        self.hourly_arrival_by_qtr_probs()
176
177        # Calculates the mean and standard deviation of the number of incidents per day stratified by quarter
178        self.incidents_per_day()
179        self.incidents_per_day_samples()
180
181        # Calculate probability of enhanced or critical care being required based on AMPDS card
182        self.enhanced_or_critical_care_by_ampds_card_probs()
183
184        # Calculate HEMS result
185        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
186
187        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
188        # self.callsign_group_by_ampds_card_and_hour_probs()
189        # self.callsign_group_by_ampds_card_probs()
190        # self.callsign_group_by_care_category()
191        self.callsign_group()
192
193        # Calculate probability of a particular vehicle type based on callsign group and month of year
194        # self.vehicle_type_by_month_probs()
195        self.vehicle_type_by_quarter_probs()
196        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
197
198        # Calculate the patient outcome (conveyed, deceased, unknown)
199        self.patient_outcome_by_care_category_and_quarter_probs()
200
201        # ============= ARCHIVED CODE ================= #
202        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
203        # self.inter_arrival_times()
204        # ============= END ARCHIVED CODE ================= #
205
206
207        # ============= ARCHIVED CODE ================= #
208        # Calculate probability of patient outcome
209        # Note - this still needs to be run to support another one?
210        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
211        # self.pt_outcome_by_hems_result_and_care_category_probs()
212        # ============= END ARCHIVED CODE ================= #
213
214        # ============= ARCHIVED CODE ================= #
215        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
216        # ============= END ARCHIVED CODE ================= #
217
218        # ============= ARCHIVED CODE ================= #
219        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
220        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
221        # ============= END ARCHIVED CODE ================= #
222
223        # ============= ARCHIVED CODE ================= #
224        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
225        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
226        # ============= END ARCHIVED CODE ================= #
227
228
229        # Calculate school holidays since servicing schedules typically avoid these dates
230        if self.calculate_school_holidays:
231            self.school_holidays()
232
233        # Calculate historical data
234        self.historical_monthly_totals()
235        self.historical_monthly_totals_by_callsign()
236        self.historical_monthly_totals_by_day_of_week()
237        self.historical_median_time_of_activities_by_month_and_resource_type()
238        self.historical_monthly_totals_by_hour_of_day()
239        self.historical_monthly_resource_utilisation()
240        self.historical_monthly_totals_all_calls()
241        self.historical_daily_calls_breakdown()
242        self.historical_job_durations_breakdown()
243        self.historical_missed_jobs()
244        self.historical_jobs_per_day_per_callsign()
245        self.historical_care_cat_counts()
246
247        # Calculate proportions of ad hoc unavailability
248        try:
249            # self.ad_hoc_unavailability()
250            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
251        except FileNotFoundError:
252            print("Couldn't find ad-hoc unavailability file")

Function to import CSV, add additional columns that are required and then sequentially execute other class functions to generate the probabilities and distributions required for the DES.

TODO: Additional logic is required to check the imported CSV for missing values, incorrect column names, etc.
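The check hinted at in the TODO could be a small helper run straight after read_csv(). The sketch below is hypothetical: the helper name and the required-column list are assumptions inferred from the columns used later in this method, and it is not part of the class.

    import pandas as pd

    # Columns assumed to be required, based on what import_and_wrangle() uses later
    REQUIRED_COLUMNS = ["inc_date", "ampds_card", "sex", "age", "hems_result",
                        "vehicle_type", "callsign_group"]

    def check_csv(df: pd.DataFrame) -> list:
        """Return a list of human-readable problems found in the imported data."""
        problems = [f"Missing column: {c}" for c in REQUIRED_COLUMNS if c not in df.columns]
        if "inc_date" in df.columns and pd.to_datetime(df["inc_date"], errors="coerce").isna().any():
            problems.append("Some inc_date values could not be parsed as dates")
        return problems

import_and_wrangle() could then warn, or stop, if the returned list is non-empty.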

def hour_by_ampds_card_probs(self):
254    def hour_by_ampds_card_probs(self):
255        """
256
257            Calculates the proportions of calls that are triaged with
258            a specific AMPDS card. This is stratified by hour of day
259
260            TODO: Determine whether this should also be stratified by yearly quarter
261
262        """
263        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
264        total_counts = category_counts.groupby('hour')['count'].transform('sum')
265        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
266
267        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
268
269        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")

Calculates the proportions of calls that are triaged with a specific AMPDS card. This is stratified by hour of day

TODO: Determine whether this should also be stratified by yearly quarter
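If the proportions were also stratified by yearly quarter, as the TODO suggests, the change amounts to adding 'quarter' to both groupings. A minimal sketch of what the method body might become (the output filename is an assumption):

    category_counts = self.df.groupby(['quarter', 'hour', 'ampds_card']).size().reset_index(name='count')
    total_counts = category_counts.groupby(['quarter', 'hour'])['count'].transform('sum')
    category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
    category_counts.to_csv('distribution_data/hour_by_ampds_card_by_qtr_probs.csv', mode="w+")  # hypothetical filename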

def sex_by_ampds_card_probs(self):
272    def sex_by_ampds_card_probs(self):
273        """
274
275            Calculates the probability that the patient will be female
276            stratified by AMPDS card.
277
278        """
279        age_df = self.df
280        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
281        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
282        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
283
284        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")

Calculates the probability that the patient will be female stratified by AMPDS card.

def activity_time_distributions(self):
286    def activity_time_distributions(self):
287        """
288
289            Determine the 'best' distribution for each phase of a call
290            i.e. Allocation time, Mobilisation time, Time to scene
291            Time on scene, Travel time to hospital and handover, Time to clear.
292            Not all times will apply to all cases, so the class 'times_to_fit'
293            variable is a list of dictionaries, which contains the times to fit
294
295            The data is currently stratified by HEMS_result and vehicle type fields.
296
297        """
298
299        vehicle_type = self.df['vehicle_type'].dropna().unique()
300
301        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
302        # Probably easier than complicated logic to determine what times should be available based on hems_result
303
304        final_distr = []
305
306        for row in self.times_to_fit:
307            #print(row)
308            for ttf in row['times_to_fit']:
309                for vt in vehicle_type:
310                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
311
312                    # This line might not be required if data quality is determined when importing the data
313                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
314                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
315
316                    #print(f"Max time is {max_time} and Min time is {min_time}")
317
318                    if ttf == 'time_on_scene':
319                        # There is virtually no data for HEMS_result other than patient conveyed
320                        # which is causing issues with fitting. For time on scene, will
321                        # use a simplified fitting ignoring hems_result as a category
322                        fit_times = self.df[
323                            (self.df.vehicle_type == vt) &
324                            (self.df[ttf] >= min_time) &
325                            (self.df[ttf] <= max_time)
326                        ][ttf]
327                    else:
328                        fit_times = self.df[
329                            (self.df.vehicle_type == vt) &
330                            (self.df[ttf] >= min_time) &
331                            (self.df[ttf] <= max_time) &
332                            (self.df.hems_result == row['hems_result'])
333                        ][ttf]
334                    #print(fit_times[:10])
335                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
336                    #print(best_fit)
337
338                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
339                    #print(return_dict)
340                    final_distr.append(return_dict)
341
342        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
343            convert_file.write(json.dumps(final_distr))
344        convert_file.close()

Determine the 'best' distribution for each phase of a call, i.e. Allocation time, Mobilisation time, Time to scene, Time on scene, Travel time to hospital and handover, and Time to clear. Not all times will apply to all cases, so the class 'times_to_fit' variable is a list of dictionaries, which contains the times to fit.

The data is currently stratified by the HEMS_result and vehicle type fields.
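The resulting activity_time_distributions.txt file is a JSON list in which each entry records the vehicle type, time type, HEMS result, sample size and the best_fit mapping returned by Fitter.get_best(). A minimal consumption sketch, assuming best_fit looks like {"lognorm": {"s": ..., "loc": ..., "scale": ...}} with scipy.stats parameter names; entries whose distribution names are not plain scipy.stats names (e.g. "zip", "neg_binomial") would need their own mapping:

    import json
    import scipy.stats as st

    with open("distribution_data/activity_time_distributions.txt") as f:
        entries = json.load(f)

    entry = entries[0]                                # e.g. a helicopter / time_allocation entry
    (dist_name, params), = entry["best_fit"].items()  # single-key dict from get_best()
    sampled_minutes = getattr(st, dist_name).rvs(**params)
    print(entry["vehicle_type"], entry["time_type"], round(sampled_minutes, 1))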

def age_distributions(self):
347    def age_distributions(self):
348        """
349
350            Determine the 'best' distribution for age stratified by
351            AMPDS card
352
353        """
354
355        age_distr = []
356
357        age_df = self.df[["age", "ampds_card"]].dropna()
358        ampds_cards = age_df['ampds_card'].unique()
359        print(ampds_cards)
360
361        for card in ampds_cards:
362            fit_ages = age_df[age_df['ampds_card'] == card]['age']
363            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
364            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
365            age_distr.append(return_dict)
366
367        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
368            convert_file.write(json.dumps(age_distr))
369        convert_file.close()

Determine the 'best' distribution for age stratified by AMPDS card

def incidents_per_day(self):
403    def incidents_per_day(self):
404        """
405        Fit distributions for number of incidents per day using actual daily counts,
406        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
407        stratified by season and quarter.
408        """
409        import math
410        import json
411        import numpy as np
412
413        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
414        inc_df['year'] = inc_df['date_only'].dt.year
415
416        # Daily incident counts
417        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
418        inc_per_day['year'] = inc_per_day['date_only'].dt.year
419
420        # Merge quarter and season from self.df
421        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
422
423        if 'season' not in self.df.columns:
424            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
425        else:
426            date_info = date_info.merge(
427                self.df[['date_only', 'season']].drop_duplicates(),
428                on='date_only',
429                how='left'
430            )
431
432        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
433
434        # Weight settings - simple implementation rather than biased mean thing
435        year_weights = {
436            2023: 1.0,
437            2024: 4.0  # weight 2024 four times as heavily as 2023
438        }
439
440        # ========== SEASONAL DISTRIBUTIONS ==========
441        jpd_distr = []
442
443        for season in inc_per_day['season'].dropna().unique():
444            filtered = inc_per_day[inc_per_day['season'] == season].copy()
445            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
446
447            # Repeat rows proportionally by weight
448            replicated = filtered.loc[
449                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
450            ]['jobs_per_day']
451
452            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
453
454            jpd_distr.append({
455                "season": season,
456                "best_fit": best_fit,
457                "min_n_per_day": int(replicated.min()),
458                "max_n_per_day": int(replicated.max()),
459                "mean_n_per_day": float(replicated.mean())
460            })
461
462        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
463            json.dump(jpd_distr, f)
464
465        # ========== QUARTERLY DISTRIBUTIONS ==========
466        jpd_qtr_distr = []
467
468        for quarter in inc_per_day['quarter'].dropna().unique():
469            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
470            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
471
472            replicated = filtered.loc[
473                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
474            ]['jobs_per_day']
475
476            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
477
478            jpd_qtr_distr.append({
479                "quarter": int(quarter),
480                "best_fit": best_fit,
481                "min_n_per_day": int(replicated.min()),
482                "max_n_per_day": int(replicated.max()),
483                "mean_n_per_day": float(replicated.mean())
484            })
485
486        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
487            json.dump(jpd_qtr_distr, f)

Fit distributions for number of incidents per day using actual daily counts, applying year-based weighting to reflect trends (e.g., 2024 busier than 2023), stratified by season and quarter.
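The year weighting above works by replicating each day's count in proportion to its weight before fitting: with the scale factor of 10, a 2023 day (weight 1.0) contributes 10 copies of its count and a 2024 day (weight 4.0) contributes 40. A small self-contained illustration of the replication step, with made-up values:

    import pandas as pd

    filtered = pd.DataFrame({
        "year": [2023, 2024],
        "jobs_per_day": [6, 9],
        "weight": [1.0, 4.0],
    })
    replicated = filtered.loc[
        filtered.index.repeat((filtered["weight"] * 10).round().astype(int))
    ]["jobs_per_day"]
    print(replicated.value_counts())  # 6 appears 10 times, 9 appears 40 times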

def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
489    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
490        """
491            Create weighted empirical samples of incidents per day by season and quarter.
492        """
493
494        inc_df = self.df[['date_only', 'quarter']].copy()
495        inc_df['year'] = inc_df['date_only'].dt.year
496        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
497
498        # Get raw counts per day
499        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
500        daily_counts['year'] = daily_counts['date_only'].dt.year
501
502        # Merge back in season/quarter info
503        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
504        if 'season' in self.df.columns:
505            meta_info = meta_info.merge(
506                self.df[['date_only', 'season']].drop_duplicates(),
507                on='date_only', how='left'
508            )
509        else:
510            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
511
512        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
513
514        # Year weight map
515        if weight_map is None:
516            weight_map = {2023: 1.0, 2024: 1.1}
517
518        # Compute weights
519        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
520
521        # Storage
522        empirical_samples = {}
523
524        # Season-based
525        for season in daily_counts['season'].dropna().unique():
526            filtered = daily_counts[daily_counts['season'] == season].copy()
527            repeated = filtered.loc[
528                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
529            ]['jobs_per_day'].tolist()
530
531            empirical_samples[season] = repeated
532
533        # Quarter-based
534        for quarter in daily_counts['quarter'].dropna().unique():
535            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
536            repeated = filtered.loc[
537                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
538            ]['jobs_per_day'].tolist()
539
540            empirical_samples[f"Q{int(quarter)}"] = repeated
541
542        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
543            json.dump(empirical_samples, f)

Create weighted empirical samples of incidents per day by season and quarter.
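The JSON written here is keyed by season ("winter"/"summer") and quarter ("Q1".."Q4"), each value being the weighted, replicated list of daily counts. How the simulation draws from it is not shown in this module; one plausible approach, sketched below as an assumption, is a uniform draw from the list, since the year weighting is already baked into the replication:

    import json
    import random

    with open("distribution_data/inc_per_day_samples.json") as f:
        samples = json.load(f)

    rng = random.Random(42)
    jobs_today = rng.choice(samples["Q1"])  # uniform draw from the weighted empirical sample
    print(jobs_today)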

def enhanced_or_critical_care_by_ampds_card_probs(self):
546    def enhanced_or_critical_care_by_ampds_card_probs(self):
547        """
548
549            Calculates the probability of an enhanced or critical care resource being required
550            based on the AMPDS card
551
552        """
553
554        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
555
556        def assign_care_category(row):
557            # There are some columns with both EC and CC benefit selected
558            # this function will allocate to only 1
559            if row['cc_benefit'] == 'y':
560                return 'CC'
561            elif row['ec_benefit'] == 'y':
562                return 'EC'
563            else:
564                return 'REG'
565
566        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
567
568        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
569        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
570
571        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
572
573        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)

Calculates the probability of an enhanced or critical care resource being required based on the AMPDS card

def patient_outcome_by_care_category_and_quarter_probs(self):
575    def patient_outcome_by_care_category_and_quarter_probs(self):
576        """
577
578            Calculates the probability of a patient outcome based on care category and yearly quarter
579
580        """
581
582        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
583
584        def assign_care_category(row):
585            # There are some columns with both EC and CC benefit selected
586            # this function will allocate to only 1
587            if row['cc_benefit'] == 'y':
588                return 'CC'
589            elif row['ec_benefit'] == 'y':
590                return 'EC'
591            else:
592                return 'REG'
593
594        # There are some values that are missing e.g. CC quarter 1 Deceased
595        # I think we've had problems when trying to sample from this kind of thing before
596        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
597        outcomes = po_df['pt_outcome'].unique()
598        care_categories = ['CC', 'EC', 'REG']
599        quarters = po_df['quarter'].unique()
600
601        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
602                                    columns=['pt_outcome', 'care_category', 'quarter'])
603
604        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
605
606        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
607
608        merged = pd.merge(all_combinations, po_cat_counts,
609                      on=['pt_outcome', 'care_category', 'quarter'],
610                      how='left').fillna({'count': 0})
611        merged['count'] = merged['count'].astype(int)
612
613        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
614        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
615
616        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)

Calculates the probability of a patient outcome based on care category and yearly quarter

def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
654    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
655        """
656
657            Calculates the probability of a given HEMS result based on
658                - patient outcome
659                - yearly quarter
660                - time of day (7am - 6pm, 7pm - 6am)
661                - vehicle type
662                - and callsign group
663
664        """
665        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
666        self.df['hour'] = self.df['inc_date'].dt.hour
667        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
668
669        hr_df = self.df[[
670            'hems_result', 'quarter', 'pt_outcome',
671            'vehicle_type', 'callsign_group', 'time_of_day'
672            ]].copy()
673
674        # There are some values that are missing e.g. CC quarter 1 Deceased
675        # I think we've had problems when trying to sample from this kind of thing before
676        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
677        # hems_results = hr_df['hems_result'].unique()
678        # outcomes = hr_df['pt_outcome'].unique()
679        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
680        # callsign_group_categories = hr_df['callsign_group'].unique()
681        # quarters = hr_df['quarter'].unique()
682
683        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
684        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
685
686        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
687                                       'vehicle_type', 'callsign_group',
688                                       'quarter', 'time_of_day']).size().reset_index(name='count')
689
690        # merged = pd.merge(all_combinations, hr_cat_counts,
691        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
692        #               how='left').fillna({'count': 0})
693        # merged['count'] = merged['count'].astype(int)
694
695        merged = hr_cat_counts
696
697        total_counts = merged.groupby(['pt_outcome',
698                                       'vehicle_type', 'callsign_group',
699                                       'quarter', 'time_of_day'])['count'].transform('sum')
700        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
701
702        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)

Calculates the probability of a given HEMS result based on:

  • patient outcome
  • yearly quarter
  • time of day (7am - 6pm, 7pm - 6am)
  • vehicle type
  • callsign group

def hourly_arrival_by_qtr_probs(self):
706    def hourly_arrival_by_qtr_probs(self):
707        """
708
709            Calculates the proportions of calls arriving in any given hour
710            stratified by yearly quarter
711
712        """
713
714        ia_df = self.df[['quarter', 'hour']].dropna()
715
716        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
717        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
718        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
719
720        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")

Calculates the proportions of calls arriving in any given hour stratified by yearly quarter

def callsign_group_by_ampds_card_and_hour_probs(self):
723    def callsign_group_by_ampds_card_and_hour_probs(self):
724        """
725
726            Calculates the probability of a specific callsign being allocated to
727            a call based on the AMPDS card category and hour of day
728
729        """
730        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
731
732        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
733        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
734
735        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call based on the AMPDS card category and hour of day

def callsign_group_by_ampds_card_probs(self):
737    def callsign_group_by_ampds_card_probs(self):
738        """
739
740            Calculates the probability of a specific callsign being allocated to
741            a call based on the AMPDS card category
742
743        """
744
745        callsign_df = self.df[self.df['callsign_group'] != 'Other']
746
747        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
748
749        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
750        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
751
752        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call based on the AMPDS card category

def callsign_group(self):
754    def callsign_group(self):
755        """
756            Calculates the probability of a specific callsign being allocated to
757            a call
758        """
759        df = self.df.copy()
760
761        # Convert time fields to numeric
762        time_fields = [
763            "time_allocation", "time_mobile", "time_to_scene",
764            "time_on_scene", "time_to_hospital", "time_to_clear"
765        ]
766        for col in time_fields:
767            df[col] = pd.to_numeric(df[col], errors="coerce")
768
769        # Calculate total job duration in minutes
770        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
771
772        # Compute job start and end times
773        df["start_time"] = df["inc_date"]
774        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
775
776        # Sort jobs by start time
777        df = df.sort_values(by="start_time").reset_index(drop=True)
778
779        # Set to hold indices of jobs that overlap (but only the later-starting ones)
780        overlapping = set()
781
782        # Check for overlaps
783        for i in range(len(df)):
784            this_end = df.at[i, "end_time"]
785
786            # Compare only to later jobs
787            for j in range(i + 1, len(df)):
788                next_start = df.at[j, "start_time"]
789                if next_start >= this_end:
790                    break  # No more possible overlaps
791                # If it starts before i's job ends, it's overlapping
792                overlapping.add(j)
793
794        # Mark the overlaps in the dataframe
795        df["overlaps"] = df.index.isin(overlapping)
796
797        # Filter out overlapping jobs
798        df_no_overlap = df[~df["overlaps"]]
799
800        # We will use the ad-hoc unavailability to remove any instances where we already know one of
801        # the vehicles to be recorded as offline
802
803        data = df_no_overlap.copy()
804
805        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
806        # off for servicing if that data is available
807        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
808        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
809
810        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
811        data["vehicle"] = data["vehicle"].str.lower()
812
813        # Create a cross-join between data and ad_hoc
814        data['key'] = 1
815        ad_hoc['key'] = 1
816        merged = data.merge(ad_hoc, on='key')
817
818        # Keep rows where inc_date falls within the offline period
819        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
820
821        # Filter out those rows from the original data
822        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
823
824        callsign_df = (
825            df_no_overlap
826            .assign(
827                helicopter_benefit=np.select(
828                    [
829                        df_no_overlap["cc_benefit"] == "y",
830                        df_no_overlap["ec_benefit"] == "y",
831                        df_no_overlap["hems_result"].isin([
832                            "Stand Down En Route",
833                            "Landed but no patient contact",
834                            "Stand Down Before Mobile"
835                        ])
836                    ],
837                    ["y", "y", "n"],
838                    default=df_no_overlap["helicopter_benefit"]
839                ),
840                care_category=np.select(
841                    [
842                        df_no_overlap["cc_benefit"] == "y",
843                        df_no_overlap["ec_benefit"] == "y"
844                    ],
845                    ["CC", "EC"],
846                    default="REG"
847                )
848            )
849        )
850
851        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
852
853        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
854
855        total_counts = len(callsign_df)
856        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
857
858        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call
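The overlap-removal step above sorts jobs by start time and flags any job that starts before an earlier job has finished, keeping only the earliest job of each overlapping cluster. A tiny worked illustration of that sweep, with made-up times:

    import pandas as pd

    jobs = pd.DataFrame({
        "start_time": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 10:30", "2024-01-01 12:00"]),
        "end_time":   pd.to_datetime(["2024-01-01 11:00", "2024-01-01 11:15", "2024-01-01 13:00"]),
    })
    overlapping = set()
    for i in range(len(jobs)):
        this_end = jobs.at[i, "end_time"]
        for j in range(i + 1, len(jobs)):
            if jobs.at[j, "start_time"] >= this_end:
                break                  # later jobs start even later, so stop checking
            overlapping.add(j)         # job j starts before job i ends
    print(sorted(overlapping))         # -> [1]: the 10:30 job overlaps the 10:00 job; 12:00 is kept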

def vehicle_type_by_quarter_probs(self):
 961    def vehicle_type_by_quarter_probs(self):
 962        """
 963
 964            Calculates the probability of a car/helicopter being allocated to
 965            a call based on the callsign group and quarter of the year
 966
 967            Quarter accounts for seasonal variation without being as affected by
 968
 969        """
 970        data = self.df.copy()
 971
 972        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 973        # the vehicles to be recorded as offline
 974
 975        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 976        # off for servicing if that data is available
 977        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 978        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 979
 980        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 981        data["vehicle"] = data["vehicle"].str.lower()
 982
 983        # Create a cross-join between data and ad_hoc
 984        data['key'] = 1
 985        ad_hoc['key'] = 1
 986        merged = data.merge(ad_hoc, on='key')
 987
 988        # Keep rows where inc_date falls within the offline period
 989        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 990
 991        # Filter out those rows from the original data
 992        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 993
 994        # First, calculate overall props
 995        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 996
 997        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 998        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 999
1000        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1001
1002        # Then, redo by quarter
1003        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1004
1005        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1006        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1007
1008        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")

Calculates the probability of a car/helicopter being allocated to a call based on the callsign group and quarter of the year

Quarter accounts for seasonal variation without being as affected by

def vehicle_type_probs(self):
1010    def vehicle_type_probs(self):
1011        """
1012
1013            Calculates the probability of a car/helicopter being allocated to
1014            a call based on the callsign group
1015
1016        """
1017
1018        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1019
1020        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1021        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1022
1023        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")

Calculates the probability of a car/helicopter being allocated to a call based on the callsign group

def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1025    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1026        """
1027
1028            Calculates the probability of a specific HEMS result being allocated to
1029            a call based on the callsign group and vehicle type
1030
1031            TODO: These probability calculation functions could probably be refactored into a single
1032            function and just specify columns and output name
1033
1034        """
1035        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1036
1037        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1038        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1039
1040        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific HEMS result being allocated to a call based on the callsign group and vehicle type

TODO: These probability calculation functions could probably be refactored into a single function and just specify columns and output name
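Following the TODO, a single generic helper could replace most of the count-then-normalise functions in this class. The sketch below is hypothetical; the name and signature are assumptions, not part of the codebase:

    import pandas as pd

    def save_proportions(df: pd.DataFrame, group_cols: list, within_cols: list,
                         out_path: str, dp: int = 4) -> pd.DataFrame:
        """Count rows by group_cols, express counts as proportions within within_cols, save to CSV."""
        counts = df.groupby(group_cols).size().reset_index(name="count")
        totals = counts.groupby(within_cols)["count"].transform("sum")
        counts["proportion"] = round(counts["count"] / totals, dp)
        counts.to_csv(out_path, mode="w+", index=False)
        return counts

    # For example, hems_result_by_callsign_group_and_vehicle_type_probs() could become:
    # save_proportions(self.df,
    #                  ["hems_result", "callsign_group", "vehicle_type"],
    #                  ["callsign_group", "vehicle_type"],
    #                  "distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv")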

def school_holidays(self) -> None:
1132    def school_holidays(self) -> None:
1133        """
1134            Function to generate a CSV file containing school holiday
1135            start and end dates for a given year. The year range is determined
1136            by the submitted data (plus a year at the end of the study for good measure)
1137        """
1138
1139        min_date = self.df.inc_date.min()
1140        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1141
1142        u = Utils()
1143
1144        years_of_holidays_list = u.years_between(min_date, max_date)
1145
1146        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1147
1148        for i, year in enumerate(years_of_holidays_list):
1149            tmp = u.calculate_term_holidays(year)
1150
1151            if i == 0:
1152                sh = tmp
1153            else:
1154                sh = pd.concat([sh, tmp])
1155
1156        sh.to_csv('actual_data/school_holidays.csv', index = False)

" Function to generate a CSV file containing schoole holiday start and end dates for a given year. The Year range is determined by the submitted data (plus a year at the end of the study for good measure)

def historical_jobs_per_day_per_callsign(self):
1161    def historical_jobs_per_day_per_callsign(self):
1162        df = self.df
1163
1164        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1165        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1166        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1167
1168        all_combinations = pd.DataFrame(
1169            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1170            columns=['date', 'callsign']
1171        ).dropna()
1172
1173        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1174        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
1175
1176        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1177        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
def historical_care_cat_counts(self):
1179    def historical_care_cat_counts(self):
1180        """
1181        Process historical incident data to categorize care types and compute hourly counts.
1182
1183        This method performs the following steps:
1184        - Converts incident dates to datetime format.
1185        - Extracts month start and hour from the incident date.
1186        - Categorizes each incident into care categories based on benefit flags and attendance.
1187        - Counts the number of incidents by hour and care category.
1188        - Outputs these counts to a CSV file.
1189        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1190        (excluding those not attended by a DAA resource) to a text file.
1191
1192        Outputs:
1193        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1194        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1195        """
1196
1197        df_historical = self.df
1198
1199        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1200        # Extract the first day of the month and the hour of each incident
1201        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1202        df_historical['hour'] = df_historical.inc_date.dt.hour
1203
1204        conditions = [
1205            df_historical['cc_benefit'] == 'y',
1206            df_historical['ec_benefit'] == 'y',
1207            df_historical['helicopter_benefit'] == 'y',
1208            df_historical['callsign_group'] == 'Other'
1209        ]
1210
1211        choices = [
1212            'CC',
1213            'EC',
1214            'REG - helicopter benefit',
1215            'Unknown - DAA resource did not attend'
1216        ]
1217        # Assign care category to each record
1218        # If the case did not meet any of the criteria in 'conditions', it will default
1219        # to being labelled as a 'regular/REG' case (i.e there was no benefit recorded)
1220        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1221
1222        # Count occurrences grouped by hour and care category
1223        historical_value_counts_by_hour = (
1224            df_historical.value_counts(["hour", "care_category"])
1225            .reset_index(name="count")
1226            )
1227        # Output to CSV for use in tests and visualisations
1228        (historical_value_counts_by_hour
1229         .sort_values(['hour', 'care_category'])
1230         .to_csv("historical_data/historical_care_cat_counts.csv"))
1231
1232        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1233        # These are the regular jobs we will make an assumption follow different logic due to having an obvious expected
1234        # patient benefit of having a helicopter allocated to them that we will have to assume is apparent at the time
1235        # of the call being placed (such as the casualty being located in a remote location, or )
1236
1237        numerator = (historical_value_counts_by_hour[
1238            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1239            ]["count"].sum())
1240
1241        denominator = (historical_value_counts_by_hour[
1242            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1243            (historical_value_counts_by_hour["care_category"] == "REG")
1244            ]["count"].sum())
1245
1246        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1247            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1248
1249         # Count occurrences grouped by hour and care category
1250        historical_value_counts_by_hour_cc_ec = (
1251            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1252            .reset_index(name="count")
1253            )
1254
1255       # Output to CSV for use in tests and visualisations
1256        # (historical_value_counts_by_hour_cc_ec
1257        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1258        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1259
1260
1261        numerator_cc = (
1262            historical_value_counts_by_hour_cc_ec[
1263                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1264                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1265                ]["count"].sum())
1266
1267        denominator_cc = (
1268            historical_value_counts_by_hour_cc_ec[
1269                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1270                ]["count"].sum())
1271
1272        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1273            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1274
1275        numerator_ec = (
1276            historical_value_counts_by_hour_cc_ec[
1277                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1278                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1279                ]["count"].sum())
1280
1281        denominator_ec = (
1282            historical_value_counts_by_hour_cc_ec[
1283                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1284                ]["count"].sum())
1285
1286        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1287            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))

Process historical incident data to categorize care types and compute hourly counts.

This method performs the following steps:

  • Converts incident dates to datetime format.
  • Extracts month start and hour from the incident date.
  • Categorizes each incident into care categories based on benefit flags and attendance.
  • Counts the number of incidents by hour and care category.
  • Outputs these counts to a CSV file.
  • Computes and writes the proportion of regular care jobs with a helicopter benefit (excluding those not attended by a DAA resource) to a text file.

Outputs:

  • CSV file: 'historical_data/historical_care_cat_counts.csv'
  • Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
def historical_monthly_totals(self):
1291    def historical_monthly_totals(self):
1292        """
1293            Calculates monthly incident totals from provided dataset of historical data
1294        """
1295
1296        # Multiple resources can be sent to the same job.
1297        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1298            .drop_duplicates(subset="inc_date", keep="first")
1299
1300        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
1301        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1302        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1303
1304        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1305                                stand_down_car=('stand_down_car', 'sum'),
1306                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1307                                total_jobs=('vehicle_type', 'size')
1308                            ).reset_index()
1309
1310        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data

def historical_monthly_totals_all_calls(self):
1312    def historical_monthly_totals_all_calls(self):
1313        """
1314            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1315        """
1316
1317        # Multiple resources can be sent to the same job.
1318        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1319
1320        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1321
1322        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)

Calculates monthly totals of all calls from the provided dataset of historical data

def historical_monthly_totals_by_callsign(self):
1324    def historical_monthly_totals_by_callsign(self):
1325        """
1326            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1327        """
1328
1329        # Multiple resources can be sent to the same job.
1330        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1331
1332        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1333
1334        #print(monthly_totals_df.head())
1335
1336        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1337
1338        #print(monthly_totals_pivot_df.head())
1339
1340        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data stratified by callsign

def historical_monthly_totals_by_hour_of_day(self):
1342    def historical_monthly_totals_by_hour_of_day(self):
1343        """
1344            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1345        """
1346
1347        # Multiple resources can be sent to the same job.
1348        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1349            .drop_duplicates(subset="inc_date", keep="first")
1350
1351        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1352
1353        #print(monthly_totals_df.head())
1354
1355        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1356
1357        #print(monthly_totals_pivot_df.head())
1358
1359        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day

def historical_monthly_totals_by_day_of_week(self):
1361    def historical_monthly_totals_by_day_of_week(self):
1362        """
1363            Calculates number of incidents per month stratified by day of the week
1364        """
1365
1366        # Multiple resources can be sent to the same job.
1367        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1368            .drop_duplicates(subset="inc_date", keep="first")
1369
1370        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1371
1372        #print(monthly_totals_df.head())
1373
1374        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1375
1376        #print(monthly_totals_pivot_df.head())
1377
1378        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)

Calculates number of incidents per month stratified by day of the week

def historical_median_time_of_activities_by_month_and_resource_type(self):
1380    def historical_median_time_of_activities_by_month_and_resource_type(self):
1381        """
1382            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1383        """
1384
1385        median_df = self.df[['first_day_of_month', 'time_allocation',
1386                             'time_mobile', 'time_to_scene', 'time_on_scene',
1387                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1388
1389        median_df['total_job_time'] = median_df[[
1390            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1391            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1392
1393        # Replacing zeros with NaN to exclude from median calculation
1394        # since if an HEMS result is Stood down en route, then time_on_scene would be zero and affect the median
1395        # median_df.replace(0, np.nan, inplace=True)
1396
1397        # Grouping by month and resource_type, calculating medians
1398        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1399
1400        pivot_data = median_times.pivot_table(
1401            index='first_day_of_month',
1402            columns='vehicle_type',
1403            values=['time_allocation', 'time_mobile', 'time_to_scene',
1404                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1405        )
1406
1407        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
1408        pivot_data = pivot_data.reset_index()
1409
1410        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
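
The pivot_table call above yields a two-level column index of (measure, vehicle_type); the list comprehension flattens it into names such as median_<vehicle_type>_<measure>. A minimal sketch of that flattening step, using hypothetical vehicle types:

import pandas as pd

demo = pd.DataFrame({
    'first_day_of_month': ['2023-01-01', '2023-01-01'],
    'vehicle_type': ['car', 'helicopter'],   # hypothetical categories
    'time_on_scene': [28.0, 35.0],
})
pivoted = demo.pivot_table(index='first_day_of_month',
                           columns='vehicle_type',
                           values=['time_on_scene'])
# Columns are tuples such as ('time_on_scene', 'car'); flatten them as the method does:
pivoted.columns = [f"median_{col[1]}_{col[0]}" for col in pivoted.columns]
# -> ['median_car_time_on_scene', 'median_helicopter_time_on_scene']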

1412    def historical_monthly_resource_utilisation(self):
1413        """
1414            Calculates number of, and time spent on, incidents per month stratified by callsign
1415        """
1416
1417        # Multiple resources can be sent to the same job.
1418        monthly_df = self.df[[
1419            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1420            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1421            'time_to_clear']].copy()
1422
1423        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1424
1425        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1426            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1427
1428        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1429
1430        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in monthly_totals_pivot_df.columns]
1431        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1432
1433        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)

1435    def historical_daily_calls_breakdown(self):
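        """
            Calculates the number of calls received on each day of the historical dataset
            and summarises how many days saw each daily call total, saving both to CSV
        """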
1436
1437        df = self.df
1438        # Convert inc_date to date only (remove time)
1439        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1440
1441        # Count number of calls per day
1442        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1443
1444        # Save the daily call counts with a 'day' index column
1445        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1446        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1447        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1448
1449        # Count how many days had the same number of calls
1450        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1451        calls_per_day_summary.columns = ['calls_in_day', 'days']
1452        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1454    def historical_missed_jobs(self):
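        """
            Counts incidents where a HEMS resource (helicopter or car) was available and sent
            versus those where no HEMS resource was available, stratified by month, by hour
            of day, and by quarter and hour, saving each breakdown to CSV
        """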
1455        df = self.df
1456        df["date"] = pd.to_datetime(df["inc_date"])
1457        df["hour"] = df["date"].dt.hour
1458        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1459        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1460        df["quarter"] = df["date"].dt.quarter
1461
1462        # By month
1463        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1464        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1465
1466        # By hour
1467        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1468        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1469
1470        # By quarter and hour
1471        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1472        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1474    def upper_allowable_time_bounds(self):
1475        """
1476            Calculates the maximum permissible time for each phase of an incident based on supplied historical data.
1477            This is currently set to 1.5x the upper quartile of the data distribution
1478        """
1479
1480        median_df = self.df[[
1481            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1482            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1483
1484        # Replacing zeros with NaN to exclude from median calculation
1485        # since if an HEMS result is Stood down en route, then time_on_scene
1486        # would be zero and affect the median
1487        median_df.replace(0, np.nan, inplace=True)
1488
1489        print(median_df.quantile(.75, numeric_only=True))
1490        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
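
The write-out above is still commented out and only the upper quartiles are printed; the bound described in the docstring could be derived from them along these lines (a sketch only, with a hypothetical output path, not part of the current implementation):

upper_bounds = median_df.quantile(.75, numeric_only=True) * 1.5
# e.g. upper_bounds['time_on_scene'] would be the maximum allowable on-scene time
# upper_bounds.to_frame('max_time').to_csv('distribution_data/upper_allowable_time_bounds.csv')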

1492    def historical_job_durations_breakdown(self):
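        """
            Produces a long-format breakdown of the time spent in each job cycle phase,
            plus the total job duration, per incident and stratified by callsign and vehicle type
        """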
1493
1494        df = self.df
1495
1496        cols = [
1497            'callsign', 'vehicle_type',
1498            'time_allocation', 'time_mobile',
1499            'time_to_scene', 'time_on_scene',
1500            'time_to_hospital', 'time_to_clear'
1501        ]
1502        df2 = df[cols].copy()
1503
1504        # 2. Add a 1-based row identifier
1505        df2['job_identifier'] = range(1, len(df2) + 1)
1506
1507        # 3. Compute total_duration as the row-wise sum of the time columns
1508        time_cols = [
1509            'time_allocation', 'time_mobile',
1510            'time_to_scene', 'time_on_scene',
1511            'time_to_hospital', 'time_to_clear'
1512        ]
1513        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1514
1515        #print(df2.head())
1516
1517        # 4. Pivot (melt) to long format
1518        df_long = df2.melt(
1519            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1520            value_vars=time_cols + ['total_duration'],
1521            var_name='name',
1522            value_name='value'
1523        )
1524
1525        #print(df_long[df_long.job_identifier == 1])
1526
1527        # 5. Drop any rows where callsign or vehicle_type is missing
1528        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1529        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1530
1531        # 6. Write out to CSV
1532        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
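
For a single job, the melt step turns the six phase columns plus total_duration into one row per measure while keeping the identifying columns. A minimal sketch with hypothetical values:

import pandas as pd

one_job = pd.DataFrame({
    'job_identifier': [1], 'callsign': ['H70'], 'vehicle_type': ['helicopter'],  # hypothetical values
    'time_allocation': [3.0], 'time_mobile': [5.0], 'total_duration': [8.0],
})
long_form = one_job.melt(id_vars=['job_identifier', 'callsign', 'vehicle_type'],
                         value_vars=['time_allocation', 'time_mobile', 'total_duration'],
                         var_name='name', value_name='value')
# long_form has three rows for job 1, one per measure, each retaining callsign and vehicle_type
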
1883    def ad_hoc_unavailability(self, period_start, period_end):
1884        """
1885        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1886
1887        Args:
1888            period_start (str/datetime): Start date for analysis period
1889            period_end (str/datetime): End date for analysis period
1891
1892        Returns:
1893            None (saves results to CSV file)
1894        """
1895        # Load and prepare ad-hoc downtime data
1896        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1897        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1898        # Load rota and callsign lookup data
1899        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1900        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1901        # Merge rota with callsign lookup to get registration numbers to allow matching with
1902        # ad-hoc data, which uses registrations
1903        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1904
1905        # Define the hour bands mapping
1906        HOUR_BANDS = {
1907            '00-05': (0, 6),   # 00:00 to 05:59
1908            '06-11': (6, 12),  # 06:00 to 11:59
1909            '12-17': (12, 18), # 12:00 to 17:59
1910            '18-23': (18, 24)  # 18:00 to 23:59
1911        }
1912
1913        # Create list of 6-hour bins
1914        bins = ['00-05', '06-11', '12-17', '18-23']
1915
1916        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1917            """
1918            Determine if a date falls in summer months (April-September).
1919
1920            Args:
1921                date_obj: Date object to check
1922
1923            Returns:
1924                bool: True if date is in summer months
1925            """
1926            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1927
1928        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1929            """
1930            Determine whether a given month falls in the summer months (April-September).
1931
1932            Args:
1933                month: Integer month
1934
1935            Returns:
1936                str: 'summer' if the month is in summer months, 'winter' otherwise
1937            """
1938            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1939
1940        def get_band_for_hour(hour):
1941            """
1942            Return the 6-hour band name for a given hour (0-23).
1943
1944            Args:
1945                hour (int): Hour of day (0-23)
1946
1947            Returns:
1948                str or None: Band name or None if hour is invalid
1949            """
1950            for band_name, (start, end) in HOUR_BANDS.items():
1951                if start <= hour < end:
1952                    return band_name
1953            return None
1954
1955        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1956            """
1957            Calculate how many minutes of a shift fall within a specific time band.
1958
1959            Args:
1960                shift_start (float): Shift start time in hours (0-23.99)
1961                shift_end (float): Shift end time in hours (0-23.99)
1962                band_start (int): Band start hour
1963                band_end (int): Band end hour
1964
1965            Returns:
1966                int: Minutes of overlap between shift and band
1967            """
1968            # Find the overlap between shift and band
1969            overlap_start = max(shift_start, band_start)
1970            overlap_end = min(shift_end, band_end)
1971
1972            if overlap_start < overlap_end:
1973                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1974            return 0
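        # Worked example (illustrative): a shift running 07:00-19:00 checked against the
        # 06-11 band (band_start=6, band_end=12) overlaps from 07:00 to 12:00, so
        # calculate_minutes_in_band(7, 19, 6, 12) == int((12 - 7) * 60) == 300 minutes.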
1975
1976        # Create date range for analysis period
1977        date_range = pd.date_range(start=period_start,
1978                            end=period_end,
1979                            freq='D')
1980        daily_df = pd.DataFrame({'date': date_range})
1981        daily_df['date'] = pd.to_datetime(daily_df['date'])
1982
1983        # Create MultiIndex from date and bins combinations to get all date/time combinations
1984        multi_index = pd.MultiIndex.from_product(
1985            [daily_df['date'], bins],
1986            names=['date', 'six_hour_bin']
1987            )
1988
1989        # Convert MultiIndex to DataFrame and reset index
1990        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1991        # Add quarter information for seasonal analysis
1992        daily_df["quarter"] = daily_df.date.dt.quarter
1993
1994        # Initialize availability columns for each aircraft registration
1995        # Each column will store minutes of scheduled time per time band
1996        for registration in full_rota_df['registration'].unique():
1997            daily_df[registration] = 0 # Initialize with 0 minutes
1998
1999        # Calculate scheduled availability for each date/time band combination
2000        for _, row in daily_df.iterrows():
2001            current_date = row['date']
2002            current_band = row['six_hour_bin']
2003            band_start, band_end = HOUR_BANDS[current_band]
2004
2005            is_current_date_summer = is_summer(current_date)
2006
2007            # Get the row index for updating the dataframe
2008            row_idx = daily_df[(daily_df['date'] == current_date) &
2009                            (daily_df['six_hour_bin'] == current_band)].index[0]
2010
2011            # Iterate through each resource's rota entry
2012            for _, rota_entry in full_rota_df.iterrows():
2013                registration = rota_entry['registration']
2014                # Select appropriate start/end times based on season
2015                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2016                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2017                start_hour = rota_entry[start_hour_col]
2018                end_hour = rota_entry[end_hour_col]
2019
2020                total_minutes_for_band = 0
2021
2022                if start_hour < end_hour:
2023                    # Shift within same day
2024                    total_minutes_for_band = calculate_minutes_in_band(
2025                        start_hour, end_hour, band_start, band_end
2026                    )
2027                elif start_hour > end_hour:
2028                    # Shift spans midnight - check both parts
2029
2030                    # Part 1: Today from start_hour to midnight
2031                    if band_end <= 24:  # This band is today
2032                        total_minutes_for_band += calculate_minutes_in_band(
2033                            start_hour, 24, band_start, band_end
2034                        )
2035
2036                    # Part 2: Tomorrow from midnight to end_hour
2037                    # Need to check if this band is for tomorrow
2038                    tomorrow = current_date + pd.Timedelta(days=1)
2039                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2040
2041                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2042                        total_minutes_for_band += calculate_minutes_in_band(
2043                            0, end_hour, band_start, band_end
2044                        )
2045
2046                # Update the scheduled time for this aircraft in this time band
2047                daily_df.loc[row_idx, registration] += total_minutes_for_band
2048
2049        # Aggregate scheduled availability by quarter, time band, and registration
2050        available_time = (
2051            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2052            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2053            .sum().reset_index()
2054            )
2055
2056        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2057            """
2058            Calculate downtime overlap with scheduled rota for a single downtime event.
2059
2060            Args:
2061                row: DataFrame row containing downtime information
2062                rota_df: DataFrame with rota schedules
2063                callsign_lookup_df: DataFrame mapping callsigns to registrations
2064
2065            Returns:
2066                dict: Dictionary containing processed availability data
2067            """
2068            # Extract downtime information
2069            registration = row['aircraft'].lower()
2070            downtime_start = pd.to_datetime(row['offline'], utc=True)
2071            downtime_end = pd.to_datetime(row['online'], utc=True)
2072            reason = row.get('reason', None)
2073
2074            # Determine which 6-hour bin this downtime starts in
2075            hour = downtime_start.hour
2076            if 0 <= hour <= 5:
2077                six_hour_bin = '00-05'
2078            elif 6 <= hour <= 11:
2079                six_hour_bin = '06-11'
2080            elif 12 <= hour <= 17:
2081                six_hour_bin = '12-17'
2082            else:
2083                six_hour_bin = '18-23'
2084
2085            quarter = downtime_start.quarter
2086
2087            # Find the callsign for this registration
2088            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2089
2090            # No matching callsign found
2091            if match.empty:
2092                return {
2093                    'registration': registration,
2094                    'offline': downtime_start,
2095                    'online': downtime_end,
2096                    'six_hour_bin': six_hour_bin,
2097                    'quarter': quarter,
2098                    'total_offline': None,
2099                    'reason': reason
2100                }
2101
2102            callsign = match.iloc[0]['callsign']
2103            # Find rota entries for this callsign
2104            rota_rows = rota_df[rota_df['callsign'] == callsign]
2105            if rota_rows.empty:
2106                return {
2107                    'registration': registration,
2108                    'offline': downtime_start,
2109                    'online': downtime_end,
2110                    'six_hour_bin': six_hour_bin,
2111                    'quarter': quarter,
2112                    'total_offline': None,
2113                    'reason': reason
2114                }
2115
2116            # Determine season for appropriate rota times
2117            month = downtime_start.month
2118            season = check_month_is_summer(month)
2119
2120            total_overlap_minutes = 0
2121
2122            # Calculate overlap between downtime and scheduled rota times
2123            for _, rota in rota_rows.iterrows():
2124                start_hour = rota[f'{season}_start']
2125                end_hour = rota[f'{season}_end']
2126
2127                # Check overlap across multiple days (yesterday, today, tomorrow)
2128                # This handles shifts that span midnight
2129                for base_day in [downtime_start.normalize() - timedelta(days=1),
2130                                downtime_start.normalize(),
2131                                downtime_start.normalize() + timedelta(days=1)]:
2132
2133                    rota_start = base_day + timedelta(hours=start_hour)
2134                    rota_end = base_day + timedelta(hours=end_hour)
2135
2136                    # Handle shifts that cross midnight
2137                    if end_hour <= start_hour:
2138                        rota_end += timedelta(days=1)
2139                    # Calculate overlap between downtime and this rota period
2140                    overlap_start = max(downtime_start, rota_start)
2141                    overlap_end = min(downtime_end, rota_end)
2142
2143                    if overlap_end > overlap_start:
2144                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2145
2146                        total_overlap_minutes += overlap_minutes
2147
2148            return {
2149                'registration': registration,
2150                'offline': downtime_start,
2151                'online': downtime_end,
2152                'six_hour_bin': six_hour_bin,
2153                'quarter': quarter,
2154                'total_offline': total_overlap_minutes,
2155                'reason': reason
2156            }
2157
2158        # Process all ad-hoc downtime events
2159        results = adhoc_df.apply(
2160                        lambda row: calculate_availability_row(
2161                            row, rota_df, callsign_lookup_df),
2162                        axis=1
2163                    )
2164
2165        # Convert results to DataFrame and select relevant columns
2166        unavailability_minutes_df = (
2167            pd.DataFrame(results.tolist())
2168            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2169            )
2170
2171        # Aggregate offline time by registration, time band, quarter, and reason
2172        offline_durations = unavailability_minutes_df.groupby(
2173            ["registration", "six_hour_bin", "quarter", "reason"]
2174            )[["total_offline"]].sum().reset_index()
2175
2176        # Create complete combinations to ensure all possible categories are represented
2177        # This prevents missing data issues in the final output
2178        registrations = offline_durations['registration'].unique()
2179        six_hour_bins = offline_durations['six_hour_bin'].unique()
2180        quarters = offline_durations['quarter'].unique()
2181        reasons = offline_durations['reason'].unique()
2182
2183        # Generate all possible combinations
2184        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2185
2186        # Create complete dataframe with all combinations
2187        complete_df = pd.DataFrame(
2188            all_combinations,
2189            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2190            )
2191
2192        # Merge with original data to preserve existing values, fill missing with 0
2193        offline_durations = complete_df.merge(
2194            offline_durations,
2195            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2196            how='left'
2197            )
2198
2199        # Fill NaN values with 0 (no downtime for those combinations)
2200        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2201
2202        # Sort for better readability
2203        offline_durations = offline_durations.sort_values(
2204            ['registration', 'six_hour_bin', 'quarter', 'reason']
2205            ).reset_index(drop=True)
2206
2207        # Ensure consistent case for registration names
2208        available_time["registration"] = available_time["registration"].str.lower()
2209        offline_durations["registration"] = offline_durations["registration"].str.lower()
2210
2211        # Merge scheduled time with downtime data
2212        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2213        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2214
2215        # Calculate availability probability (1 - sum of all unavailability probabilities)
2216        available_prop_df = ad_hoc.groupby(
2217            ["quarter", "six_hour_bin", "registration", "rota_time"]
2218            )[["probability"]].sum().reset_index()
2219
2220        available_prop_df["reason"] = "available"
2221        available_prop_df["probability"] = 1 - available_prop_df["probability"]
2222
2223        # Combine unavailability and availability data
2224        final_ad_hoc_df = (
2225            pd.concat([ad_hoc, available_prop_df])
2226            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2227            )
2228
2229        # Handle cases where there's no scheduled time (set probability to NaN)
2230        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2231            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2232            axis=1
2233            )
2234
2235        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2236
2237        # Save results to CSV
2238        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2239            "distribution_data/ad_hoc_unavailability.csv", index=False
2240            )
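
The saved file holds, for each registration, quarter and six-hour band, the probability of each unavailability reason plus an 'available' row. A downstream sampler could draw a state from it along these lines (a sketch, assuming the column names written above and a hypothetical registration):

import numpy as np
import pandas as pd

probs = pd.read_csv("distribution_data/ad_hoc_unavailability.csv")
subset = probs[(probs["registration"] == "g-abcd") &
               (probs["quarter"] == 1) &
               (probs["six_hour_bin"] == "06-11")].dropna(subset=["probability"])
weights = subset["probability"] / subset["probability"].sum()   # re-normalise after rounding
rng = np.random.default_rng()
state = rng.choice(subset["reason"].to_numpy(), p=weights.to_numpy())  # 'available' or a downtime reason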

2243    def run_sim_on_historical_params(self):
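        """
            Resets the rota, callsign and servicing inputs to their historical defaults, runs
            the simulation over a comparable period, and saves summary outputs for comparison
            against the historical data
        """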
2244        # Ensure all rotas are using default values
2245        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2246        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2247
2248        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2249        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2250
2251        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2252        service_history.to_csv("actual_data/service_history.csv", index=False)
2253
2254        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2255        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2256
2257        print("Generating simulation results...")
2258        removeExistingResults()
2259
2260        total_runs = 30
2261        sim_years = 2
2262        sim_duration = 60 * 24 * 7 * 52 * sim_years
2263
2264        parallelProcessJoblib(
2265            total_runs=total_runs,
2266            sim_duration=sim_duration,
2267            warm_up_time=0,
2268            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2269            amb_data=False,
2270            print_debug_messages=True
2271        )
2272
2273        collateRunResults()
2274
2275        try:
2276            results_all_runs = pd.read_csv("data/run_results.csv")
2277            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2278
2279            # save data of counts of suboptimal care category sent
2280            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2281
2282            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2283
2284            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2285
2286            # save data of counts of suboptimal vehicle type sent
2287            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2288
2289            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2290
2291            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2292
2293
2294            # # Also run the model to get some base-case outputs
2295            # resource_requests = (
2296            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2297            #     .copy()
2298            #     )
2299
2300            # resource_requests["care_cat"] = (
2301            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2302            #                             and x["care_cat"]=="REG" else x["care_cat"],
2303            #                             axis=1)
2304            #                             )
2305
2306            # missed_jobs_care_cat_summary = (
2307            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2308            #     .sort_values(["care_cat", "time_type"])
2309            #     .copy()
2310            #     )
2311
2312            # missed_jobs_care_cat_summary["jobs_average"] = (
2313            #     missed_jobs_care_cat_summary["jobs"]/
2314            #     total_runs
2315            #     )
2316
2317            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2318            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2319            #     ).round(0)
2320
2321            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2322                    results_all_runs=results_all_runs,
2323                    run_length_days = float(sim_years*365),
2324                    what="summary"
2325                    )
2326
2327            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2328
2329
2330            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2331                    results_all_runs=results_all_runs,
2332                    run_length_days = float(sim_years*365),
2333                    what="breakdown"
2334                    )
2335
2336            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2337
2338        except FileNotFoundError:
2339            pass