Skip to main content

Simulate

import random
from datetime import datetime, timedelta
import polars as pl
import time as time
# Chief-complaint lookup table. simulate_data() samples one row per event and
# reads these columns from it: mpds_code, mpds_name, impression_code,
# impression_name, spo2, bp, temp, pain, bgl, gcs.
df_ccs = pl.read_csv("data-core/cc_builder.csv")
# Module-level lists for generated rows and event ids.
events = list()
event_ids = list()

# Event ids are produced by repeatedly adding a random step to this counter.
initial_event = 100_000

# Alphabet used for the letter suffix of patient ids.
LETTERS = 'abcdefghijklmnopqrstuvwxyz'

# Pain distribution: baseline score and its sampling weight.
pain_range = list(range(3))
pain_weights = [0.8, 0.1, 0.1]

# Age distribution: lower bound of each ten-year bucket and its weight.
age_range = list(range(0, 101, 10))
age_weights = [
    0.04, 0.03, 0.08, 0.08, 0.08, 0.1,
    0.12, 0.16, 0.18, 0.12, 0.01,
]

# SpO2 distribution: baseline saturation value and its weight.
spo2_range = list(range(90, 101))
spo2_weights = [
    0.02, 0.02, 0.02, 0.02, 0.02, 0.05,
    0.05, 0.1, 0.2, 0.3, 0.2,
]
def _signed_offset(bound):
    """Return a random integer between 0 and ``bound`` inclusive, for either sign of ``bound``."""
    if bound >= 0:
        return random.randint(0, bound)
    return random.randint(bound, 0)


def simulate_data(rows, path, file):
    """Simulate EMS event data and write it to ``{path}/{file}.csv``.

    Each simulated event samples one row of the module-level ``df_ccs`` table
    and uses its ``spo2``, ``bp``, ``temp``, ``pain``, ``bgl`` and ``gcs``
    values as bounds for random adjustments to the generated vitals.

    Args:
        rows: Number of events to generate.
        path: Directory the CSV is written into (not created here).
        file: Output file name without the ``.csv`` extension.

    Returns:
        The polars DataFrame that was written, sorted by ``dtime_event`` with
        an ``event_id`` column prepended.

    Side effects:
        Advances the module-level ``initial_event`` counter, so event ids keep
        increasing across calls. Rows and ids are collected in local lists, so
        repeated calls do not leak earlier rows into later files.

    Note:
        ``random.randint(a, b)`` raises ``ValueError`` when ``a > b``, so the
        ``spo2`` and ``gcs`` bounds must be <= 0 and the ``bp`` and ``pain``
        bounds must be >= 0. ``temp`` and ``bgl`` may have either sign.
        The systolic "last" column is named ``bp_sys_last`` (previously
        misspelled ``sp_sys_last``).
    """
    global initial_event

    columns = [
        'dtime_event', 'dtime_page', 'dtime_arrived', 'dtime_transport', 'dtime_dest',
        'pt_id', 'mpds_code', 'mpds_name', 'impression_code', 'impression_name', 'license',
        'age', 'spo2_first', 'spo2_last', 'bp_sys_first', 'bp_sys_last',
        'bp_dia_first', 'bp_dia_last', 'temp_first', 'temp_last',
        'pain_first', 'pain_last',
        'bgl_first', 'bgl_last', 'gcs_first', 'gcs_last',
    ]

    # Call times are drawn uniformly from this window; the bounds never change,
    # so parse them once instead of on every iteration.
    window_start = int(datetime.strptime('1/1/2023', '%m/%d/%Y').timestamp())
    window_end = int(datetime.strptime('06/1/2023', '%m/%d/%Y').timestamp())

    events = []
    for _ in range(rows):
        # Sample one chief-complaint row and unwrap each one-element column
        # into a plain Python scalar.
        sampled = df_ccs.sample().to_dict()
        cc = {name: column.item() for name, column in sampled.items()}

        # Timeline: call -> page -> arrived -> transport -> destination,
        # each step a random number of minutes after the previous one.
        dtime_event = datetime.fromtimestamp(random.uniform(window_start, window_end))
        dtime_page = dtime_event + timedelta(minutes=random.randint(2, 10))
        dtime_arrived = dtime_page + timedelta(minutes=random.randint(2, 8))
        dtime_transport = dtime_arrived + timedelta(minutes=random.randint(3, 12))
        dtime_dest = dtime_transport + timedelta(minutes=random.randint(4, 10))

        # Patient id: 'id-' + 7 random digits + 3 distinct random letters.
        random_letters = ''.join(random.sample(LETTERS, k=3))
        random_numbers = ''.join(str(random.randint(0, 9)) for _ in range(7))
        pt_id = f'id-{random_numbers}{random_letters}'

        # Dispatch and impression codes come straight from the sampled row.
        mpds_code = cc['mpds_code']
        mpds_name = cc['mpds_name']
        impression_code = cc['impression_code']
        impression_name = cc['impression_name']

        lic_lvl = random.choice(['EMR', 'PCP', 'ACP'])

        # Age: weighted ten-year bucket plus a uniform 0-9 offset inside it.
        age = random.choices(age_range, age_weights)[0] + random.randint(0, 9)

        # Vitals: a "first" reading, then a "last" reading drifting from it.

        # SpO2: weighted baseline lowered by up to |cc['spo2']|; last capped at 100.
        spo2_first = random.choices(spo2_range, spo2_weights)[0] + random.randint(cc['spo2'], 0)
        spo2_last = min(spo2_first + random.randint(-5, 5), 100)

        # Systolic BP: baseline shifted symmetrically by up to cc['bp'].
        bp_sys_first = random.randint(90, 180) + random.randint(-1 * cc['bp'], cc['bp'])
        bp_sys_last = bp_sys_first + random.randint(-10, 10)

        # Diastolic BP: systolic minus a pulse pressure of 40, +/- 10 of variability.
        bp_dia_first = bp_sys_first - 40 + random.randint(-10, 10)
        bp_dia_last = bp_sys_last - 40 + random.randint(-10, 10)

        # Temperature: baseline 36-38 shifted toward cc['temp'] (either sign).
        temp_first = round(random.uniform(36, 38), 1) + _signed_offset(cc['temp'])
        temp_last = round(temp_first + random.uniform(-0.3, 0.3), 1)

        # Pain: weighted baseline raised by up to cc['pain']; last clamped to 0-10.
        pain_first = random.choices(pain_range, pain_weights)[0] + random.randint(0, cc['pain'])
        pain_last = min(max(pain_first + random.randint(-2, 2), 0), 10)

        # Blood glucose: baseline 4.5-7 shifted toward cc['bgl'] (either sign).
        bgl_first = round(random.uniform(4.5, 7) + _signed_offset(cc['bgl']), 1)
        bgl_last = round(bgl_first + random.uniform(-0.5, 0.5), 1)

        # GCS: baseline 14-15 lowered by up to |cc['gcs']|; last clamped to 3-15.
        gcs_first = random.randint(14, 15) + random.randint(cc['gcs'], 0)
        gcs_last = min(max(gcs_first + random.randint(-3, 3), 3), 15)

        events.append([
            dtime_event, dtime_page, dtime_arrived, dtime_transport, dtime_dest,
            pt_id, mpds_code, mpds_name, impression_code, impression_name, lic_lvl,
            age, spo2_first, spo2_last, bp_sys_first, bp_sys_last,
            bp_dia_first, bp_dia_last, temp_first, temp_last,
            pain_first, pain_last,
            bgl_first, bgl_last, gcs_first, gcs_last,
        ])

    # Each inner list is one record, so state the orientation explicitly
    # instead of relying on polars' inference.
    df = pl.DataFrame(data=events, schema=columns, orient="row")

    # Strictly increasing event ids, continuing from the module-level counter.
    event_ids = []
    for _ in range(rows):
        initial_event += random.randint(1, 1000)
        event_ids.append(initial_event)

    # Sort chronologically first so the increasing ids follow call time.
    df = df.sort(["dtime_event"], descending=False)
    df = df.select([
        pl.Series(name="event_id", values=event_ids),
        pl.all(),
    ])

    # Write the result to csv.
    df.write_csv(f'{path}/{file}.csv')
    return df
# Time a 1000-row simulation written to ./data/events.csv.
# perf_counter() is a monotonic high-resolution clock, so the measured
# duration is not affected by system clock adjustments the way time.time() is.
start_time = time.perf_counter()

simulate_data(1000, './data', 'events')

# Report the elapsed time.
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
Execution time: 0.3063468933105469 seconds