# chat1

from bisect import bisect_right
from datetime import timedelta

import polars as pl

def _merge_offline_periods(starts, window):
    """Merge the windows ``[start, start + window)`` into disjoint periods.

    Args:
        starts: Window start times in ascending order.
        window: Length of each window (a ``timedelta``).

    Returns:
        A list of ``(start, end)`` tuples, sorted and non-overlapping.
    """
    merged = []
    for start in starts:
        end = start + window
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous period: extend it instead
            # of opening a new one.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def flag_offline_periods(df: "pl.DataFrame", *, threshold: int = 20,
                         offline_seconds: int = 30) -> "pl.DataFrame":
    """Add a ``status`` column marking each row as ``OFFLINE`` or ``ONLINE``.

    A bucket is "high traffic" when at least ``threshold`` rows share the same
    ``timestamp_bucket`` value. Every high-traffic bucket opens an offline
    period ``[bucket, bucket + offline_seconds)``; overlapping periods are
    merged. Rows whose ``timestamp_bucket`` falls inside any period are
    flagged ``OFFLINE``, all others (including null buckets) ``ONLINE``.

    Args:
        df: Frame with a ``timestamp_bucket`` column. The values are expected
            to be datetimes (anything that supports ``+ timedelta`` and
            ordering).
        threshold: Minimum number of rows in one bucket that triggers an
            offline period.
        offline_seconds: Length of each offline period in seconds.

    Returns:
        ``df`` with an added string column ``status``.
    """
    # Count entries per bucket.
    counts = df.group_by("timestamp_bucket").agg(pl.len().alias("count"))

    # Buckets with at least `threshold` entries, as sorted Python datetimes.
    # Nulls are dropped because they cannot start a period.
    hot_starts = (
        counts.filter(pl.col("count") >= threshold)
        .get_column("timestamp_bucket")
        .drop_nulls()
        .sort()
        .to_list()
    )

    # Use a plain timedelta here: `pl.duration(...)` is a lazy expression, so
    # adding it to a Python datetime would produce an Expr that cannot be
    # compared or sorted in Python code.
    periods = _merge_offline_periods(
        hot_starts, timedelta(seconds=offline_seconds)
    )
    if not periods:
        return df.with_columns(pl.lit("ONLINE").alias("status"))

    period_starts = [start for start, _ in periods]

    def is_offline(timestamp):
        # Periods are disjoint and sorted, so only the last period starting
        # at or before `timestamp` can contain it.
        idx = bisect_right(period_starts, timestamp) - 1
        return idx >= 0 and timestamp < periods[idx][1]

    # Apply the offline check to each row. Null buckets are skipped by
    # map_elements and therefore fall through to "ONLINE".
    return df.with_columns(
        pl.when(
            pl.col("timestamp_bucket").map_elements(
                is_offline, return_dtype=pl.Boolean
            )
        )
        .then(pl.lit("OFFLINE"))
        .otherwise(pl.lit("ONLINE"))
        .alias("status")
    )

# Usage: assumes a DataFrame named `df` already exists and has a
# `timestamp_bucket` column.
df = flag_offline_periods(df)