# chat1

import polars as pl

# Example setup: adjust according to your DataFrame.
# The raw timestamp strings are parsed into datetimes and truncated to whole
# seconds, so every event within the same second shares one `timestamp_bucket`.
df = pl.DataFrame(
    {
        "timestamp": [
            "2024-04-19 07:08:13.959",
            "2024-04-19 07:08:13.970",
            "2024-04-19 07:08:17.823",
            # Add more timestamps as needed
        ]
    }
).with_columns(
    pl.col("timestamp")
    .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S.%f")
    .dt.truncate("1s")
    .alias("timestamp_bucket")
)

# Step 1: Count entries per second.
# `group_by` / `pl.len()` are the current polars spellings of the older
# `groupby` / `pl.count()`.
count_per_second = df.group_by("timestamp_bucket").agg(pl.len().alias("count"))

# Step 2: Identify buckets with 20 or more entries.
high_traffic_buckets = count_per_second.filter(pl.col("count") >= 20)

# Step 3: Create the list of all buckets to be marked as OFFLINE for the next 30 seconds
def generate_offline_buckets(row, window_seconds=30):
    """Return the per-second buckets covered by one high-traffic bucket.

    Args:
        row: Mapping with a ``"timestamp_bucket"`` entry holding the
            ``datetime`` at which the offline window starts.
        window_seconds: Number of seconds after the start bucket that also
            belong to the window (defaults to 30).

    Returns:
        A list of ``window_seconds + 1`` datetimes: the start bucket itself
        followed by each of the following seconds.
    """
    from datetime import timedelta

    base_time = row["timestamp_bucket"]
    # Use a plain timedelta so the results are concrete datetimes;
    # pl.duration(...) builds a polars expression rather than a value.
    return [base_time + timedelta(seconds=i) for i in range(window_seconds + 1)]

# Create a list of timestamps for each high traffic bucket: shift every
# high-traffic bucket by 0..30 seconds, stack the shifted copies, and drop
# the duplicates produced by overlapping windows.
offline_buckets = pl.concat(
    [
        high_traffic_buckets.select(
            (pl.col("timestamp_bucket") + pl.duration(seconds=offset)).alias(
                "offline_bucket"
            )
        )
        for offset in range(31)
    ]
).unique()

# Step 4: Join this with the original dataframe to classify each timestamp.
# Rows whose bucket matches an offline bucket receive status "OFFLINE"; the
# left join leaves the status null for every other row.
df = df.join(
    offline_buckets.with_columns(pl.lit("OFFLINE").alias("status")),
    left_on="timestamp_bucket",
    right_on="offline_bucket",
    how="left",
)

# Step 5: Fill non-offline periods with "ONLINE".
# The replacement is wrapped in pl.lit so polars treats it as a string value
# rather than as a reference to a column named "ONLINE".
df = df.with_columns(
    pl.col("status").fill_null(pl.lit("ONLINE")).alias("status")
)

print(df)