# chat1
from bisect import bisect_right
from collections import Counter
from datetime import timedelta

import polars as pl
def flag_offline_periods(df, *, threshold=20, offline_seconds=30):
    """Return *df* with a ``"status"`` column of ``"OFFLINE"`` / ``"ONLINE"``.

    A one-second bucket that holds at least ``threshold`` rows opens an
    offline window covering ``[bucket, bucket + offline_seconds)``.
    Overlapping or touching windows are merged, and every row whose bucket
    falls inside a merged window is labelled ``"OFFLINE"``; all other rows
    (including rows whose bucket is null) are labelled ``"ONLINE"``.

    Args:
        df: A polars DataFrame with a ``"timestamp_bucket"`` column holding
            timestamps already truncated to the second.
        threshold: Minimum number of rows in one bucket that triggers an
            offline window.
        offline_seconds: Length of each offline window, in seconds.

    Returns:
        A new DataFrame equal to *df* plus the ``"status"`` column.
    """
    buckets = df.get_column("timestamp_bucket").to_list()

    # Step 1: count entries per bucket. Null buckets can neither open a
    # window nor fall inside one, so they are left out of the count.
    per_bucket = Counter(b for b in buckets if b is not None)

    # Step 2: every busy bucket opens a window. They must be visited in
    # chronological order, otherwise the merge below would miss overlaps.
    window = timedelta(seconds=offline_seconds)
    busy = sorted(b for b, n in per_bucket.items() if n >= threshold)

    # Step 3: merge overlapping windows into disjoint (start, end) periods.
    merged = []
    for start in busy:
        end = start + window
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    period_starts = [start for start, _ in merged]

    def is_offline(timestamp):
        """Return True when *timestamp* lies inside a merged period."""
        # The only candidate is the last period starting at or before it.
        idx = bisect_right(period_starts, timestamp) - 1
        return idx >= 0 and timestamp < merged[idx][1]

    # Step 4: decide once per distinct bucket, then label every row.
    offline_buckets = {b for b in per_bucket if is_offline(b)}
    status = ["OFFLINE" if b in offline_buckets else "ONLINE" for b in buckets]

    # An explicit dtype keeps the column a string column for an empty frame.
    return df.with_columns(pl.Series("status", status, dtype=pl.Utf8))


if __name__ == "__main__":
    # Example setup: adjust according to your DataFrame.
    df = pl.DataFrame(
        {
            "timestamp": [
                "2024-04-19 07:08:13.959",
                "2024-04-19 07:08:13.970",
                "2024-04-19 07:08:17.823",
                # Add more timestamps as needed
            ]
        }
    ).with_columns(
        [
            pl.col("timestamp")
            .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S.%f")
            .dt.truncate("1s")
            .alias("timestamp_bucket")
        ]
    )

    df = flag_offline_periods(df)
    print(df)