# chat1
from datetime import timedelta

import polars as pl
# Example: flag every one-second bucket as OFFLINE when it falls inside the
# window that follows a burst of traffic, and ONLINE otherwise.
# Adjust the sample data below according to your original DataFrame.

# A one-second bucket holding at least this many rows starts an OFFLINE window.
OFFLINE_THRESHOLD = 20
# Number of seconds after a triggering bucket that are also marked OFFLINE.
OFFLINE_WINDOW_SECONDS = 30


def generate_offline_buckets(row, window_seconds=OFFLINE_WINDOW_SECONDS):
    """Return the one-second buckets covered by a single OFFLINE window.

    Args:
        row: Mapping with a ``"timestamp_bucket"`` entry holding the start of
            the high-traffic second.
        window_seconds: How many seconds after the start stay OFFLINE.

    Returns:
        A list of ``window_seconds + 1`` values: the start bucket itself
        followed by each of the next ``window_seconds`` seconds.
    """
    base_time = row["timestamp_bucket"]
    # timedelta keeps the result a concrete timestamp when base_time is one;
    # the start bucket is included, hence the "+ 1".
    return [base_time + timedelta(seconds=i) for i in range(window_seconds + 1)]


if __name__ == "__main__":
    df = pl.DataFrame(
        {
            "timestamp": [
                "2024-04-19 07:08:13.959",
                "2024-04-19 07:08:13.970",
                "2024-04-19 07:08:17.823",
                # Add more timestamps as needed
            ]
        }
    ).with_columns(
        [
            # Parse the text timestamps and truncate them to whole seconds.
            pl.col("timestamp")
            .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S.%f")
            .dt.truncate("1s")
            .alias("timestamp_bucket")
        ]
    )

    # Step 1: Count entries per second.
    count_per_second = df.group_by("timestamp_bucket").agg(pl.len().alias("count"))

    # Step 2: Identify buckets with OFFLINE_THRESHOLD or more entries.
    high_traffic_buckets = count_per_second.filter(
        pl.col("count") >= OFFLINE_THRESHOLD
    ).select("timestamp_bucket")

    # Step 3: Expand every high-traffic bucket into itself plus the following
    # OFFLINE_WINDOW_SECONDS seconds. One shifted copy is built per offset and
    # the copies are stacked; unique() drops buckets shared by overlapping
    # windows so the join below cannot duplicate rows.
    offline_buckets = pl.concat(
        [
            high_traffic_buckets.select(
                pl.col("timestamp_bucket")
                .dt.offset_by(f"{offset}s")
                .alias("offline_bucket")
            )
            for offset in range(OFFLINE_WINDOW_SECONDS + 1)
        ]
    ).unique()

    # Step 4: Join with the original frame; rows whose bucket is inside an
    # OFFLINE window pick up status == "OFFLINE", all others get a null.
    df = df.join(
        offline_buckets.with_columns([pl.lit("OFFLINE").alias("status")]),
        left_on="timestamp_bucket",
        right_on="offline_bucket",
        how="left",
    )

    # Step 5: Fill non-offline periods with "ONLINE". pl.lit makes the value
    # an explicit string literal rather than a column reference.
    df = df.with_columns(pl.col("status").fill_null(pl.lit("ONLINE")).alias("status"))

    print(df)