chat1
import polars as pl
# Assuming df is your original DataFrame with 'timestamp' already truncated to second precision and named 'timestamp_bucket'
# Step 1: Count entries per second
count_per_second = df.groupby('timestamp_bucket').agg(pl.count().alias('count'))
# Step 2: Find the buckets where count is 20 or more
offline_starts = count_per_second.filter(pl.col('count') >= 20).select('timestamp_bucket')
# Step 3: Generate a range of 30 subsequent seconds for each start point
# We'll use an "exploding" technique with arange to create the additional 30 seconds
offline_periods = offline_starts.with_column(
(pl.arange(
pl.col('timestamp_bucket'),
pl.col('timestamp_bucket') + pl.duration_seconds(30)
).alias('offline_range'))
).explode('offline_range')
# Step 4: Remove duplicates in case of overlap
offline_periods = offline_periods.select('offline_range').distinct().sort('offline_range')
# Step 5: Join this with the original dataframe to classify each timestamp
df = df.join(
offline_periods.with_column(pl.lit("OFFLINE").alias("status")),
left_on="timestamp_bucket",
right_on="offline_range",
how="left"
)
# Step 6: Fill non-offline periods with "ONLINE"
df = df.with_column(
pl.when(pl.col('status').is_null())
.then("ONLINE")
.otherwise(pl.col('status'))
.alias('status')
)
print(df)