Skip to main content

chat1

import polars as pl

# Assuming df is your original DataFrame with 'timestamp' already truncated to second precision and named 'timestamp_bucket'

# Step 1: Count entries per second
count_per_second = df.groupby('timestamp_bucket').agg(pl.count().alias('count'))

# Step 2: Find the buckets where count is 20 or more
offline_starts = count_per_second.filter(pl.col('count') >= 20).select('timestamp_bucket')

# Step 3: Generate a range of 30 subsequent seconds for each start point
# We'll use an "exploding" technique with arange to create the additional 30 seconds
offline_periods = offline_starts.with_column(
    (pl.arange(
        pl.col('timestamp_bucket'),
        pl.col('timestamp_bucket') + pl.duration_seconds(30)
    ).alias('offline_range'))
).explode('offline_range')

# Step 4: Remove duplicates in case of overlap
offline_periods = offline_periods.select('offline_range').distinct().sort('offline_range')

# Step 5: Join this with the original dataframe to classify each timestamp
df = df.join(
    offline_periods.with_column(pl.lit("OFFLINE").alias("status")),
    left_on="timestamp_bucket",
    right_on="offline_range",
    how="left"
)

# Step 6: Fill non-offline periods with "ONLINE"
df = df.with_column(
    pl.when(pl.col('status').is_null())
    .then("ONLINE")
    .otherwise(pl.col('status'))
    .alias('status')
)

print(df)