# chat1

import polars as pl

# Classify every timestamp as "ONLINE" or "OFFLINE".
#
# A one-second bucket that contains OFFLINE_THRESHOLD or more entries marks
# the start of an offline period: that bucket and the following
# OFFLINE_WINDOW_SECONDS one-second buckets are all labelled "OFFLINE".
# Every other timestamp is labelled "ONLINE".

OFFLINE_THRESHOLD = 20       # entries per second that trigger an offline period
OFFLINE_WINDOW_SECONDS = 30  # extra seconds marked OFFLINE after a trigger

# Example setup: adjust the timestamps to match your original DataFrame.
# Each timestamp is parsed, truncated to second precision and stored in the
# "timestamp_bucket" column.
df = pl.DataFrame({
    "timestamp": [
        "2024-04-19 07:08:13.959",
        "2024-04-19 07:08:13.970",
        "2024-04-19 07:08:17.823",
        # Add more timestamps as needed
    ]
}).with_columns(
    # "%.f" is the chrono specifier for "a dot followed by fractional
    # seconds"; a bare "%f" would be read as a count of nanoseconds.
    pl.col("timestamp")
    .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%.f")
    .dt.truncate("1s")
    .alias("timestamp_bucket")
)

# Step 1: Count entries per second.
count_per_second = df.group_by("timestamp_bucket").agg(pl.len().alias("count"))

# Step 2: Identify buckets with OFFLINE_THRESHOLD or more entries.
high_traffic_buckets = count_per_second.filter(
    pl.col("count") >= OFFLINE_THRESHOLD
).select("timestamp_bucket")


def generate_offline_buckets(row):
    """Return the offline buckets triggered by a single high-traffic row.

    Pure-Python counterpart of the vectorised step 3 below; handy for
    checking individual rows by hand.

    Args:
        row: Mapping with a ``'timestamp_bucket'`` key holding a
            ``datetime.datetime`` value.

    Returns:
        A list of ``OFFLINE_WINDOW_SECONDS + 1`` datetimes: the bucket itself
        followed by each of the next ``OFFLINE_WINDOW_SECONDS`` seconds.
    """
    from datetime import timedelta

    base_time = row['timestamp_bucket']
    return [
        base_time + timedelta(seconds=i)
        for i in range(OFFLINE_WINDOW_SECONDS + 1)
    ]


# Step 3: Create the list of all buckets to be marked as OFFLINE.
# Cross-joining every high-traffic bucket with the offsets 0..N seconds
# expands each start point into its full offline window. Overlapping windows
# produce duplicate buckets, so they are removed with unique().
offsets = pl.DataFrame(
    {"offset_seconds": list(range(OFFLINE_WINDOW_SECONDS + 1))}
)
offline_buckets = (
    high_traffic_buckets.join(offsets, how="cross")
    .select(
        (
            pl.col("timestamp_bucket")
            + pl.duration(seconds=pl.col("offset_seconds"))
        ).alias("offline_bucket")
    )
    .unique()
    .sort("offline_bucket")
)

# Step 4: Join this with the original dataframe to classify each timestamp.
# Rows whose bucket falls inside an offline window receive status "OFFLINE";
# all other rows get a null status from the left join.
df = df.join(
    offline_buckets.with_columns(pl.lit("OFFLINE").alias("status")),
    left_on="timestamp_bucket",
    right_on="offline_bucket",
    how="left",
)

# Step 5: Fill non-offline periods with "ONLINE".
# pl.lit is required: a bare string inside an expression is treated as a
# column name, not as a literal value.
df = df.with_columns(pl.col("status").fill_null(pl.lit("ONLINE")))

print(df)