32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165 | class TimeUnit(AutoEnum):
week = AutoEnum.auto()
day = AutoEnum.auto()
hour = AutoEnum.auto()
minute = AutoEnum.auto()
second = AutoEnum.auto()
def as_timedelta(self, interval) -> pendulum.Duration:
if self == self.week:
return pendulum.Duration(days=7 * interval)
elif self == self.day:
return pendulum.Duration(days=1 * interval)
elif self == self.hour:
return pendulum.Duration(hours=1 * interval)
elif self == self.minute:
return pendulum.Duration(minutes=1 * interval)
elif self == self.second:
return pendulum.Duration(seconds=1 * interval)
else:
raise NotImplementedError()
def validate_buckets(
self, start_datetime: DateTime, end_datetime: DateTime, interval: float
):
MAX_ALLOWED_BUCKETS = 1000
delta = self.as_timedelta(interval)
start_in_utc = start_datetime.in_timezone("UTC")
end_in_utc = end_datetime.in_timezone("UTC")
if interval < 0.01:
raise InvalidEventCountParameters("The minimum interval is 0.01")
number_of_buckets = math.ceil((end_in_utc - start_in_utc) / delta)
if number_of_buckets > MAX_ALLOWED_BUCKETS:
raise InvalidEventCountParameters(
f"The given interval would create {number_of_buckets} buckets, "
"which is too many. Please increase the interval or reduce the "
f"time range to produce {MAX_ALLOWED_BUCKETS} buckets or fewer."
)
def get_interval_spans(
self,
start_datetime: DateTime,
end_datetime: DateTime,
interval: float,
):
"""Divide the given range of dates into evenly-sized spans of interval units"""
self.validate_buckets(start_datetime, end_datetime, interval)
# Our universe began on PIVOT_DATETIME and all time after that is
# divided into `delta`-sized buckets. We want to find the bucket that
# contains `start_datetime` and then find the all of the buckets
# that come after it until the bucket that contains `end_datetime`.
delta = self.as_timedelta(interval)
start_in_utc = start_datetime.in_timezone("UTC")
end_in_utc = end_datetime.in_timezone("UTC")
if end_in_utc > pendulum.now("UTC"):
end_in_utc = pendulum.now("UTC").end_of(self.value)
first_span_index = math.floor((start_in_utc - PIVOT_DATETIME) / delta)
yield first_span_index
span_start = PIVOT_DATETIME + delta * first_span_index
while span_start < end_in_utc:
next_span_start = span_start + delta
yield (span_start, next_span_start - timedelta(microseconds=1))
span_start = next_span_start
def database_value_expression(self, time_interval: float):
"""Returns the SQL expression to place an event in a time bucket"""
# The date_bin function can do the bucketing for us:
# https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-BIN
db = provide_database_interface()
delta = self.as_timedelta(time_interval)
if db.dialect.name == "postgresql":
return sa.cast(
sa.func.floor(
sa.extract(
"epoch",
(
sa.func.date_bin(delta, db.Event.occurred, PIVOT_DATETIME)
- PIVOT_DATETIME
),
)
/ delta.total_seconds(),
),
sa.Text,
)
elif db.dialect.name == "sqlite":
# Convert pivot date and event date to strings formatted as seconds since the epoch
pivot_timestamp = sa.func.strftime(
"%s", PIVOT_DATETIME.strftime("%Y-%m-%d %H:%M:%S")
)
event_timestamp = sa.func.strftime("%s", db.Event.occurred)
seconds_since_pivot = event_timestamp - pivot_timestamp
# Calculate the bucket index by dividing by the interval in seconds and flooring the result
bucket_index = sa.func.floor(
sa.cast(seconds_since_pivot, sa.Integer) / delta.total_seconds()
)
return sa.cast(bucket_index, sa.Text)
else:
raise NotImplementedError(f"Dialect {db.dialect.name} is not supported.")
def database_label_expression(self, db: PrefectDBInterface, time_interval: float):
"""Returns the SQL expression to label a time bucket"""
time_delta = self.as_timedelta(time_interval)
if db.dialect.name == "postgresql":
# The date_bin function can do the bucketing for us:
# https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-BIN
return sa.func.to_char(
sa.func.date_bin(time_delta, db.Event.occurred, PIVOT_DATETIME),
'YYYY-MM-DD"T"HH24:MI:SSTZH:TZM',
)
elif db.dialect.name == "sqlite":
# We can't use date_bin in SQLite, so we have to do the bucketing manually
seconds_since_epoch = sa.func.strftime("%s", db.Event.occurred)
# Convert the total seconds of the timedelta to a constant in SQL
bucket_size = time_delta.total_seconds()
# Perform integer division and multiplication to find the bucket start epoch using SQL functions
bucket_start_epoch = sa.func.cast(
(sa.cast(seconds_since_epoch, sa.Integer) / bucket_size) * bucket_size,
sa.Integer,
)
bucket_datetime = sa.func.strftime(
"%Y-%m-%dT%H:%M:%SZ", sa.func.datetime(bucket_start_epoch, "unixepoch")
)
return bucket_datetime
else:
raise NotImplementedError(f"Dialect {db.dialect.name} is not supported.")
|