#!/usr/bin/env python
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pandas>=2",
# "pydantic>=2",
# ]
# ///
"""Write JSON schema for JSON blob in SpineDB
"""
# from dataclasses import dataclass
# from dataclasses import field
from datetime import datetime, timedelta
from typing import Annotated, Literal, Type, TypeAlias
import pandas as pd
from pydantic import RootModel
from pydantic.dataclasses import dataclass
from pydantic.dataclasses import Field as field
from pydantic.types import StringConstraints
Floats: TypeAlias = list[float]
Integers: TypeAlias = list[int]
Strings: TypeAlias = list[str]
Booleans: TypeAlias = list[bool]
Datetimes: TypeAlias = list[datetime]
Timedeltas: TypeAlias = list[timedelta]

# FIXME: how to express this constraint without Pydantic?
time_pat_re = r"(Y|M|D|WD|h|m|s)[0-9]+-[0-9]+"
TimePattern: TypeAlias = Annotated[str, StringConstraints(pattern=time_pat_re)]
TimePatterns: TypeAlias = list[TimePattern]
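
# Illustrative strings accepted by the pattern above; the prefix
# semantics (Y=year, M=month, D=day, WD=weekday, h/m/s=hour/minute/second)
# are assumed here, not defined by this module:
#   "M1-12"  matches (a month range)
#   "WD1-5"  matches (a weekday range)
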
NullableIntegers: TypeAlias = list[int | None]
NullableFloats: TypeAlias = list[float | None]
NullableStrings: TypeAlias = list[str | None]
NullableBooleans: TypeAlias = list[bool | None]
NullableDatetimes: TypeAlias = list[datetime | None]
NullableTimedeltas: TypeAlias = list[timedelta | None]
NullableTimePatterns: TypeAlias = list[TimePattern | None]
IndexTypes: TypeAlias = Integers | Strings | Datetimes | Timedeltas | TimePatterns
ValueTypes: TypeAlias = (
    Integers | Strings | Floats | Booleans | Datetimes | Timedeltas | TimePatterns
)
NullableValueTypes: TypeAlias = (
    NullableIntegers
    | NullableStrings
    | NullableFloats
    | NullableBooleans
    | NullableDatetimes
    | NullableTimedeltas
    | NullableTimePatterns
)
ValueTypeNames: TypeAlias = Literal[
    "string", "integer", "number", "boolean", "date-time", "duration", "time-pattern"
]
IndexValueTypeNames: TypeAlias = Literal[
    "string", "integer", "date-time", "duration", "time-pattern"
]

type_map: dict[type, ValueTypeNames] = {
    str: "string",
    int: "integer",
    float: "number",
    bool: "boolean",
    datetime: "date-time",
    pd.Timestamp: "date-time",
    timedelta: "duration",
    pd.Timedelta: "duration",
    TimePattern: "time-pattern",
}


class _TypeInferMixin:
    """Mixin that infers ``value_type`` from the runtime types in ``values``."""

    def __post_init__(self):
        # Collect the element types present, ignoring None (nullable entries)
        value_type, *_ = set(map(type, getattr(self, "values"))) - {type(None)}
        # NOTE: have to do it like this since the inheriting dataclasses are frozen
        super().__setattr__("value_type", type_map[value_type])
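
# For illustration (hypothetical instance): the mixin sets
# Array(name="x", values=[1, None, 3]).value_type to "integer", since
# int is the only non-None element type in values.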


@dataclass(frozen=True)
class RunLengthIndex(_TypeInferMixin):
    """Run-length encoded index array.

    NOTE: this is not supported by PyArrow; if we use it, we will have
    to convert to a supported format.
    """

    name: str
    run_len: Integers
    values: IndexTypes
    value_type: IndexValueTypeNames = field(init=False)
    type: Literal["run_length_index"] = "run_length_index"


@dataclass(frozen=True)
class RunEndIndex(_TypeInferMixin):
    """Run-end encoded index array."""

    name: str
    run_end: Integers
    values: IndexTypes
    value_type: IndexValueTypeNames = field(init=False)
    type: Literal["run_end_index"] = "run_end_index"


@dataclass(frozen=True)
class DictEncodedIndex(_TypeInferMixin):
    """Dictionary encoded index array."""

    name: str
    indices: Integers
    values: IndexTypes
    value_type: IndexValueTypeNames = field(init=False)
    type: Literal["dict_encoded_index"] = "dict_encoded_index"


@dataclass(frozen=True)
class ArrayIndex(_TypeInferMixin):
    """Any array that is an index, e.g. a sequence, timestamps, labels"""

    name: str
    values: IndexTypes
    value_type: IndexValueTypeNames = field(init=False)
    type: Literal["array_index"] = "array_index"


@dataclass(frozen=True)
class RunLengthArray(_TypeInferMixin):
    """Run-length encoded value array.

    NOTE: this is not supported by PyArrow; if we use it, we will have
    to convert to a supported format.
    """

    name: str
    run_len: Integers
    values: NullableValueTypes
    value_type: ValueTypeNames = field(init=False)
    type: Literal["run_length_array"] = "run_length_array"


@dataclass(frozen=True)
class RunEndArray(_TypeInferMixin):
    """Run-end encoded value array."""

    name: str
    run_end: Integers
    values: NullableValueTypes
    value_type: ValueTypeNames = field(init=False)
    type: Literal["run_end_array"] = "run_end_array"


@dataclass(frozen=True)
class DictEncodedArray(_TypeInferMixin):
    """Dictionary encoded value array."""

    name: str
    indices: NullableIntegers
    values: NullableValueTypes
    value_type: ValueTypeNames = field(init=False)
    type: Literal["dict_encoded_array"] = "dict_encoded_array"


@dataclass(frozen=True)
class Array(_TypeInferMixin):
    """Plain value array."""

    name: str
    values: NullableValueTypes
    value_type: ValueTypeNames = field(init=False)
    type: Literal["array"] = "array"


# NOTE: to add run-length encoding to the schema, add the run-length
# classes to the type union below, after which we would also need to
# implement a converter to a supported Arrow array type
Table: TypeAlias = list[
    RunEndIndex | DictEncodedIndex | ArrayIndex | RunEndArray | DictEncodedArray | Array
]
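
# A minimal sketch of a conforming table (hypothetical data): one index
# column and one value column, wrapped in the same RootModel used below
# to emit the schema.
#
#   table = RootModel[Table](
#       [
#           ArrayIndex(name="time", values=[datetime(2024, 1, 1), datetime(2024, 1, 2)]),
#           Array(name="load", values=[1.0, None]),
#       ]
#   )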
if __name__ == "__main__":
from argparse import ArgumentParser
import json
from pathlib import Path
parser = ArgumentParser(__doc__)
parser.add_argument("json_file", help="Path of JSON schema file to write")
opts = parser.parse_args()
schema = RootModel[Table].model_json_schema()
Path(opts.json_file).write_text(json.dumps(schema, indent=2))
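
# Example invocation (assuming uv, which reads the inline script
# metadata at the top of this file):
#
#   uv run models.py schema.json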