Skip to content

Commit 4a05bdd

Browse files
author
Feras A Saad
committed
Add some notes about crosscat schema data structure.
1 parent 492e524 commit 4a05bdd

File tree

2 files changed

+349
-0
lines changed

2 files changed

+349
-0
lines changed

docs/notes/crosscat-schema.txt

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
* Current schema of crosscat data
2+
3+
Table metadata is a record with the following fields, stored in a JSON blob:
4+
5+
name_to_idx dict mapping column names to column indices
6+
idx_to_name dict mapping column number strings to column names
7+
(XXX This is silly: why not an array of names?)
8+
column_codebook optional array mapping colno to column codebook record
9+
[colno]
10+
short_name column name
11+
description string describing the column
12+
column_metadata array mapping colno to column metadata record
13+
[colno]
14+
modeltype string identifying the model type, one of
15+
- `normal_inverse_gamma' (numerical)
16+
- `symmetric_dirichlet_discrete' (categorical)
17+
- `vonmises' (cyclic)
18+
value_to_code dict, empty for non-categorical model types,
19+
mapping integer indices to string values
20+
(XXX This is silly: why not an array of values?)
21+
(XXX This is named backwards!)
22+
code_to_value dict, empty for non-categorical model types,
23+
mapping string values to integer indices
24+
parameters optional(?) record of modeltype-specific parameters
25+
min minimum value (cyclic)
26+
max maximum value (cyclic)
27+
cardinality number of distinct values (categorical) [???]
28+
29+
XXX Not really necessary, can be encoded in SQL. But these get passed
30+
to the engine, so can't change it immediately.
31+
32+
Crosscat model data is a record with the following fields (see also
33+
src/metamodels/crosscat_theta.schema.json):
34+
35+
X_L record
36+
column_partition record
37+
hypers record
38+
alpha concentration parameter of CRP
39+
assignments array mapping colno ---> viewno
40+
[colno] view number of this column
41+
counts array mapping viewno ---> number of columns
42+
[viewno] sum(assignments == viewno)
43+
column_hypers array mapping colno to record
44+
[colno]
45+
fixed boolean, true iff parameters fixed
46+
strength (asymmetric_beta_bernoulli)
47+
balance (asymmetric_beta_bernoulli)
48+
...
49+
view_state array mapping viewno to record
50+
[viewno]
51+
column_names array of strings indices locally consistent ???
52+
column_component_suffstats
53+
array mapping colno ---> array
54+
[colno] array mapping catno ---> record of suff. stats
55+
[catno]
56+
0_count (asymmetric_beta_bernoulli)
57+
1_count (asymmetric_beta_bernoulli)
58+
N_count (asymmetric_beta_bernoulli)
59+
...
60+
61+
row_partition_model record
62+
hypers record
63+
alpha concentration parameter of CRP in this view
64+
counts array mapping catno ---> number of rows
65+
[catno] sum(X_D[viewno] == catno)
66+
X_D array of views
67+
[viewno] array mapping rowno ---> catno
68+
[rowno] category number
69+
iterations integer number of analysis iterations
70+
column_crp_alpha stack of crp alpha values
71+
logscore stack of logscore values
72+
num_views stack of nviews values
73+
model_config record
74+
kernel_list ?
75+
initialization ?
76+
row_initialization ?

docs/notes/crosscat_theta.schema.json

+273
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
{
2+
"title": "schema for a serialized crosscat model",
3+
"$schema": "http://json-schema.org/draft-04/schema#",
4+
"description": "This schema specifies the structure of a single serialized model from the 'crosscat' generator in bayeslite. Such serialized models are stored in the theta_json column of the bayesdb_crosscat_theta table of a .bdb file.",
5+
"type": "object",
6+
"additionalProperties": false,
7+
"required": ["X_D", "X_L", "model_config", "iterations"],
8+
"properties": {
9+
10+
"iterations": {
11+
"description": "The number of iterations ANALYZE has been run for on this model.",
12+
"type": "integer",
13+
"minimum": 0
14+
},
15+
16+
"model_config": {
17+
"description": "Metainformation about the model, independent of its current state. It is always the same value.",
18+
"type": "object",
19+
"additionalProperties": false,
20+
"required": ["initialization", "kernel_list", "row_initialization"],
21+
"properties": {
22+
"initialization": {
23+
"enum": ["from_the_prior"]
24+
},
25+
"row_initialization": {
26+
"enum": ["from_the_prior"]
27+
},
28+
"kernel_list": {
29+
"description": "See transition_name_to_method_name_and_args in crosscat/src/cython_code/State.pyx.",
30+
"type": "array",
31+
"items": {
32+
"enum": [
33+
"column_partition_hyperparameter",
34+
"column_partition_assignments",
35+
"column_hyperparameters",
36+
"row_partition_hyperparameters",
37+
"row_partition_assignments"
38+
]
39+
}
40+
}
41+
}
42+
},
43+
44+
"X_D": {
45+
"description": "X_D specifies the mapping of data rows into categories for each view. It is an array with an element for each view. So, if there were three data rows and two views, X_D could be [[0, 0, 0], [0, 1, 1]]. The first view would assign all rows to a single category. The second view would assign the first row to one category and last two rows to a second category.",
46+
"type": "array",
47+
"items": {
48+
"type": "array",
49+
"items": {
50+
"type": "integer",
51+
"minimum": 0
52+
}
53+
}
54+
},
55+
56+
"X_L": {
57+
"description": "X_L specifies the latent state of the model.",
58+
"type": "object",
59+
"additionalProperties": false,
60+
"required": ["column_partition", "column_hypers", "view_state"],
61+
"properties": {
62+
"col_ensure": {
63+
"description": "Optional declaration of (in)dependence constraints",
64+
"type": "object",
65+
"additionalProperties": false,
66+
"properties": {
67+
"dependent": {"$ref": "#/definitions/dependenceContstraints"},
68+
"independent": {"$ref": "#/definitions/dependenceContstraints"}
69+
}
70+
},
71+
"column_partition": {
72+
"description": "Describes the partitioning of variables into views.",
73+
"type": "object",
74+
"additionalProperties": false,
75+
"required": ["assignments", "counts", "hypers"],
76+
"properties": {
77+
"assignments": {
78+
"description": "An array of view assignments. Element k has value v if variable k is in view v.",
79+
"type": "array",
80+
"items": {
81+
"type": "integer",
82+
"minimum": 0
83+
}
84+
},
85+
"counts": {
86+
"description": "Element v has value n if view v has n variables in it. This is redundant with assignments. This array has the same length as X_D.",
87+
"type": "array",
88+
"items": {
89+
"type": "integer",
90+
"minimum": 0
91+
}
92+
},
93+
"hypers": {
94+
"type": "object",
95+
"additionalProperties": false,
96+
"required": ["alpha"],
97+
"properties": {
98+
"alpha": {
99+
"description": "Parameter for the CRP that generates the views.",
100+
"type": "number",
101+
"minimum": 0.0
102+
}
103+
}
104+
}
105+
}
106+
},
107+
"column_hypers": {
108+
"description": "Hyperparameters for a column. Model parameters within a category are drawn from the distribution implied by these parameters. Elements correspond to columns.",
109+
"type": "array",
110+
"items": {
111+
"oneOf": [
112+
{"$ref": "#/definitions/numericHypers"},
113+
{"$ref": "#/definitions/categoricalHypers"},
114+
{"$ref": "#/definitions/cyclicHypers"}
115+
]
116+
}
117+
},
118+
"view_state": {
119+
"description": "Each element corresponds to a view and contains all information about that view.",
120+
"type": "array",
121+
"items": {
122+
"type": "object",
123+
"additionalProperties": false,
124+
"required": ["column_names", "column_component_suffstats", "row_partition_model"],
125+
"properties" : {
126+
"column_names": {
127+
"description": "An array of the column names in the view. If X_L.view_state[2].column_names[4] is 'S', and the fifth element of X_L.column_partition.assignments with value 2 has index k, then column k has name 'S'.",
128+
"type": "array",
129+
"items": {"type": "string"}
130+
},
131+
"column_component_suffstats": {
132+
"description": "An array of sufficient statistics of the categories of this view. There is one element for each column in the view. This element is itself an array with one element for each category.",
133+
"type": "array",
134+
"items": {
135+
"description": "Array of sufficient-statistics objects for one column of this view, with one element for each category.",
136+
"type": "array",
137+
"items": {
138+
"oneOf": [
139+
{"$ref": "#/definitions/normalStats"},
140+
{"$ref": "#/definitions/multinomialStats"},
141+
{"$ref": "#/definitions/vonMisesStats"},
142+
{"$ref": "#/definitions/emptyStats"}
143+
]
144+
}
145+
}
146+
},
147+
"row_partition_model": {
148+
"description": "Latent state for assignment of rows to categories in this view.",
149+
"type": "object",
150+
"additionalProperties": false,
151+
"required": ["counts", "hypers"],
152+
"properties": {
153+
"counts": {
154+
"description": "Index is category, value is number of rows in that category.",
155+
"type": "array",
156+
"items": {"type": "integer"}
157+
},
158+
"hypers": {
159+
"type": "object",
160+
"additionalProperties": false,
161+
"required": ["alpha"],
162+
"properties": {
163+
"alpha": {
164+
"description": "Concentration parameter of the CRP that assigns rows to categories in this view.",
165+
"type": "number",
166+
"minimum": 0,
167+
"exclusiveMinimum": true
168+
}
169+
}
170+
}
171+
}
172+
}
173+
}
174+
}
175+
}
176+
}
177+
}
178+
},
179+
180+
"definitions": {
181+
"numericHypers": {
182+
"description": "Hyperparameters for a normal-inverse-gamma distribution in column_hypers",
183+
"type": "object",
184+
"additionalProperties": false,
185+
"required": ["fixed"],
186+
"properties": {
187+
"fixed": {"$ref": "#/definitions/fixed"},
188+
"mu": {"type": "number"},
189+
"r": {"type": "number"},
190+
"s": {"type": "number"},
191+
"nu": {"type": "number"}
192+
}
193+
},
194+
"categoricalHypers": {
195+
"description": "Hyperparameters for a Dirichlet distribution in column_hypers. dirichlet_alpha is a single number because the model only allows symmetric Dirichlet priors (every component shares the same concentration); {'dirichlet_alpha': 0.5, 'K': 2} is the Beta(0.5,0.5) distribution.",
196+
"type": "object",
197+
"additionalProperties": false,
198+
"required": ["fixed", "dirichlet_alpha", "K"],
199+
"properties": {
200+
"fixed": {"$ref": "#/definitions/fixed"},
201+
"dirichlet_alpha": {"type": "number"},
202+
"K": {"type": "number"}
203+
}
204+
},
205+
"cyclicHypers": {
206+
"description": "Hyperparameters for a von Mises distribution in column_hypers",
207+
"type": "object",
208+
"additionalProperties": false,
209+
"required": ["fixed", "a", "b", "kappa"],
210+
"properties": {
211+
"fixed": {"$ref": "#/definitions/fixed"},
212+
"a": {"type": "number"},
213+
"b": {"type": "number"},
214+
"kappa": {"type": "number"}
215+
}
216+
},
217+
"fixed": {
218+
"description": "Hyperparameter 'fixed'. Nonzero in theory if Crosscat cannot do hyperparameter inference. Always zero.",
219+
"type": "number",
220+
"minimum": 0.0,
221+
"maximum": 0.0
222+
},
223+
"normalStats": {
224+
"description": "Sufficient statistics for a normal category",
225+
"type": "object",
226+
"additionalProperties": false,
227+
"required": ["N", "sum_x", "sum_x_squared"],
228+
"properties": {
229+
"N": {"type": "number"},
230+
"sum_x": {"type": "number"},
231+
"sum_x_squared": {"type": "number"}
232+
}
233+
},
234+
"multinomialStats": {
235+
"description": "Sufficient statistics for a multinomial category. '0', '1', etc. properties are counts.",
236+
"type": "object",
237+
"additionalProperties": false,
238+
"required": ["N"],
239+
"patternProperties": {
240+
"[0-9]+": {"type": "number"}
241+
},
242+
"properties": {
243+
"N": {"type": "number"}
244+
}
245+
},
246+
"vonMisesStats": {
247+
"description": "Sufficient statistics for a von Mises category.",
248+
"type": "object",
249+
"additionalProperties": false,
250+
"required": ["N", "sum_cos_x", "sum_sin_x"],
251+
"properties": {
252+
"N": {"type": "number"},
253+
"sum_cos_x": {"type": "number"},
254+
"sum_sin_x": {"type": "number"}
255+
}
256+
},
257+
"emptyStats": {
258+
"type": "object",
259+
"additionalProperties": false
260+
},
261+
"dependenceContstraints": {
262+
"description": "Specifies a collection of dependence or independence constraints. Maps a column index to the list of columns on which it is (in)dependent.",
263+
"type": "object",
264+
"additionalProperties": false,
265+
"patternProperties": {
266+
"[0-9]+": {
267+
"type": "array",
268+
"items": {"type": "integer"}
269+
}
270+
}
271+
}
272+
}
273+
}

0 commit comments

Comments
 (0)