Skip to content

Commit 4f9a5bc

Browse files
authored
Merge branch 'main' into fix/rerun_flaky_tests
2 parents 9dc1727 + 1012c1e commit 4f9a5bc

File tree

2 files changed

+235
-1
lines changed

2 files changed

+235
-1
lines changed

tests/unit/data/deserializers/test_file.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def test_hdf5_file_deserializer_success(tmp_path):
284284
###################
285285

286286

287-
@pytest.mark.skip(reason="issue: #492")
287+
@pytest.mark.sanity
288288
def test_db_file_deserializer_success(monkeypatch, tmp_path):
289289
import sqlite3
290290

@@ -317,6 +317,7 @@ def mock_from_sql(sql, con, **kwargs):
317317
data=db_path,
318318
processor_factory=processor_factory(),
319319
random_seed=1,
320+
sql="SELECT * FROM samples",
320321
)
321322

322323
# Assert: result is of type Dataset
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
import pytest
2+
from datasets import Dataset
3+
4+
from guidellm.data.deserializers.deserializer import (
5+
DataNotSupportedError,
6+
)
7+
from guidellm.data.deserializers.memory import (
8+
InMemoryCsvDatasetDeserializer,
9+
InMemoryDictDatasetDeserializer,
10+
InMemoryDictListDatasetDeserializer,
11+
InMemoryItemListDatasetDeserializer,
12+
InMemoryJsonStrDatasetDeserializer,
13+
)
14+
15+
16+
@pytest.fixture
def processor_factory():
    """Supply a dummy processor factory (``None``) to the deserializer tests."""
    return None
19+
20+
21+
###################
22+
# Tests for the in-memory dict deserializer
23+
###################
24+
25+
26+
@pytest.mark.smoke
def test_in_memory_dict_deserializer_success(processor_factory):
    """A dict of equal-length column lists deserializes into a ``Dataset``."""
    # Arrange
    columns = {
        "text": ["hello", "world"],
        "id": [1, 2],
    }

    # Act
    result = InMemoryDictDatasetDeserializer()(
        data=columns,
        processor_factory=processor_factory,
        random_seed=42,
    )

    # Assert
    assert isinstance(result, Dataset)
    assert result["text"] == ["hello", "world"]
    assert result["id"] == [1, 2]
    assert len(result) == 2
45+
46+
47+
@pytest.mark.smoke
def test_in_memory_dict_deserializer_invalid_not_dict(processor_factory):
    """Passing a plain string instead of a dict raises ``DataNotSupportedError``."""
    loader = InMemoryDictDatasetDeserializer()
    call_kwargs = {
        "data": "not a dict",
        "processor_factory": processor_factory,
        "random_seed": 42,
    }

    with pytest.raises(DataNotSupportedError):
        loader(**call_kwargs)
57+
58+
59+
@pytest.mark.smoke
def test_in_memory_dict_deserializer_empty_dict(processor_factory):
    """An empty dict raises ``DataNotSupportedError``."""
    loader = InMemoryDictDatasetDeserializer()
    call_kwargs = {
        "data": {},
        "processor_factory": processor_factory,
        "random_seed": 42,
    }

    with pytest.raises(DataNotSupportedError):
        loader(**call_kwargs)
69+
70+
71+
@pytest.mark.smoke
def test_in_memory_dict_deserializer_value_not_list(processor_factory):
    """A dict whose value is a bare string raises ``DataNotSupportedError``."""
    loader = InMemoryDictDatasetDeserializer()
    # The value is a scalar string, not a list.
    scalar_valued = {"text": "hello"}

    with pytest.raises(DataNotSupportedError):
        loader(
            data=scalar_valued,
            processor_factory=processor_factory,
            random_seed=42,
        )
81+
82+
83+
@pytest.mark.smoke
def test_in_memory_dict_deserializer_list_length_mismatch(processor_factory):
    """Column lists of unequal length raise ``DataNotSupportedError``."""
    loader = InMemoryDictDatasetDeserializer()
    ragged_columns = {
        "text": ["hello", "world"],
        "id": [1],  # different length from "text"
    }

    with pytest.raises(DataNotSupportedError):
        loader(
            data=ragged_columns,
            processor_factory=processor_factory,
            random_seed=42,
        )
96+
97+
98+
###################
99+
# Tests for the in-memory dict-list deserializer
100+
###################
101+
102+
103+
@pytest.mark.smoke
def test_in_memory_dict_list_deserializer_success(processor_factory):
    """A list of row dicts sharing the same keys deserializes into a ``Dataset``."""
    # Arrange
    loader = InMemoryDictListDatasetDeserializer()
    rows = [
        {"id": 1, "text": "hello"},
        {"id": 2, "text": "world"},
        {"id": 3, "text": "guidellm"},
    ]

    # Act
    result = loader(
        data=rows,
        processor_factory=processor_factory,
        random_seed=42,
    )

    # Assert
    assert isinstance(result, Dataset)
    assert result["id"] == [1, 2, 3]
    assert result["text"] == ["hello", "world", "guidellm"]
    assert len(result) == 3
126+
127+
128+
@pytest.mark.smoke
def test_in_memory_dict_list_deserializer_key_mismatch(processor_factory):
    """Row dicts with differing key sets raise ``DataNotSupportedError``."""
    inconsistent_rows = [
        {"id": 1, "text": "hello"},
        {"id": 2, "msg": "world"},  # "msg" where the first row has "text"
    ]
    loader = InMemoryDictListDatasetDeserializer()

    with pytest.raises(DataNotSupportedError):
        loader(
            data=inconsistent_rows,
            processor_factory=processor_factory,
            random_seed=42,
        )
143+
144+
145+
###################
146+
# Tests for the in-memory item-list deserializer
147+
###################
148+
149+
150+
@pytest.mark.smoke
def test_in_memory_item_list_deserializer_key_mismatch(processor_factory):
    """A flat list of items deserializes into a ``Dataset`` with a "data" column.

    NOTE(review): despite the ``key_mismatch`` suffix in the name, this test
    exercises the success path; consider renaming it in a follow-up.
    """
    # Arrange
    items = ["a", "b", "c"]
    loader = InMemoryItemListDatasetDeserializer()

    # Act
    result = loader(
        data=items,
        processor_factory=processor_factory,
        random_seed=42,
    )

    # Assert
    assert isinstance(result, Dataset)
    assert result["data"] == items
    assert len(result) == 3
167+
168+
169+
@pytest.mark.smoke
def test_in_memory_item_list_custom_column_name(processor_factory):
    """The ``column_name`` keyword sets the name of the single output column."""
    numbers = [1, 2, 3]

    result = InMemoryItemListDatasetDeserializer()(
        data=numbers,
        processor_factory=processor_factory,
        random_seed=123,
        column_name="numbers",
    )

    assert list(result.column_names) == ["numbers"]
    assert result["numbers"] == [1, 2, 3]
183+
184+
185+
###################
186+
# Tests for the in-memory JSON string deserializer
187+
###################
188+
189+
190+
@pytest.mark.parametrize(
    "json_input",
    [
        '{"text": ["hello", "world"], "id": [1, 2]}',
        '[{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]',
        '["a", "b", "c"]',
    ],
)
@pytest.mark.smoke
def test_in_memory_json_deserializer_success(processor_factory, json_input):
    """Each JSON string shape (dict of lists, list of dicts, flat list) loads.

    Only the result type and non-emptiness are checked, not the exact contents.
    """
    loader = InMemoryJsonStrDatasetDeserializer()

    result = loader(
        data=json_input,
        processor_factory=processor_factory,
        random_seed=42,
    )

    assert isinstance(result, Dataset)
    assert len(result) > 0
210+
211+
212+
###################
213+
# Tests for the in-memory CSV string deserializer
214+
###################
215+
216+
217+
@pytest.mark.smoke
def test_csv_file_deserializer_success(processor_factory):
    """A CSV string with a header row deserializes into a ``Dataset``.

    The expected ``id`` values are strings, i.e. no numeric conversion is
    expected of the in-memory CSV deserializer.
    """
    # Arrange
    loader = InMemoryCsvDatasetDeserializer()
    csv_text = "id,text\n1,hello\n2,world\n"

    # Act
    result = loader(
        data=csv_text,
        processor_factory=processor_factory,
        random_seed=43,
    )

    # Assert
    assert isinstance(result, Dataset)
    assert {"id", "text"}.issubset(set(result.column_names))
    assert result["id"] == ["1", "2"]
    assert result["text"] == ["hello", "world"]
    assert len(result) == 2

0 commit comments

Comments
 (0)