Skip to content

Commit bef9fa3

Browse files
ziyeqinghan authored and copybara-github committed
Add data loader for QA task in model maker.
PiperOrigin-RevId: 306235117
1 parent 9ee6190 commit bef9fa3

12 files changed

Lines changed: 885 additions & 185 deletions

File tree

tensorflow_examples/lite/model_maker/core/data_util/dataloader.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,45 +17,6 @@
1717
from __future__ import division
1818
from __future__ import print_function
1919

20-
import hashlib
21-
import json
22-
import os
23-
24-
import tensorflow as tf
25-
from official.nlp.bert import input_pipeline
26-
27-
28-
def load(tfrecord_file, meta_data_file, model_spec):
29-
"""Loads data from tfrecord file and metada file."""
30-
31-
dataset = input_pipeline.single_file_dataset(
32-
tfrecord_file, model_spec.get_name_to_features())
33-
dataset = dataset.map(
34-
model_spec.select_data_from_record,
35-
num_parallel_calls=tf.data.experimental.AUTOTUNE)
36-
37-
with tf.io.gfile.GFile(meta_data_file, 'rb') as reader:
38-
meta_data = json.load(reader)
39-
return dataset, meta_data
40-
41-
42-
def get_cache_filenames(cache_dir, model_spec, data_name):
43-
"""Gets cache tfrecord filename, metada filename and prefix of filenames."""
44-
hasher = hashlib.md5()
45-
hasher.update(data_name.encode('utf-8'))
46-
hasher.update(str(model_spec.get_config()).encode('utf-8'))
47-
cache_prefix = os.path.join(cache_dir, hasher.hexdigest())
48-
cache_tfrecord_file = cache_prefix + '.tfrecord'
49-
cache_meta_data_file = cache_prefix + '_meta_data'
50-
51-
return cache_tfrecord_file, cache_meta_data_file, cache_prefix
52-
53-
54-
def write_meta_data(meta_data_file, meta_data):
55-
"""Writes meta data into file."""
56-
with tf.io.gfile.GFile(meta_data_file, 'w') as f:
57-
json.dump(meta_data, f)
58-
5920

6021
class DataLoader(object):
6122
"""This class provides generic utilities for loading customized domain data that will be used later in model retraining.

tensorflow_examples/lite/model_maker/core/data_util/dataloader_test.py

Lines changed: 1 addition & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,13 @@
1616
from __future__ import division
1717
from __future__ import print_function
1818

19-
import collections
20-
import json
21-
import os
22-
2319
import numpy as np
24-
import tensorflow as tf
20+
import tensorflow.compat.v2 as tf
2521
from tensorflow_examples.lite.model_maker.core.data_util import dataloader
26-
from tensorflow_examples.lite.model_maker.core.task import model_spec as ms
2722

2823

2924
class DataLoaderTest(tf.test.TestCase):
3025

31-
def setUp(self):
32-
super(DataLoaderTest, self).setUp()
33-
self.model_spec = ms.AverageWordVecModelSpec(seq_len=4)
34-
3526
def test_split(self):
3627
ds = tf.data.Dataset.from_tensor_slices([[0, 1], [1, 1], [0, 0], [1, 0]])
3728
data = dataloader.DataLoader(ds, 4)
@@ -45,59 +36,6 @@ def test_split(self):
4536
for i, elem in enumerate(test_data.dataset):
4637
self.assertTrue((elem.numpy() == np.array([i, 0])).all())
4738

48-
def _get_tfrecord_file(self):
49-
tfrecord_file = os.path.join(self.get_temp_dir(), 'tmp.tfrecord')
50-
writer = tf.io.TFRecordWriter(tfrecord_file)
51-
input_ids = tf.train.Int64List(value=[0, 1, 2, 3])
52-
label_ids = tf.train.Int64List(value=[0])
53-
features = collections.OrderedDict()
54-
features['input_ids'] = tf.train.Feature(int64_list=input_ids)
55-
features['label_ids'] = tf.train.Feature(int64_list=label_ids)
56-
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
57-
writer.write(tf_example.SerializeToString())
58-
writer.close()
59-
return tfrecord_file
60-
61-
def _get_meta_data_file(self):
62-
meta_data_file = os.path.join(self.get_temp_dir(), 'tmp_meta_data')
63-
meta_data = {'size': 1, 'num_classes': 1, 'index_to_label': ['0']}
64-
with tf.io.gfile.GFile(meta_data_file, 'w') as f:
65-
json.dump(meta_data, f)
66-
return meta_data_file
67-
68-
def test_load(self):
69-
tfrecord_file = self._get_tfrecord_file()
70-
meta_data_file = self._get_meta_data_file()
71-
dataset, meta_data = dataloader.load(tfrecord_file, meta_data_file,
72-
self.model_spec)
73-
for i, (input_ids, label_ids) in enumerate(dataset):
74-
self.assertEqual(i, 0)
75-
self.assertTrue((input_ids.numpy() == [0, 1, 2, 3]).all())
76-
self.assertTrue((label_ids.numpy() == [0]).all())
77-
self.assertEqual(meta_data['size'], 1)
78-
self.assertEqual(meta_data['num_classes'], 1)
79-
self.assertEqual(meta_data['index_to_label'], ['0'])
80-
81-
def test_get_cache_filenames(self):
82-
tfrecord_file, meta_data_file, prefix = dataloader.get_cache_filenames(
83-
cache_dir='/tmp', model_spec=self.model_spec, data_name='train')
84-
self.assertTrue(tfrecord_file.startswith(prefix))
85-
self.assertTrue(meta_data_file.startswith(prefix))
86-
87-
_, _, new_dir_prefix = dataloader.get_cache_filenames(
88-
cache_dir='/tmp1', model_spec=self.model_spec, data_name='train')
89-
self.assertNotEqual(new_dir_prefix, prefix)
90-
91-
_, _, new_model_spec_prefix = dataloader.get_cache_filenames(
92-
cache_dir='/tmp',
93-
model_spec=ms.AverageWordVecModelSpec(seq_len=8),
94-
data_name='train')
95-
self.assertNotEqual(new_model_spec_prefix, prefix)
96-
97-
_, _, new_data_name_prefix = dataloader.get_cache_filenames(
98-
cache_dir='/tmp', model_spec=self.model_spec, data_name='test')
99-
self.assertNotEqual(new_data_name_prefix, prefix)
100-
10139

10240
if __name__ == '__main__':
10341
tf.test.main()
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"version": "v2.0",
3+
"data": [
4+
{
5+
"title": "Normans",
6+
"paragraphs": [
7+
{
8+
"qas": [
9+
{
10+
"question": "In what country is Normandy located?",
11+
"id": "56ddde6b9a695914005b9628",
12+
"answers": [
13+
{
14+
"text": "France",
15+
"answer_start": 159
16+
},
17+
{
18+
"text": "France",
19+
"answer_start": 159
20+
},
21+
{
22+
"text": "France",
23+
"answer_start": 159
24+
},
25+
{
26+
"text": "France",
27+
"answer_start": 159
28+
}
29+
],
30+
"is_impossible": false
31+
}
32+
],
33+
"context": "The English name \"Normans\" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann \"Northman\" or directly from Old Norse Nor\u00f0ma\u00f0r, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean \"Norseman, Viking\"."
34+
}
35+
]
36+
}
37+
]
38+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
{
2+
"version": "v2.0",
3+
"data": [{
4+
"title": "Normans",
5+
"paragraphs": [{
6+
"qas": [{
7+
"question": "In what country is Normandy located?",
8+
"id": "56ddde6b9a695914005b9628",
9+
"answers": [{
10+
"text": "France",
11+
"answer_start": 159
12+
}],
13+
"is_impossible": false
14+
}, {
15+
"question": "When were the Normans in Normandy?",
16+
"id": "56ddde6b9a695914005b9629",
17+
"answers": [{
18+
"text": "10th and 11th centuries",
19+
"answer_start": 94
20+
}],
21+
"is_impossible": false
22+
}, {
23+
"question": "From which countries did the Norse originate?",
24+
"id": "56ddde6b9a695914005b962a",
25+
"answers": [{
26+
"text": "Denmark, Iceland and Norway",
27+
"answer_start": 256
28+
}],
29+
"is_impossible": false
30+
}, {
31+
"plausible_answers": [{
32+
"text": "Rollo",
33+
"answer_start": 308
34+
}],
35+
"question": "Who did King Charles III swear fealty to?",
36+
"id": "5ad39d53604f3c001a3fe8d3",
37+
"answers": [],
38+
"is_impossible": true
39+
}, {
40+
"plausible_answers": [{
41+
"text": "10th century",
42+
"answer_start": 671
43+
}],
44+
"question": "When did the Frankish identity emerge?",
45+
"id": "5ad39d53604f3c001a3fe8d4",
46+
"answers": [],
47+
"is_impossible": true
48+
}],
49+
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
50+
}, {
51+
"qas": [{
52+
"question": "Who was the duke in the battle of Hastings?",
53+
"id": "56dddf4066d3e219004dad5f",
54+
"answers": [{
55+
"text": "William the Conqueror",
56+
"answer_start": 1022
57+
}],
58+
"is_impossible": false
59+
}, {
60+
"plausible_answers": [{
61+
"text": "Antioch",
62+
"answer_start": 1295
63+
}],
64+
"question": "What principality did William the conquerer found?",
65+
"id": "5ad3a266604f3c001a3fea2b",
66+
"answers": [],
67+
"is_impossible": true
68+
}],
69+
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
70+
}]
71+
}, {
72+
"title": "Computational_complexity_theory",
73+
"paragraphs": [{
74+
"qas": [{
75+
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
76+
"id": "56e16182e3433e1400422e28",
77+
"answers": [{
78+
"text": "Computational complexity theory",
79+
"answer_start": 0
80+
}],
81+
"is_impossible": false
82+
}, {
83+
"plausible_answers": [{
84+
"text": "algorithm",
85+
"answer_start": 472
86+
}],
87+
"question": "What is a manual application of mathematical steps?",
88+
"id": "5ad5316b5b96ef001a10ab76",
89+
"answers": [],
90+
"is_impossible": true
91+
}],
92+
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
93+
}, {
94+
"qas": [{
95+
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
96+
"id": "56e16839cd28a01900c67887",
97+
"answers": [{
98+
"text": "if its solution requires significant resources",
99+
"answer_start": 46
100+
}],
101+
"is_impossible": false
102+
}, {
103+
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
104+
"id": "56e16839cd28a01900c67888",
105+
"answers": [{
106+
"text": "mathematical models of computation",
107+
"answer_start": 176
108+
}],
109+
"is_impossible": false
110+
}, {
111+
"question": "What are two basic primary resources used to guage complexity?",
112+
"id": "56e16839cd28a01900c67889",
113+
"answers": [{
114+
"text": "time and storage",
115+
"answer_start": 305
116+
}],
117+
"is_impossible": false
118+
}, {
119+
"plausible_answers": [{
120+
"text": "the number of gates in a circuit",
121+
"answer_start": 436
122+
}],
123+
"question": "What unit is measured to determine circuit simplicity?",
124+
"id": "5ad532575b96ef001a10ab7f",
125+
"answers": [],
126+
"is_impossible": true
127+
}, {
128+
"plausible_answers": [{
129+
"text": "the number of processors",
130+
"answer_start": 502
131+
}],
132+
"question": "What number is used in perpendicular computing?",
133+
"id": "5ad532575b96ef001a10ab80",
134+
"answers": [],
135+
"is_impossible": true
136+
}],
137+
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
138+
}]
139+
}]
140+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"version":
3+
"toy",
4+
"data": [{
5+
"title":
6+
"test",
7+
"paragraphs": [{
8+
"context":
9+
"This is test.",
10+
"qas": [{
11+
"question": "What is this?",
12+
"id": "my_id",
13+
"answers": [{
14+
"answer_start": 8,
15+
"text": "test"
16+
}]
17+
}]
18+
}]
19+
}]
20+
}

0 commit comments

Comments
 (0)