Reputation: 139
I am creating a utility that needs to be aware of all the datasets/tables that exist in my BigQuery project. My current code for getting this information is as follows (using Go API):
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
tableMap := map[string]map[string]bool{}
call := service.Datasets.List(projectId)
//call.Fields("datasets/datasetReference")
if err := call.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
for _, v := range page.Datasets {
if tableMap[v.DatasetReference.DatasetId] == nil {
tableMap[v.DatasetReference.DatasetId] = map[string]bool{}
}
table_call := service.Tables.List(projectId, v.DatasetReference.DatasetId)
//table_call.Fields("tables/tableReference")
if err := table_call.Pages(cloudCtx, func(page *bigquery.TableList) error {
for _, t := range page.Tables {
tableMap[v.DatasetReference.DatasetId][t.TableReference.TableId] = true
}
return nil
}); err != nil {
return errors.New("Error Parsing Table")
}
}
return nil
}); err != nil {
return tableMap, err
}
return tableMap, nil
}
For a project with about 5000 datasets, each with up to 10 tables, this code takes almost 15 minutes to return. Is there a faster way to iterate through the names of all existing datasets/tables? I have tried using the Fields method to return only the fields I need (you can see those lines commented out above), but that results in only 50 (exactly 50) of my datasets being returned.
Any ideas?
Upvotes: 3
Views: 154
Reputation: 139
Here is an updated version of my code, with concurrency, that reduced the processing time from about 15 minutes to 3 minutes.
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
tableMap = map[string]map[string]bool{}
call := service.Datasets.List(projectId)
//call.Fields("datasets/datasetReference")
if err := call.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
var wg sync.WaitGroup
wg.Add(len(page.Datasets))
for _, v := range page.Datasets {
if tableMap[v.DatasetReference.DatasetId] == nil {
tableMap[v.DatasetReference.DatasetId] = map[string]bool{}
}
go func(service *bigquery.Service, datasetID string, projectId string) {
defer wg.Done()
table_call := service.Tables.List(projectId, datasetID)
//table_call.Fields("tables/tableReference")
if err := table_call.Pages(cloudCtx, func(page *bigquery.TableList) error {
for _, t := range page.Tables {
tableMap[datasetID][t.TableReference.TableId] = true
}
return nil // NOTE: returning a non-nil error stops pagination.
}); err != nil {
// TODO: Handle error.
fmt.Println(err)
}
}(service, v.DatasetReference.DatasetId, projectId)
}
wg.Wait()
return nil // NOTE: returning a non-nil error stops pagination.
}); err != nil {
return tableMap, err
// TODO: Handle error.
}
return tableMap, nil
}
Upvotes: 2