Mark Wunsch
Mark Wunsch

Reputation: 139

Need faster way to list all datasets/tables in project

I am creating a utility that needs to be aware of all the datasets and tables that exist in my BigQuery project. My current code for getting this information is as follows (using the Go API):

// populateExistingTableMap lists every dataset in projectId and every table
// in each of those datasets. The result maps dataset ID -> table ID -> true.
//
// Both list calls ask for a partial response containing only the reference
// fields plus nextPageToken. The token has to be part of the field mask:
// Pages uses it to request the following page, so a mask that leaves it out
// ends the iteration after the first page of results.
//
// On failure the map built so far is returned together with the error; the
// map is incomplete in that case and should not be treated as authoritative.
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
	tableMap := map[string]map[string]bool{}

	datasetCall := service.Datasets.List(projectId).
		Fields("nextPageToken", "datasets/datasetReference")

	err := datasetCall.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
		for _, ds := range page.Datasets {
			datasetID := ds.DatasetReference.DatasetId

			tables := tableMap[datasetID]
			if tables == nil {
				tables = map[string]bool{}
				tableMap[datasetID] = tables
			}

			tableCall := service.Tables.List(projectId, datasetID).
				Fields("nextPageToken", "tables/tableReference")

			err := tableCall.Pages(cloudCtx, func(page *bigquery.TableList) error {
				for _, t := range page.Tables {
					tables[t.TableReference.TableId] = true
				}
				return nil
			})
			if err != nil {
				// Report which dataset failed and why instead of a fixed
				// message. Returning non-nil here also stops the dataset
				// pagination.
				return errors.New("listing tables in dataset " + datasetID + ": " + err.Error())
			}
		}
		return nil
	})
	if err != nil {
		return tableMap, err
	}

	return tableMap, nil
}

For a project with about 5000 datasets, each with up to 10 tables, this code takes almost 15 minutes to return. Is there a faster way to iterate through the names of all existing datasets/tables? I have tried using the Fields method to return only the fields I need (you can see those lines commented out above), but that results in only 50 (exactly 50) of my datasets being returned.

Any ideas?

Upvotes: 3

Views: 154

Answers (1)

Mark Wunsch
Mark Wunsch

Reputation: 139

Here is an updated version of my code, with concurrency, that reduced the processing time from about 15 minutes to 3 minutes.

// populateExistingTableMap lists every dataset in projectId and, for each page
// of datasets, lists the tables of all datasets on that page concurrently
// (one goroutine per dataset). The result maps dataset ID -> table ID -> true.
//
// Both list calls ask for a partial response containing only the reference
// fields plus nextPageToken. The token has to be part of the field mask:
// Pages uses it to request the following page, so a mask that leaves it out
// ends the iteration after the first page of results.
//
// If listing the tables of any dataset fails, the first such error is
// returned (wrapped with the dataset ID) and no further dataset pages are
// fetched. On failure the map built so far is returned together with the
// error; it is incomplete in that case.
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
	tableMap := map[string]map[string]bool{}

	datasetCall := service.Datasets.List(projectId).
		Fields("nextPageToken", "datasets/datasetReference")

	err := datasetCall.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
		var (
			wg       sync.WaitGroup
			mu       sync.Mutex // guards firstErr and writes to the inner maps
			firstErr error
		)

		for _, ds := range page.Datasets {
			datasetID := ds.DatasetReference.DatasetId

			// Only this goroutine touches the outer map. Each worker gets
			// its inner map handed to it, so workers never read tableMap
			// while the loop is still inserting new keys into it.
			tables := tableMap[datasetID]
			if tables == nil {
				tables = map[string]bool{}
				tableMap[datasetID] = tables
			}

			wg.Add(1)
			go func(datasetID string, tables map[string]bool) {
				defer wg.Done()

				tableCall := service.Tables.List(projectId, datasetID).
					Fields("nextPageToken", "tables/tableReference")

				err := tableCall.Pages(cloudCtx, func(page *bigquery.TableList) error {
					mu.Lock()
					defer mu.Unlock()
					for _, t := range page.Tables {
						tables[t.TableReference.TableId] = true
					}
					return nil
				})
				if err != nil {
					mu.Lock()
					defer mu.Unlock()
					if firstErr == nil {
						firstErr = fmt.Errorf("listing tables in dataset %q: %w", datasetID, err)
					}
				}
			}(datasetID, tables)
		}

		// Wait for every worker of this page before moving on, so the next
		// page's loop never runs alongside goroutines from this one.
		wg.Wait()

		// A non-nil error stops the dataset pagination and is returned by
		// Pages below.
		return firstErr
	})
	if err != nil {
		return tableMap, err
	}

	return tableMap, nil
}

Upvotes: 2

Related Questions