Reputation: 10303
I have a database which contains a large number of records (100's of thousands in the n_building/n_residence tables and up to many millions in the buildinggeo table). This is a simplified version of the database:
This is what it looks like when exported to SQL:
CREATE TABLE IF NOT EXISTS `district` (
`districtid` INT(20) NOT NULL COMMENT 'cbs_wijk_cd',
`description` VARCHAR(255) NOT NULL COMMENT 'cbs_wijk_oms',
`municipalityid` INT(20) NOT NULL COMMENT 'FK gemeente',
PRIMARY KEY (`districtid`),
INDEX `wijk_gemeente_fk` (`municipalityid` ASC),
CONSTRAINT `fk_district_municipality`
FOREIGN KEY (`municipalityid`)
REFERENCES `municipality` (`municipalityid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `neighborhood` (
`neighborhoodid` INT(20) NOT NULL COMMENT 'cbs_buurt_cd',
`description` VARCHAR(255) NOT NULL COMMENT 'cbs_buurt_oms',
`districtid` INT(20) NOT NULL COMMENT 'FK wijk',
`municipalityid` INT(20) NOT NULL COMMENT 'FK gemeente',
PRIMARY KEY (`neighborhoodid`),
INDEX `buurt_gemeente_fk` (`municipalityid` ASC),
INDEX `buurt_wijk_fk` (`districtid` ASC),
FULLTEXT INDEX `index_neighborhood_description` (`description` ASC),
CONSTRAINT `fk_neighborhood_municipality`
FOREIGN KEY (`municipalityid`)
REFERENCES `municipality` (`municipalityid`)
ON DELETE CASCADE
ON UPDATE CASCADE,
CONSTRAINT `fk_neighborhood_district`
FOREIGN KEY (`districtid`)
REFERENCES `district` (`districtid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `n_building` (
`buildingid` BIGINT(20) NOT NULL,
`neighborhoodid` INT(10) NOT NULL,
`constructionyear` INT(4) NOT NULL,
`height` INT(3) NOT NULL DEFAULT 9,
`multifamily` TINYINT(1) NOT NULL DEFAULT 0,
PRIMARY KEY (`buildingid`),
INDEX `fk_building_buurt_idx` (`neighborhoodid` ASC),
INDEX `index_neighborhoodid_buildingid` (`neighborhoodid` ASC, `buildingid` ASC),
CONSTRAINT `fk_building_neighborhood`
FOREIGN KEY (`neighborhoodid`)
REFERENCES `neighborhood` (`neighborhoodid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `n_buildinggeo` (
`buildingid` BIGINT(20) NOT NULL,
`order` INT(5) NOT NULL,
`lat` DECIMAL(11,8) NOT NULL,
`lon` DECIMAL(11,8) NOT NULL,
PRIMARY KEY (`buildingid`, `order`),
CONSTRAINT `fk_buildinggeo_building`
FOREIGN KEY (`buildingid`)
REFERENCES `n_building` (`buildingid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `n_residence` (
`residenceid` BIGINT(20) NOT NULL,
`buildingid` BIGINT(20) NOT NULL,
`geolat` DECIMAL(11,8) NOT NULL,
`geolon` DECIMAL(11,8) NOT NULL,
PRIMARY KEY (`residenceid`),
INDEX `fk_residence_building_idx` (`buildingid` ASC),
INDEX `index_geoloat_geolon_residenceid` (`geolat` ASC, `geolon` ASC, `residenceid` ASC),
INDEX `index_geolat` (`geolat` ASC),
INDEX `index_geolon` (`geolon` ASC),
CONSTRAINT `fk_residence_building`
FOREIGN KEY (`buildingid`)
REFERENCES `n_building` (`buildingid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `n_vabidata` (
`residenceid` BIGINT(20) NOT NULL,
`index` FLOAT NULL COMMENT ' ',
`indexdate` VARCHAR(25) NULL,
`type` VARCHAR(100) NULL,
`subtype` VARCHAR(150) NULL,
`rooftype` VARCHAR(50) NULL,
PRIMARY KEY (`residenceid`),
CONSTRAINT `fk_vabidata_residence`
FOREIGN KEY (`residenceid`)
REFERENCES `n_residence` (`residenceid`)
ON DELETE CASCADE
ON UPDATE CASCADE)
ENGINE = InnoDB;
My goal is to create a JSON representation of a certain portion of the contents of this database like so (this is obviously anonymised data):
[
{
"buildingid": "632100000000000",
"buurtid": "6320103",
"constructionyear": "1969",
"height": "9",
"multifamily": "0",
"gemeenteid": "632",
"geo": [
{
"lat": "52.000",
"lon": "4.000"
},
{
"lat": "52.000",
"lon": "4.000"
},
{
"lat": "52.000",
"lon": "4.000"
},
{
"lat": "52.000",
"lon": "4.000"
},
{
"lat": "52.000",
"lon": "4.000"
}
],
"res": [
{
"residenceid": "632010000000000",
"surface": "159",
"postalcode": "3400AA",
"streetname": "Streetname",
"housenumber": "00",
"clusternr": "6320103533",
"owner": "onbekend",
"usageelec": "2463",
"usagegas": "2006",
"nomupd": "0",
"cpwin": "0",
"cpble": "0",
"enet": "0",
"gnet": "0",
"type": null
}
]
}
]
There are two ways to filter the database: By neighborhoodid (all buildings etc. from that neighborhood) or by bounding box (all buildings etc. that are within). At first I decided to this in a very simple manner:
$path2 = Config::Path(2);//minlat
$path3 = Config::Path(3);//minlon
$path4 = Config::Path(4);//maxlat
$path5 = Config::Path(5);//maxlon
if (($path2 && is_numeric($path2) && $path3 && is_numeric($path3) &&
$path4 && is_numeric($path4) && $path5 && is_numeric($path5)) ||
($path2 == "district" && $path3 && is_numeric($path3))) {
if ($path2 == "neighborhood") {
$buildings = DBUtils::FetchQuery("
SELECT b.`buildingid`, b.`neighborhoodid` AS buurtid, b.`constructionyear`,
b.`height`, b.`multifamily`, n.`municipalityid` AS gemeenteid
FROM `neighborhood` n
INNER JOIN `n_building` b ON b.`neighborhoodid` = n.`neighborhoodid`
INNER JOIN `n_residence` r ON r.`buildingid` = b.`buildingid`
WHERE b.`neighborhoodid` = '$path3'
GROUP BY b.`buildingid`;
");
} else {
$buildings = DBUtils::FetchQuery("
SELECT b.`buildingid`, b.`neighborhoodid` AS buurtid, b.`constructionyear`,
b.`height`, b.`multifamily`, n.`municipalityid` AS gemeenteid
FROM `neighborhood` n
INNER JOIN `n_building` b ON b.`neighborhoodid` = n.`neighborhoodid`
INNER JOIN `n_residence` r ON r.`buildingid` = b.`buildingid`
WHERE r.`geolat` >= '$path2'
AND r.`geolon` >= '$path3'
AND r.`geolat` <= '$path4'
AND r.`geolon` <= '$path5'
GROUP BY b.`buildingid`;
");
}
if ($buildings && count($buildings) > 0) {
for ($i = 0; $i < count($buildings); $i++) {
$building = $buildings[$i];
$buildinggeo = DBUtils::FetchQuery("
SELECT bg.`lat`, bg.`lon`
FROM `n_buildinggeo` bg
WHERE bg.`buildingid` = '$building[buildingid]';
");
if ($buildinggeo && count($buildinggeo) > 0) {
$buildings[$i]['geo'] = $buildinggeo;
$buildingresidences = DBUtils::FetchQuery("
SELECT r.`residenceid`, r.`surface`, r.`postalcode`, r.`streetname`,
r.`housenumber`, r.`clusternr`, r.`owner`, r.`usageelec`,
r.`usagegas`, r.`nomupd`, r.`cpwin`, r.`cpble`, r.`enet`,
r.`gnet`, v.`type`
FROM `n_residence` r
LEFT OUTER JOIN `n_vabidata` v ON r.`residenceid` = v.`residenceid`
WHERE r.`buildingid` = '$building[buildingid]';
");
if ($buildingresidences && count($buildingresidences) > 0) {
$buildings[$i]['res'] = $buildingresidences;
}
}
}
echo json_encode($buildings);
}
}
Later I decided to get all building/residence/vabidata information in a single query and create the required JSON structure from that because most time in each request (> 5 buildings) was spent on getting the residence data.
$path2 = Config::Path(2);//minlat
$path3 = Config::Path(3);//minlon
$path4 = Config::Path(4);//maxlat
$path5 = Config::Path(5);//maxlon
if (($path2 && is_numeric($path2) && $path3 && is_numeric($path3) &&
$path4 && is_numeric($path4) && $path5 && is_numeric($path5)) ||
($path2 == "district" && $path3 && is_numeric($path3))) {
if ($path2 == "district") {
$results = DBUtils::FetchQuery("
SELECT b.`buildingid`, b.`neighborhoodid`, b.`constructionyear`,
b.`height`, b.`multifamily`, n.`municipalityid`, r.`residenceid`,
r.`surface`, r.`postalcode`, r.`streetname`, r.`housenumber`,
r.`clusternr`, r.`owner`, r.`usageelec`, r.`usagegas`,
r.`nomupd`, r.`cpwin`, r.`cpble`, r.`enet`, r.`gnet`,
v.`type`
FROM `neighborhood` n
INNER JOIN `n_building` b ON b.`neighborhoodid` = n.`neighborhoodid`
INNER JOIN `n_residence` r ON r.`buildingid` = b.`buildingid`
LEFT OUTER JOIN `n_vabidata` v ON r.`residenceid` = v.`residenceid`
WHERE b.`neighborhoodid` = '$path3';
");
} else {
$results = DBUtils::FetchQuery("
SELECT b.`buildingid`, b.`neighborhoodid`, b.`constructionyear`,
b.`height`, b.`multifamily`, n.`municipalityid`, r.`residenceid`,
r.`surface`, r.`postalcode`, r.`streetname`, r.`housenumber`,
r.`clusternr`, r.`owner`, r.`usageelec`, r.`usagegas`,
r.`nomupd`, r.`cpwin`, r.`cpble`, r.`enet`, r.`gnet`,
v.`type`
FROM `neighborhood` n
INNER JOIN `n_building` b ON b.`neighborhoodid` = n.`neighborhoodid`
INNER JOIN `n_residence` r ON r.`buildingid` = b.`buildingid`
LEFT OUTER JOIN `n_vabidata` v ON r.`residenceid` = v.`residenceid`
WHERE r.`geolat` >= '$path2'
AND r.`geolon` >= '$path3'
AND r.`geolat` <= '$path4'
AND r.`geolon` <= '$path5';
");
}
if ($results && count($results) > 0) {
$buildings = array();
for ($i = 0; $i < count($results); $i++) {
$b = $results[$i];
if (!array_key_exists($b['buildingid'],$buildings)) {
$buildings[$b['buildingid']] = array(
"buildingid" => $b['buildingid'],
"buurtid" => $b['neighborhoodid'],
"constructionyear" => $b['constructionyear'],
"height" => $b['height'],
"multifamily" => $b['multifamily'],
"gemeenteid" => $b['municipalityid'],
"res" => array()
);
}
$buildings[$b['buildingid']]['res'][] = array(
"residenceid" => $b['residenceid'],
"surface" => $b['surface'],
"postalcode" => $b['postalcode'],
"streetname" => $b['streetname'],
"housenumber" => $b['housenumber'],
"clusternr" => $b['clusternr'],
"owner" => $b['owner'],
"usageelec" => $b['usageelec'],
"usagegas" => $b['usagegas'],
"nomupd" => $b['nomupd'],
"cpwin" => $b['cpwin'],
"cpble" => $b['cpble'],
"enet" => $b['enet'],
"gnet" => $b['gnet'],
"type" => $b['type']
);
}
$buildings = array_values($buildings);
for ($i = 0; $i < count($buildings); $i++) {
$building = $buildings[$i];
$buildinggeo = DBUtils::FetchQuery("
SELECT bg.`lat`, bg.`lon`
FROM `n_buildinggeo` bg
WHERE bg.`buildingid` = '$building[buildingid]';
");
if ($buildinggeo && count($buildinggeo) > 0) {
$buildings[$i]['geo'] = $buildinggeo;
}
}
echo json_encode($buildings);
}
}
However this method seemed to be even slower than the previous by 30-70%. My question is: can any of you find the reason (and possibly the solution) why either/both queries are performing as poorly as they are? If you have any questions or need any additional information, please ask.
EDIT
This is the result from an EXPLAIN on the bounding box version of the second query (all-in-one):
Upvotes: 3
Views: 124
Reputation: 142208
WHERE b.`neighborhoodid` = '$path3'
GROUP BY b.`buildingid`
b
needs INDEX(neighborhoodid, buildingid)
-- in that order
r
needs INDEX(buildingid)
r
needs INDEX(lat, lon, residenceid)
-- in that order
(Please provide SHOW CREATE TABLE
so we can see what indexes you do have.)
WHERE r.`geolat` >= '$path2'
AND r.`geolon` >= '$path3'
AND r.`geolat` <= '$path4'
AND r.`geolon` <= '$path5'
cannot be optimized as it stands. See http://mysql.rjweb.org/doc.php/latlng for how to deal with large searches by lat/lon.
Upvotes: 2