Reputation: 30031
I've got a collection of orders.
[a, b]
[a, b, c]
[a, b, c, d]
[a, b, c, d]
[b, c]
[c, d]
Where a, b, c and d are SKUs, and there are big boxes full of them. And there are thousands of orders and hundreds of possible SKUs.
Now imagine that when packing these orders, if an order lacks items from the previous order, you must put the box for that SKU away (and similarly take one out that you don't have).
How do you sort this so there are a minimum number of box changes? Or, in more programmy terms: how do you minimize the cumulative Hamming distance / maximize the intersection between adjacent items in a collection?
I really have no clue where to start. Is there already some algorithm for this? Is there a decent approximation?
Upvotes: 10
Views: 430
Reputation: 47020
I like this problem so much I couldn't resist coding up the algorithm suggested above. The code is a little long, so I'm putting it in a separate response.
It comes up with this sequence on the example.
Step 1: c d
Step 2: b c
Step 3: a b c
Step 4: a b c d
Step 5: a b c d
Step 6: a b
Note this algorithm ignores initial setup and final teardown costs. It only considers inter-setup distances. Here the Hamming distances are 2 + 1 + 1 + 0 + 2 = 6. This is the same total distance as the order given in the question.
#include <stdio.h>
#include <stdlib.h>
// With these data types we can have up to 64k items and 64k sets of items,
// But then the table of pairs is about 20Gb!
typedef unsigned short ITEM, INDEX;
// A sku set in the problem.
struct set {
INDEX n_elts; // Number of elements in elts.
ITEM *elts; // Element ids, sorted ascending (hamming_distance relies on this).
};
// A pair of sku sets and associated info.
// NOTE: the rank/parent fields double as the union-find overlay in
// make_plan(): after sorting, entries 0..n_sets-1 of the pairs array
// serve as the per-set disjoint-set nodes (requires n_pairs >= n_sets).
struct pair {
INDEX i, j; // Indices of sets.
ITEM dist; // Hamming distance between sets.
INDEX rank, parent; // Disjoint set union/find fields.
};
// For a given set, the adjacent ones along the path under construction.
struct adjacent {
unsigned char n; // Degree along the path so far: 0, 1, or 2.
INDEX elts[2]; // Indices of n adjacent sets.
};
// Some tracing functions for fun.
void print_pair(struct pair *pairs, int i)
{
struct pair *p = pairs + i;
printf("%d:(%d,%d@%d)[%d->%d]\n", i, p->i, p->j, p->dist, p->rank, p->parent);
}
// Trace helper: show set i's neighbors along the path under construction.
// Fix: the degree-0 case previously printed no trailing newline, unlike
// the other cases, which garbled interleaved trace output.
void print_adjacent(struct adjacent *adjs, int i)
{
    struct adjacent *a = adjs + i;
    switch (a->n) {
    case 0:
        printf("%d:o\n", i); // isolated node: no neighbors yet
        break;
    case 1:
        printf("%d:o->%d\n", i, a->elts[0]); // path endpoint
        break;
    default:
        printf("%d:%d<-o->%d\n", i, a->elts[0], a->elts[1]); // interior node
        break;
    }
}
// Compute the Hamming distance between two sets. Assumes elements are sorted.
// Works a bit like merging.
// Compute the Hamming distance between two sets, i.e. the number of
// elements present in exactly one of them.  Both element arrays must be
// sorted ascending; we advance through them in lockstep, merge-style.
ITEM hamming_distance(struct set *a, struct set *b)
{
    ITEM diff = 0;
    int pa = 0, pb = 0;
    while (pa < a->n_elts && pb < b->n_elts) {
        ITEM ea = a->elts[pa], eb = b->elts[pb];
        if (ea == eb) {
            // Shared element: contributes nothing to the distance.
            pa++;
            pb++;
        } else {
            // The smaller element is unique to its set.
            diff++;
            if (ea < eb)
                pa++;
            else
                pb++;
        }
    }
    // Whatever remains in either set is unmatched.
    return diff + (a->n_elts - pa) + (b->n_elts - pb);
}
// Classic disjoint set find operation.
// Classic disjoint set find with full path compression, done iteratively:
// first locate the root, then repoint every node on the path at it.
// End state is identical to the recursive formulation.
INDEX find(struct pair *pairs, INDEX x)
{
    INDEX root = x;
    while (pairs[root].parent != root)
        root = pairs[root].parent;
    while (pairs[x].parent != root) {
        INDEX next = pairs[x].parent;
        pairs[x].parent = root;
        x = next;
    }
    return root;
}
// Classic disjoint set union. Assumes x and y are canonical.
// Classic disjoint set union by rank. Assumes x and y are canonical roots.
void do_union(struct pair *pairs, INDEX x, INDEX y)
{
    if (x == y)
        return;
    // Normalize so x is the root with the greater (or equal) rank.
    if (pairs[x].rank < pairs[y].rank) {
        INDEX tmp = x;
        x = y;
        y = tmp;
    }
    // Attach the lower-ranked tree beneath the higher-ranked root;
    // ranks only grow when two equal-rank trees merge.
    pairs[y].parent = x;
    if (pairs[x].rank == pairs[y].rank)
        pairs[x].rank++;
}
// Sort predicate to sort pairs by Hamming distance.
int by_dist(const void *va, const void *vb)
{
const struct pair *a = va, *b = vb;
return a->dist < b->dist ? -1 : a->dist > b->dist ? +1 : 0;
}
// Return a plan with greedily found least Hamming distance sum.
// Just an array of indices into the given table of sets.
// TODO: Deal with calloc/malloc failure!
// Return a plan with greedily found least Hamming distance sum:
// an array of n_sets indices into the given table of sets, which the
// caller must free.  Returns NULL if n_sets is zero or an allocation
// fails (resolving the old TODO about calloc/malloc failure).
INDEX *make_plan(struct set *sets, INDEX n_sets)
{
    if (n_sets == 0)
        return NULL;
    // With fewer than three sets every ordering has the same cost, and
    // the union-find overlay below needs n_pairs >= n_sets (true only
    // for n_sets >= 3), so handle the degenerate sizes up front.
    if (n_sets <= 2) {
        INDEX *tiny = malloc(n_sets * sizeof(INDEX));
        if (!tiny)
            return NULL;
        for (INDEX i = 0; i < n_sets; i++)
            tiny[i] = i;
        return tiny;
    }
    // Allocate enough space for all the pairs taking care for overflow.
    // This grows as the square of n_sets!
    size_t n_pairs = (n_sets & 1) ? n_sets / 2 * n_sets : n_sets / 2 * (n_sets - 1);
    struct pair *pairs = calloc(n_pairs, sizeof(struct pair));
    if (!pairs)
        return NULL;
    // Initialize one pair per unordered set pair {i, j}, i < j.
    size_t ip = 0;
    for (int j = 1; j < n_sets; j++) {
        for (int i = 0; i < j; i++) {
            struct pair *p = pairs + ip++;
            p->i = i;
            p->j = j;
            p->dist = hamming_distance(sets + i, sets + j);
        }
    }
    // Sort by Hamming distance.
    qsort(pairs, n_pairs, sizeof pairs[0], by_dist);
    // Initialize the disjoint sets.  Only entries 0..n_sets-1 are ever
    // used as union-find nodes (they are indexed by set number, and all
    // parent links stay within that range), which is why n_pairs must
    // be at least n_sets.
    for (size_t k = 0; k < n_pairs; k++) {
        pairs[k].rank = 0;
        pairs[k].parent = (INDEX)k;
    }
    // Greedily add cheapest edges, skipping any that would create a
    // vertex of degree three or a cycle, until the chosen edges form a
    // Hamiltonian path over the sets.
    struct adjacent *adjs = calloc(n_sets, sizeof(struct adjacent));
    if (!adjs) {
        free(pairs);
        return NULL;
    }
    size_t n_edges = 0;
    for (size_t k = 0; k < n_pairs; k++) {
        struct pair *p = pairs + k;
        struct adjacent *ai = adjs + p->i, *aj = adjs + p->j;
        // Continue if we'd get a vertex with degree 3 by adding this edge.
        if (ai->n == 2 || aj->n == 2)
            continue;
        // Find (possibly) disjoint sets of pair's elements.
        INDEX i_set = find(pairs, p->i);
        INDEX j_set = find(pairs, p->j);
        // Continue if we'd form a cycle by adding this edge.
        if (i_set == j_set)
            continue;
        // Otherwise add this edge.
        do_union(pairs, i_set, j_set);
        ai->elts[ai->n++] = p->j;
        aj->elts[aj->n++] = p->i;
        // Done after we've added enough edges to touch all sets in a path.
        if (++n_edges == (size_t)(n_sets - 1))
            break;
    }
    // Find a set with only one adjacency: an endpoint of the path.
    // (Guaranteed to exist since n_sets >= 3 and the graph is complete.)
    int p = -1;
    for (int i = 0; i < n_sets; ++i) {
        if (adjs[i].n == 1) {
            p = i;
            break;
        }
    }
    // A plan is an ordering of sets.
    INDEX *plan = malloc(n_sets * sizeof(INDEX));
    if (!plan) {
        free(pairs);
        free(adjs);
        return NULL;
    }
    // Walk the path from the endpoint; at each node step to the neighbor
    // we did not just come from (tracked explicitly in prev, avoiding
    // the subtle plan[i-1] access of the original).
    int prev = -1;
    for (int i = 0; i < n_sets; i++) {
        plan[i] = (INDEX)p;
        struct adjacent *a = adjs + p;
        int next = a->elts[a->n > 1 && a->elts[1] != prev];
        prev = p;
        p = next;
    }
    // Done with intermediate data structures.
    free(pairs);
    free(adjs);
    return plan;
}
// A tiny test case. Much more testing needed!
#define ARRAY_SIZE(A) (sizeof A / sizeof A[0])
// Expands to a struct set initializer { element count, element array }.
#define SET(Elts) { ARRAY_SIZE(Elts), Elts }
// Items must be in ascending order for Hamming distance calculation.
ITEM a1[] = { 'a', 'b' };
ITEM a2[] = { 'a', 'b', 'c' };
ITEM a3[] = { 'a', 'b', 'c', 'd' };
ITEM a4[] = { 'a', 'b', 'c', 'd' };
ITEM a5[] = { 'b', 'c' };
ITEM a6[] = { 'c', 'd' };
// Out of order to see how we do.
struct set sets[] = { SET(a3), SET(a6), SET(a1), SET(a4), SET(a5), SET(a2) };
// Build a plan for the sample sets and print one packing step per line.
int main(void)
{
    int n_sets = ARRAY_SIZE(sets);
    INDEX *plan = make_plan(sets, n_sets);
    for (int step = 0; step < n_sets; step++) {
        struct set *s = &sets[plan[step]];
        printf("Step %d: ", step + 1);
        for (int k = 0; k < s->n_elts; k++)
            printf("%c ", (char)s->elts[k]);
        printf("\n");
    }
    return 0;
}
Upvotes: 1
Reputation: 47020
Indeed @irrelephant is correct. This is an undirected Hamiltonian path problem. Model it as a complete undirected graph where the nodes are sku sets and the weight of each edge is the Hamming distance between the respective sets. Then finding a packing order is equivalent to finding a path that touches each node exactly once. This is a Hamiltonian path (HP). You want the minimum weight HP.
The bad news is that finding a min weight HP is NP-hard (its decision version is NP-complete), which means an optimal solution is believed to need exponential time in general.
The good news is that there are reasonable approximation algorithms. The obvious greedy algorithm gives an answer no worse than two times the optimal HP. It is:
create the graph of Hamming distances
sort the edges by weight in increasing order: e0, e1, ...
set C = emptyset
for e in sequence e0, e1, ...
if C union {e} does not cause a cycle nor a vertex with degree more than 2 in C
set C = C union {e}
return C
Note that the test in the `if` statement can be implemented in nearly constant time with the classical disjoint-set union-find algorithm plus incident-edge counters in the vertices.
So the run time here can be O(n^2 log n) for n sku sets assuming that computing a Hamming distance is constant time.
If graphs are not in your vocabulary, think of a triangular table with one entry for each pair of sku sets. The entries in the table are Hamming distances. You want to sort the table entries and then add sku set pairs in sorted order one by one to your plan, skipping pairs that would cause a "fork" or a "loop." A fork would be a set of pairs like (a,b), (b,c), (b,d). A loop would be (a,b), (b,c), (c, a).
There are more complex polynomial time algorithms that get to a 3/2 approximation.
Upvotes: 5