Can anyone tell me how to use GEOS/C++ to efficiently find all point pairs in a dataset closer than a threshold distance d?
I suspect this might involve quadtrees or rtrees but not quite sure how to query them.
Answer
Ok, got this working - and feel like putting the long answer here, as it has a lot of useful GEOS example bits in it. Here we go.
Warnings
- I haven't compiled this - I stripped out a load of project specific stuff and replaced it with a simple Point class which probably needs a copy/assignment operator. But it worked before I did that.
- I'm not sure this counts as clustering, as a long line of points each pair of which is closer than
tolerance
would count as a cluster (fine for my purposes). Complexity isO(n log n)
as long as your clusters don't get too big, I think.tolerance
is defined as a square around each point, not a circle - It uses the C interface to GEOS which is meant to be stable, hence use of boost::pool to manage memory of things I'm leaving GEOS with pointers to
- for some reason stackoverflow won't format this correctly, sorry!
void my_geos_message_handler(const char *fmt, ...)
{
#ifdef DEBUG
va_list args;
va_start( args, fmt );
vprintf( fmt, args );
va_end( args );
printf( "\n" );
#endif
}
struct Point
{
double x,y;
Point(double x,double y) : x(x),y(y) {}
};typedef vector
Cluster;
typedef vectorClusterList;
class ClusterFinder
{
public:
ClusterFinder()
{
initGEOS(&my_geos_message_handler,&my_geos_message_handler);
tree = GEOSSTRtree_create(10);
}
~ClusterFinder()
{
GEOSSTRtree_destroy(tree);
finishGEOS();
}
void add(const Point &p)
{
ClusterFinderNode *n = new (node_pool.malloc()) ClusterFinderNode(p);
const double x = p.x;
const double y = p.y;
GEOSCoordSequence* coords = GEOSCoordSeq_create(1,2);
GEOSCoordSeq_setX(coords,0,x);
GEOSCoordSeq_setY(coords,0,y);
GEOSGeometry * point = GEOSGeom_createPoint(coords); //point assumes ownership of coords
GEOSSTRtree_insert(tree,point,(void*)n); //tree assumes ownership of point
}
ClusterList get_clusters(double tolerance)
{
ClusterFinderData data(tolerance,tree);
GEOSSTRtree_iterate(tree,&try_to_find_cluster_starting_from_node,&data);
return data.clusters;
}
private:
struct ClusterFinderNode
{
Point point;
bool visited;
ClusterFinderNode(const Point &p)
:point(p),visited(false) {}
};
GEOSSTRtree* tree;
boost::object_poolnode_pool;
struct ClusterFinderData
{
ClusterList clusters;
vectorsearchqueue;
double tolerance;
GEOSSTRtree *tree;
ClusterFinderData(double t,GEOSSTRtree* tree):tolerance(t),tree(tree)
{
clusters.reserve(100);
searchqueue.reserve(100);
}
};
static void add_node_to_queue(void* vp_node,void* vp_clusterfinderdata)
{
ClusterFinderNode * const node = (ClusterFinderNode*) vp_node;
ClusterFinderData * const cfd = (ClusterFinderData*) vp_clusterfinderdata;
if (!node->visited)
{
cfd->searchqueue.push_back(node);
}
}
static void check_node_for_neighbours(ClusterFinderNode *node,ClusterFinderData* cfd)
{
//mark visited and add to end of list
node->visited = true;
cfd->clusters.back().push_back(node->point);
//add all neighbours within tolerance to search queue
const double x = node->point.x;
const double y = node->point.y;
const double buffer = cfd->tolerance;
GEOSCoordSequence* buffer_coords = GEOSCoordSeq_create(2,2);
GEOSCoordSeq_setX(buffer_coords,0,x-buffer);
GEOSCoordSeq_setY(buffer_coords,0,y-buffer);
GEOSCoordSeq_setX(buffer_coords,1,x+buffer);
GEOSCoordSeq_setY(buffer_coords,1,y+buffer);
GEOSGeometry *line = GEOSGeom_createLineString(buffer_coords); //line takes ownership of buffer_coords
GEOSGeometry *envelope = GEOSEnvelope(line);
GEOSSTRtree_query(cfd->tree,envelope,&add_node_to_queue,(void*)cfd);
GEOSGeom_destroy(line);
GEOSGeom_destroy(envelope);
}
static void try_to_find_cluster_starting_from_node(void* vp_node,void* vp_clusterfinderdata)
{
ClusterFinderNode * const initial_node = (ClusterFinderNode*) vp_node;
ClusterFinderData * const cfd = (ClusterFinderData*) vp_clusterfinderdata;
if (initial_node->visited)
return; //node was already discovered when starting from another node
//push back new empty cluster vector
cfd->clusters.push_back(Cluster());
//initialize exploration queue
assert(cfd->searchqueue.size()==0);
cfd->searchqueue.push_back(initial_node);
//explore node to fill cluster vector
while (cfd->searchqueue.size()>0)
{
ClusterFinderNode* node_to_search_next = cfd->searchqueue.back();
cfd->searchqueue.pop_back();
check_node_for_neighbours(node_to_search_next,cfd);
}
//all found nodes will now be on end of cfd->clusters
//remove cluster vector if no neighbours found
Cluster &latest_cluster = cfd->clusters.back();
if (latest_cluster.size() <= 1)
{
//should be a vector containing only the starting point
//should never be size 0
assert(latest_cluster.size()==1 && latest_cluster[0] == initial_node->point);
cfd->clusters.pop_back();
}
}
};
int main()
{
ClusterFinder cf;
cf.add(Point(1,2));
cf.add(Point(3,4));
cf.add(Point(3.05,4));
//etc
ClusterList clusters = cf.get_clusters(0.1);
return 0;
}
No comments:
Post a Comment