Monday 23 September 2019

How to use GEOS/C++ to efficiently find all point pairs closer than a threshold?


Can anyone tell me how to use GEOS/C++ to efficiently find all point pairs in a dataset closer than a threshold distance d?


I suspect this might involve quadtrees or rtrees but not quite sure how to query them.




Answer



Ok, got this working - and feel like putting the long answer here, as it has a lot of useful GEOS example bits in it. Here we go.


Warnings



  • I haven't compiled this - I stripped out a load of project specific stuff and replaced it with a simple Point class which probably needs a copy/assignment operator. But it worked before I did that.

  • I'm not sure this counts as clustering, as a long line of points each pair of which is closer than tolerance would count as a cluster (fine for my purposes). Complexity is O(n log n) as long as your clusters don't get too big, I think. tolerance is defined as a square around each point, not a circle

  • It uses the C interface to GEOS which is meant to be stable, hence use of boost::pool to manage memory of things I'm leaving GEOS with pointers to

  • for some reason stackoverflow won't format this correctly, sorry!




void my_geos_message_handler(const char *fmt, ...)
{
#ifdef DEBUG
va_list args;
va_start( args, fmt );
vprintf( fmt, args );
va_end( args );
printf( "\n" );
#endif
}

struct Point
{
double x,y;
Point(double x,double y) : x(x),y(y) {}
};

typedef vector Cluster;
typedef vector ClusterList;

class ClusterFinder

{
public:
ClusterFinder()
{
initGEOS(&my_geos_message_handler,&my_geos_message_handler);
tree = GEOSSTRtree_create(10);
}
~ClusterFinder()
{
GEOSSTRtree_destroy(tree);

finishGEOS();
}

void add(const Point &p)
{
ClusterFinderNode *n = new (node_pool.malloc()) ClusterFinderNode(p);

const double x = p.x;
const double y = p.y;


GEOSCoordSequence* coords = GEOSCoordSeq_create(1,2);
GEOSCoordSeq_setX(coords,0,x);
GEOSCoordSeq_setY(coords,0,y);
GEOSGeometry * point = GEOSGeom_createPoint(coords); //point assumes ownership of coords
GEOSSTRtree_insert(tree,point,(void*)n); //tree assumes ownership of point
}

ClusterList get_clusters(double tolerance)
{
ClusterFinderData data(tolerance,tree);

GEOSSTRtree_iterate(tree,&try_to_find_cluster_starting_from_node,&data);
return data.clusters;
}


private:
struct ClusterFinderNode
{
Point point;
bool visited;

ClusterFinderNode(const Point &p)
:point(p),visited(false) {}
};

GEOSSTRtree* tree;
boost::object_pool node_pool;

struct ClusterFinderData
{
ClusterList clusters;

vector searchqueue;
double tolerance;
GEOSSTRtree *tree;
ClusterFinderData(double t,GEOSSTRtree* tree):tolerance(t),tree(tree)
{
clusters.reserve(100);
searchqueue.reserve(100);
}
};


static void add_node_to_queue(void* vp_node,void* vp_clusterfinderdata)
{
ClusterFinderNode * const node = (ClusterFinderNode*) vp_node;
ClusterFinderData * const cfd = (ClusterFinderData*) vp_clusterfinderdata;
if (!node->visited)
{
cfd->searchqueue.push_back(node);
}
}


static void check_node_for_neighbours(ClusterFinderNode *node,ClusterFinderData* cfd)
{
//mark visited and add to end of list
node->visited = true;
cfd->clusters.back().push_back(node->point);

//add all neighbours within tolerance to search queue
const double x = node->point.x;
const double y = node->point.y;
const double buffer = cfd->tolerance;


GEOSCoordSequence* buffer_coords = GEOSCoordSeq_create(2,2);
GEOSCoordSeq_setX(buffer_coords,0,x-buffer);
GEOSCoordSeq_setY(buffer_coords,0,y-buffer);
GEOSCoordSeq_setX(buffer_coords,1,x+buffer);
GEOSCoordSeq_setY(buffer_coords,1,y+buffer);

GEOSGeometry *line = GEOSGeom_createLineString(buffer_coords); //line takes ownership of buffer_coords
GEOSGeometry *envelope = GEOSEnvelope(line);


GEOSSTRtree_query(cfd->tree,envelope,&add_node_to_queue,(void*)cfd);

GEOSGeom_destroy(line);
GEOSGeom_destroy(envelope);
}

static void try_to_find_cluster_starting_from_node(void* vp_node,void* vp_clusterfinderdata)
{
ClusterFinderNode * const initial_node = (ClusterFinderNode*) vp_node;
ClusterFinderData * const cfd = (ClusterFinderData*) vp_clusterfinderdata;


if (initial_node->visited)
return; //node was already discovered when starting from another node

//push back new empty cluster vector
cfd->clusters.push_back(Cluster());

//initialize exploration queue
assert(cfd->searchqueue.size()==0);
cfd->searchqueue.push_back(initial_node);


//explore node to fill cluster vector
while (cfd->searchqueue.size()>0)
{
ClusterFinderNode* node_to_search_next = cfd->searchqueue.back();
cfd->searchqueue.pop_back();
check_node_for_neighbours(node_to_search_next,cfd);
}

//all found nodes will now be on end of cfd->clusters

//remove cluster vector if no neighbours found
Cluster &latest_cluster = cfd->clusters.back();
if (latest_cluster.size() <= 1)
{
//should be a vector containing only the starting point
//should never be size 0
assert(latest_cluster.size()==1 && latest_cluster[0] == initial_node->point);
cfd->clusters.pop_back();
}
}

};

int main()
{
ClusterFinder cf;

cf.add(Point(1,2));
cf.add(Point(3,4));
cf.add(Point(3.05,4));
//etc


ClusterList clusters = cf.get_clusters(0.1);
return 0;
}

No comments:

Post a Comment

arcpy - Changing output name when exporting data driven pages to JPG?

Is there a way to save the output JPG, changing the output file name to the page name, instead of page number? I mean changing the script fo...