Monday, 5 February 2018

Find nearest line segment to each point in R


I've seen similar questions, but not with R. I have a sf object with approx 500000 thousand points. I also have an sf object of lines. Each line has a unique segment number called 'segnum'. For each pixel, I want to know which line (identified by segment number) is the closest. The point and line files are currently in geographic coordinates (crs = 4326), but I can reproject them if need be.


Based on this question (How to calculate distance from POINT to LINESTRING in R using `sf` library and get all POINT features from LINESTRING where distances were calculated?) I was thinking of using dist2line from geosphere.


For example: dist = geosphere::dist2Line(p=st_coordinates(pixels), line = st_coordinates(lines)[,1:2])


But that just gives you the distance from the point to the line. It doesn't tell you which line segment is closest. Any ideas of how to do this?



Here is a small sample of my data:


pixels = structure(list(id = c("1", "1", "1", "1", "1"), system.index = 0:4, 
b2_allPix = c(7009L, 3932L, 7329L, 5734L, 6525L), b2_cloudfilt = c(0L,
3932L, 0L, 0L, 0L), system.time_start = c(951350400000, 951350400000,
951350400000, 951350400000, 951350400000), .geo = c(NA, NA,
NA, NA, NA), geometry = structure(list(`1` = structure(c(-137.849125851618,
66.7968749939405), class = c("XY", "POINT", "sfg")), `2` = structure(c(-137.838550341857,
66.7968749939405), class = c("XY", "POINT", "sfg")), `3` = structure(c(-137.879732491708,
66.7947916606074), class = c("XY", "POINT", "sfg")), `4` = structure(c(-137.862752400773,
66.7927083272742), class = c("XY", "POINT", "sfg")), `5` = structure(c(-137.881661018612,

66.7885416606079), class = c("XY", "POINT", "sfg"))), .Names = c("1",
"2", "3", "4", "5"), class = c("sfc_POINT", "sfc"), precision = 0, bbox = structure(c(-137.881661018612,
66.7885416606079, -137.838550341857, 66.7968749939405), .Names = c("xmin",
"ymin", "xmax", "ymax"), class = "bbox"), crs = structure(list(
epsg = 4326L, proj4string = "+proj=longlat +datum=WGS84 +no_defs"), .Names = c("epsg",
"proj4string"), class = "crs"), n_empty = 0L)), .Names = c("id",
"system.index", "b2_allPix", "b2_cloudfilt", "system.time_start",
".geo", "geometry"), row.names = c(NA, 5L), class = c("sf", "data.frame"
), sf_column = "geometry", agr = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Names = c("id", "system.index", "b2_allPix", "b2_cloudfilt",

"system.time_start", ".geo"), .Label = c("constant", "aggregate",
"identity"), class = "factor"))

lines = structure(list(id = structure(c(1L, 112L, 223L, 334L, 445L), .Label = c("1",
"10", "100", "101", "102", "103", "104", "105", "106", "107",
"108", "109", "11", "110", "111", "112", "113", "114", "115",
"116", "117", "118", "119", "12", "120", "121", "122", "123",
"124", "125", "126", "127", "128", "129", "13", "130", "131",
"132", "133", "134", "135", "136", "137", "138", "139", "14",
"140", "141", "142", "143", "144", "145", "146", "147", "148",

"149", "15", "150", "151", "152", "153", "154", "155", "156",
"157", "158", "159", "16", "160", "161", "162", "163", "164",
"165", "166", "167", "168", "169", "17", "170", "171", "172",
"173", "174", "175", "176", "177", "178", "179", "18", "180",
"181", "182", "183", "184", "185", "186", "187", "188", "189",
"19", "190", "191", "192", "193", "194", "195", "196", "197",
"198", "199", "2", "20", "200", "201", "202", "203", "204", "205",
"206", "207", "208", "209", "21", "210", "211", "212", "213",
"214", "215", "216", "217", "218", "219", "22", "220", "221",
"222", "223", "224", "225", "226", "227", "228", "229", "23",

"230", "231", "232", "233", "234", "235", "236", "237", "238",
"239", "24", "240", "241", "242", "243", "244", "245", "246",
"247", "248", "249", "25", "250", "251", "252", "253", "254",
"255", "256", "257", "258", "259", "26", "260", "261", "262",
"263", "264", "265", "266", "267", "268", "269", "27", "270",
"271", "272", "273", "274", "275", "276", "277", "278", "279",
"28", "280", "281", "282", "283", "284", "285", "286", "287",
"288", "289", "29", "290", "291", "292", "293", "294", "295",
"296", "297", "298", "299", "3", "30", "300", "301", "302", "303",
"304", "305", "306", "307", "308", "309", "31", "310", "311",

"312", "313", "314", "315", "316", "317", "318", "319", "32",
"320", "321", "322", "323", "324", "325", "326", "327", "328",
"329", "33", "330", "331", "332", "333", "334", "335", "336",
"337", "338", "339", "34", "340", "341", "342", "343", "344",
"345", "346", "347", "348", "349", "35", "350", "351", "352",
"353", "354", "355", "356", "357", "358", "359", "36", "360",
"361", "362", "363", "364", "365", "366", "367", "368", "369",
"37", "370", "371", "372", "373", "374", "375", "376", "377",
"378", "379", "38", "380", "381", "382", "383", "384", "385",
"386", "387", "388", "389", "39", "390", "391", "392", "393",

"394", "395", "396", "397", "398", "399", "4", "40", "400", "401",
"402", "403", "404", "405", "406", "407", "408", "409", "41",
"410", "411", "412", "413", "414", "415", "416", "417", "418",
"419", "42", "420", "421", "422", "423", "424", "425", "426",
"427", "428", "429", "43", "430", "431", "432", "433", "434",
"435", "436", "437", "438", "439", "44", "440", "441", "442",
"443", "444", "445", "446", "447", "448", "449", "45", "450",
"451", "452", "453", "454", "455", "456", "457", "458", "459",
"46", "460", "461", "462", "463", "464", "465", "466", "467",
"468", "469", "47", "470", "471", "472", "473", "474", "475",

"476", "477", "478", "479", "48", "480", "481", "482", "483",
"484", "485", "486", "487", "488", "489", "49", "490", "491",
"492", "493", "494", "495", "496", "497", "498", "499", "5",
"50", "500", "501", "502", "503", "504", "505", "506", "507",
"508", "509", "51", "510", "52", "53", "54", "55", "56", "57",
"58", "59", "6", "60", "61", "62", "63", "64", "65", "66", "67",
"68", "69", "7", "70", "71", "72", "73", "74", "75", "76", "77",
"78", "79", "8", "80", "81", "82", "83", "84", "85", "86", "87",
"88", "89", "9", "90", "91", "92", "93", "94", "95", "96", "97",
"98", "99"), class = "factor"), segnum = 1:5, geometry = structure(list(

structure(c(-163.939480000102, -163.932950000242, -163.930539999721,
-163.93100000025, -163.933320000218, -163.935640000186, -163.941510000186,
-163.947380000187, -163.950175000026, -163.952969999866,
-163.952909999797, -163.948820000043, -163.953614999932,
-163.958409999822, -163.958460000329, -163.955969999716,
-163.948809999582, -163.950620000313, -163.94980000027, -163.945235000195,
-163.94067000012, -163.93931771722, 62.4387199999281, 62.4343600003138,
62.4286899996438, 62.4238500003773, 62.4198250000238, 62.4157999996703,
62.4099199996583, 62.4040399996463, 62.3988099999316, 62.3935800002169,
62.387119999988, 62.3795499998327, 62.3751399997113, 62.3707299995899,

62.3691199999881, 62.3658599998392, 62.3633900001485, 62.3612500003873,
62.3598899997237, 62.3578349998354, 62.3557799999471, 62.3541680789363
), .Dim = c(22L, 2L), class = c("XY", "LINESTRING", "sfg"
)), structure(c(-163.93931771722, -163.938169999946, -163.93740000041,
-163.938740000151, -163.944244999957, -163.949749999763,
-163.947819999793, -163.939500000125, -163.945110000126,
-163.950720000128, -163.956330000129, -163.959765000255,
-163.96320000038, -163.967020000273, -163.970840000166, -163.974660000059,
-163.978479999952, -163.979979999877, -163.981479999802,
-163.985049999858, -163.988619999913, -163.988893564025,

62.3541680789363, 62.3528000001199, 62.3498299998546, 62.3447300002894,
62.3405950002591, 62.3364600002289, 62.3337599998242, 62.3315399999711,
62.3283500000525, 62.325160000134, 62.3219700002154, 62.3189000002849,
62.3158300003543, 62.3098650002446, 62.303900000135, 62.2979350000253,
62.2919699999156, 62.2861900000185, 62.2804100001214, 62.2773400001909,
62.2742700002603, 62.2739300405841), .Dim = c(22L, 2L), class = c("XY",
"LINESTRING", "sfg")), structure(c(-163.988893564025, -163.992389999749,
-163.996159999584, -163.996669999721, -163.994829999855,
-163.992989999988, -163.994169999995, -163.995350000002,
-163.999320000068, -164.003290000133, -164.005879999962,

-164.005639999686, -164.001679999632, -163.995054999888,
-163.988430000144, -163.980987500023, -163.973544999902,
-163.966102499781, -163.958659999659, 62.2739300405841, 62.2695850000478,
62.2648999998352, 62.2581699997457, 62.2527699998357, 62.2473699999256,
62.2422650001297, 62.2371600003338, 62.2304550000483, 62.2237499997627,
62.2154199996334, 62.2137999995704, 62.2114799996024, 62.2112999998452,
62.211120000088, 62.2094475000771, 62.2077750000662, 62.2061025000554,
62.2044300000445), .Dim = c(19L, 2L), class = c("XY", "LINESTRING",
"sfg")), structure(c(-162.435819999793, -162.430109999976,
-162.427880000112, -162.425650000247, -162.425349999903,

-162.42719999978, -162.428489999914, -162.42620000043, -162.421750000262,
-162.417300000095, -162.41132999998, -162.405359999864, -162.399389999749,
-162.390509999887, -162.381630000024, -162.37460000004, -162.367570000055,
-162.363089999854, -162.358609999652, -162.352496666488,
-162.346383333324, -162.34027000016, -162.333680000232, -162.329452505243,
61.9670499997578, 61.9634399996563, 61.9593549996835, 61.9552699997108,
61.9520299995849, 61.9504499995679, 61.9418499995779, 61.9350699998806,
61.9318899998237, 61.9287099997667, 61.9272066666544, 61.9257033335421,
61.9242000004297, 61.9244350000253, 61.9246699996208, 61.9266249998439,
61.928580000067, 61.9307850001277, 61.9329900001884, 61.9375400001709,

61.9420900001534, 61.946640000136, 61.9500199999731, 61.9521882750156
), .Dim = c(24L, 2L), class = c("XY", "LINESTRING", "sfg"
)), structure(c(-162.329452505243, -162.327090000303, -162.321173333582,
-162.315256666862, -162.309340000141, -162.300583333454,
-162.291826666767, -162.28307000008, -162.275282500012, -162.267494999944,
-162.259707499876, -162.251919999808, -162.244499999825,
-162.237079999842, -162.230319999868, -162.223559999894,
-162.21679999992, -162.209842499906, -162.202884999893, -162.195927499879,
-162.188969999866, 61.9521882750156, 61.9533999998102, 61.9556199999631,
61.957840000116, 61.9600600002689, 61.9619966669127, 61.9639333335566,

61.9658700002004, 61.9665250000538, 61.9671799999071, 61.9678349997605,
61.9684899996138, 61.9682049999608, 61.9679200003078, 61.9667066667793,
61.9654933332508, 61.9642799997222, 61.9621174998228, 61.9599549999234,
61.9577925000239, 61.9556300001245), .Dim = c(21L, 2L), class = c("XY",
"LINESTRING", "sfg"))), class = c("sfc_LINESTRING", "sfc"
), precision = 0, bbox = structure(c(-164.005879999962, 61.9242000004297,
-162.188969999866, 62.4387199999281), .Names = c("xmin", "ymin",
"xmax", "ymax"), class = "bbox"), crs = structure(list(epsg = 4326L,
proj4string = "+proj=longlat +datum=WGS84 +no_defs"), .Names = c("epsg",
"proj4string"), class = "crs"), n_empty = 0L)), .Names = c("id",

"segnum", "geometry"), row.names = c(NA, 5L), class = c("sf",
"data.frame"), sf_column = "geometry", agr = structure(c(NA_integer_,
NA_integer_), .Names = c("id", "segnum"), .Label = c("constant",
"aggregate", "identity"), class = "factor"))

Answer



You can split the sf points object in parts and calculate the distances of every part to lines. I suggest use the library parallel to speed up the process. Also, I think that calculating the distances to lines in small parts use less RAM memory.


My answer is partially based in this post Processing vector to raster faster with R. If you are using Mac OS or Windows OS check this post Efficient spatial joining for large dataset in R to build/configure your cluster.


Try the reproducible and commented example below. At the end you have the nearest line segment ID (segnum) to each point (system.index).


# Load libraries ----------------------------------------------------------


library(sf)
library(geosphere)

# Create sample data ------------------------------------------------------

# 500000 random points
extentPolygon <- st_as_sfc(st_bbox(lines)) # extent polygon of lines sf object
randomPoints <- st_sample(extentPolygon, size = 500000, type = "random") # randomPoints in extent Polygon
pixels <- st_sf(system.index = 1:500000, geom = randomPoints, crs = 4326) # add data to randomPoints sfc object


headPixels


# Plot lines + sample points
plot(lines[2], axes = TRUE, graticule = TRUE, xlab = "Longitude", ylab = "Latitude", lwd = 4)
plot(pixels[1:10000,], pch = 19, col = "black", cex = 0.2, add = TRUE) # plot a sample of 10000 points (plotting all the points fill the map)

plot


Without Cluster


# Simple thread dist2Line function ----------------------------------------

# Very time consuming: I tried it with a sample of pixels


points.sp <- as_Spatial(st_geometry(pixels[1:10000,]))
points.sp$system.index <- pixels$system.index[1:10000]
lines.sp <- as_Spatial(st_geometry(lines), IDs = as.character(lines$segnum))

system.time(dist <- geosphere::dist2Line(p = points.sp, line = lines.sp))

# user system elapsed
# 91.728 0.012 91.752


# Convert dist to data.frame
dist.df <- as.data.frame(dist)

# Add points ID column to dist.df
dist.df$system.index <- pixels$system.index[1:1000]
colnames(dist.df) <- c("distance", "lon", "lat", "segnum", "system.index")

viewDist1


Using Cluster


# Make cluster ------------------------------------------------------------


# Load 'parallel' package for support Parallel computation in R
library(parallel)

# Calculate the number of cores
no_cores <- detectCores() - 1

# Split features in n parts
n <- 100 # split pixels by 100 points
parts <- split(1:nrow(pixels), cut(1:nrow(pixels), n))


lines.sp <- as_Spatial(st_geometry(lines), IDs = as.character(lines$segnum))

# Initiate cluster (after loading all the necessary object to R environment: BRA_adm2, parts, r.raster, n)
cl <- makeCluster(no_cores, type = "FORK")
print(cl)

# Multithread dist2Line function ------------------------------------------

# Parallelize dist2Line function

system.time(distParts <- parLapply(cl = cl,
X = 1:n,
fun = function(x) {
points.sp <- as_Spatial(st_geometry(pixels[parts[[x]],]))
points.sp$system.index <- pixels$system.index[parts[[x]]]
dist <- geosphere::dist2Line(p = points.sp, line = lines.sp)
# Convert dist to data.frame
dist.df <- as.data.frame(dist)
# Add points ID column to dist.df
dist.df$system.index <- pixels$system.index[parts[[x]]]

colnames(dist.df) <- c("distance", "lon", "lat", "segnum", "system.index")
gc(verbose = FALSE) # free memory
return(dist.df)
}))

# user system elapsed
# 1.869 2.067 1314.271

# Finish
stopCluster(cl)


# Bind rows
distBind <- do.call("rbind", distParts)

ViewDist


No comments:

Post a Comment

arcpy - Changing output name when exporting data driven pages to JPG?

Is there a way to save the output JPG, changing the output file name to the page name, instead of page number? I mean changing the script fo...