-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathWorkflow.R
More file actions
1199 lines (933 loc) · 51.9 KB
/
Workflow.R
File metadata and controls
1199 lines (933 loc) · 51.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' ---------------------------------------------------------------------------- #
#' PROJECT: Welcome to the ExMove Workflow
#' - This code can be used for reading and processing animal tracking data files
#'
#' CONTENTS:
#' - Load libraries
#' - 0. Pre-flight checks
#' - 1. Read in data files
#' - 2. Merge with metadata
#' - 3. Cleaning = remove corrupt/missing values
#' - 4. Processing = spatial and temporal calculations
#' - 5. Save df_diagnostic = for use in shiny app
#' - 6. Filtering = remove erroneous fixes
#' - 7. Summarise cleaned & filtered tracking data
#' - 8. Save df_filtered and summary data
#' - 9. Visualisation
#' - 10. Post Processing : Optional steps
#' - 11. Reformat data for upload to public databases
#'
#' DEPENDENCIES:
#' - Requires tidyverse, data.table, sf and here to be installed
#' - Source data: use the example data sets provided or use your own data
#' AUTHORS: Liam Langley, Stephen Lang, Luke Ozsanlav-Harris, Alice Trevail
#' ---------------------------------------------------------------------------- #
#-----------------------------#
## Load required libraries ####
#-----------------------------#
library(data.table) # data manipulation
library(tidyverse) # data manipulation, date time parsing and plotting
library(sf) # spatial data handling and manipulation
library(here) # reproducible filepaths
library(lubridate)
#--------------------------#
##0. Pre-flight checks ####
#--------------------------#
## How to use this workflow:
## - We will inspect the data before reading it in, so there is no need to open it in another program (e.g., excel, which can corrupt dates and times)
## - User-defined parameters are entered between ## USER INPUT START ## and ## USER INPUT END ##
## - These parameters are then called within the subsequent processing steps
## - Where you see: ## ** Option ** ##, there is an alternative version of the code to fit some common alternative data formats
## - Throughout, we will use some key 'functions' to inspect the data (e.g., 'head' for top rows, 'str' for column types, and 'names' for column names)
## Data & directory structure:
## - Data files should all be stored in a specific folder, ideally within the `Data` folder
## - Tracking data must contain a timestamp and at least one other sensor column
## - Data for each deployment/individual should be in a separate file
## - ID should be in tracking data file name, and should be the same length for all individuals
## - We also provide code to read in data that are already combined into one/multiple file(s) with an ID column
## - Metadata file should be in parent directory of data files
## - Metadata should contain one row per deployment per individual
## The importance of ID:
## - Throughout this workflow, we use ID to refer to the unique code for an individual animal
## - In certain cases, you might have additional ID columns in the metadata (e.g., DeployID),
## or read in data with a unique TagID instead of ID.
## - This code will work as long as all of the relevant info is included in the metadata
## - For more info and helpful code, see the FAQ document & troubleshooting script
## How to troubleshoot problems if something doesn't work with your data:
## - Refer to the FAQ document in the github page
## - This signposts to helpful resources online (e.g., spatial co-ordinate systems)
## - See the troubleshooting code scripts that we've written to accompany this workflow
## - (e.g., using multiple ID columns for re-deployments of tags/individuals)
#---------------------------#
##1. Read in data files ####
#---------------------------#
#--------------------#
## USER INPUT START ##
#--------------------#
## Throughout this script, we will save files pertaining to this data set using a species code as a file/folder identifier
## Define it here, for consistency:
## Species code used as the file/folder identifier for all outputs from this data set
species_code <- "RFB"
## Set filepath for folder containing raw data files
## NB: this code will try to open all files matching the file pattern within this folder
## Therefore, it is best if this folder only contains the raw data files
filepath <- here("Data", species_code)
## Define common file pattern to look for
## An asterisk (*) matches any character except a forward-slash
## e.g., "*.csv" will import all files within filepath folders that end with ".csv"
filepattern <- "*.csv"
## Let's view the file names, to check that we have the files we want & find ID position
## This will include names of sub-folders (recursive = TRUE searches them too)
ls_filenames <- list.files(path = filepath, recursive = TRUE, pattern = filepattern)
ls_filenames
## If your data are all in separate files with ID in the file name: (this is optimal for new tracking data)
## Find ID number from file name (excluding names of sub-folders)
## Numbers refer to the position of the characters in the file name string
## e.g., for "GV37501_201606_DG_RFB.csv", the ID number GV37501 is characters 1 to 7
## This will only work if ID numbers are the same length and position in all file names to be imported
IDstart <- 1 # start position of the ID in the filename (character index, 1-based)
IDend <- 7   # end position of the ID in the filename (inclusive)
## Now, let's inspect the data by reading in the top of the first data file as raw text
## (To inspect the first row of all data files, you can remove the '[1]' and change "n_max" to 1)
test <- fs::dir_ls(path = filepath, recurse = TRUE, type = "file", glob = filepattern)[1]
read_lines(test, n_max = 5) # change n_max to change the number of rows to read in
## number of lines at top of file to skip (e.g., if importing a text file with additional info at top)
skiplines <- 0
## define date format(s) used (for passing to lubridate)
## "d"=day as decimal, "m"=month as decimal, "y"=year w/o century (2 digits), "Y"=year w/ century (4 digits)
## Here, we've included common combinations, modify if your data include a different format
date_formats <- c("dmY", "Ymd") #specify date formats (e.g. "dmY" works for 01-12-2022 and 01/12/2022)
datetime_formats <- c("dmY HMS", "Ymd HMS") #specify date & time format
## define time zone for tracking data
## Run the function `OlsonNames()` to get a full list of time zones
trackingdatatimezone <- "GMT"
## By default, the below code will find column names from the first row of data: colnames <- TRUE
## if you want to specify your own column names, do so here as a character vector, e.g. c("Date", "Time", "Lat", "Long")
## set colnames <- FALSE to automatically number columns
## NOTE(review): this object shadows base::colnames(); function calls like colnames(df) still
## resolve to the base function, but consider renaming (e.g. user_colnames) to avoid confusion
colnames <- TRUE
## How are your data delimited?
## Here, we use the function read_delim() and specify the delimiter to make this code more universal
## More guidance here: https://readr.tidyverse.org/reference/read_delim.html
## Some examples:
## ',' = comma delimited = equivalent to using read_csv (typically saved with file extension .csv)
## '\t' = tab delimited = equivalent to using read_tsv
## ' ' = whitespace delimited = equivalent to using read_table
## Let's inspect the data again, this time skipping rows if set, to check the file delimiter
read_lines(test, n_max = 5, skip = skiplines)
## Set delimiter to use within read_delim
user_delim <- ","
user_trim_ws <- TRUE # Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?
## Data need an ID column, which can either be the tag ID ("TagID") or individual ID ("ID")
## Specify ID type in the raw data here, for later matching with the same column in the metadata:
ID_type <- "ID"
#------------------#
## USER INPUT END ##
#------------------#
## Read in and merge all tracking data files found under `filepath`
## Every column is read as character ("c") so files with inconsistent column types
## can be row-bound safely; types are coerced later when building df_slim
df_combined <- fs::dir_ls(path = filepath, glob = filepattern, # use our defined filepath and pattern
                          type = "file", recurse = TRUE) %>% # recurse = TRUE searches all sub-folders
  purrr::set_names(nm = basename(.)) %>% # remove path prefix (makes filename more manageable)
  purrr::map_dfr(read_delim, .id = "filename", # read all the files, using filename as an ID column
                 col_types = cols(.default = "c"), col_names = colnames,
                 skip = skiplines, delim = user_delim, trim_ws = user_trim_ws) %>%
  # substring the animal/tag ID out of the filename (positions IDstart:IDend) into a
  # column named after ID_type (dynamic name via the "{var}" := tidy-eval syntax)
  mutate("{ID_type}" := str_sub(string = filename, start = IDstart, end = IDend),
         .after = filename) # position the new ID column after the filename column
df_combined
colnames(df_combined)
## ** Option ** ##
## If your data are combined into one or multiple csv files containing an ID column, use the following code:
## This is the same as above, but doesn't create a new ID column from the file name
## To un-hash code, highlight and press "Ctrl + Shft + C" on a PC or "Cmd + Shft + C" on a Mac
# df_combined <- fs::dir_ls(path = filepath, recurse = TRUE, type = "file", glob = filepattern) %>% # recurse = T searches all sub-folders
# purrr::map_dfr(read_delim, col_types = cols(.default = "c"), col_names = colnames, skip = skiplines, delim = user_delim, trim_ws = user_trim_ws)
# df_combined
#--------------------#
## USER INPUT START ##
#--------------------#
## Data need a time stamp, either in separate columns (e.g., "Date" and "Time") or combined ("DateTime")
## Specify below which columns date and time info are stored in the data
## NB: These have to be in the same order as specified in earlier user input, i.e. "Date" and "Time" have to be the right way round
datetime_formats # a reminder of the datetime orders previously specified
## Column(s) holding the timestamp; must be in the same order as datetime_formats expects
datetime_colnames <- c("Date", "Time") # or c("DateTime")
## Specify location coordinates as X (e.g., longitude) and Y (e.g., latitude)
## Additional columns depending on logger type, e.g.:
## lc = Argos fix quality
## X2/Y2 = additional location fixes from Argos tag
## laterr/lonerr = location error information provided by some GLS processing packages
## Change column names to those present in your tracking data, additional columns can be added (see above examples)
## This standardises important column names for the rest of the workflow, e.g. TagID, X/Y coordinates
## (all df_combined columns are character at this point, hence the explicit as.numeric coercions)
df_slim <- data.frame(ID = as.character(df_combined$ID),
                      Date = df_combined$Date,
                      Time = df_combined$Time,
                      Y = as.numeric(df_combined$Latitude),
                      X = as.numeric(df_combined$Longitude))
## ** Optional ** ##
# For the example dataset RFB_IMM (immature red-footed boobies), use the following:
# datetime_colnames <- c("DateTime")
# df_slim <- data.frame(TagID = as.character(df_combined$TagID),
# DateTime = df_combined$`Date Time`,
# lc = df_combined$Fix, # Argos fix quality
# Y = df_combined$`Lat1(N)`,
# X = df_combined$`Long1(E)`,
# Y2 = df_combined$`Lat2(N)`,
# X2 = df_combined$`Long2(E)`)
## ** Optional ** ##
## Here's an example of how to change the above code for data with different columns and column names
## This code works with immersion data recorded by a GLS logger (no location data)
## To un-hash code, highlight and press "Ctrl + Shft + C" on a PC or "Cmd + Shft + C" on a Mac
# df_slim <- data.frame(ID = df_combined$ID,
# Date = df_combined$`DD/MM/YYYY`,
# Time = df_combined$`HH:MM:SS`,
# Immersion = df_combined$`wets0-20`)
#------------------#
## USER INPUT END ##
#------------------#
## Check the data frame just created: column types, then the top rows
str(df_slim)
head(df_slim)
## Dates and Times
message("If you see any 'failed to parse' warnings below a date or time has not formatted (we will remove these NA's later)")
## Parse dates, create datetime, date and year columns
df_slim <- df_slim %>%
  # join the date/time column(s) into a single string for lubridate to parse
  tidyr::unite(col = "DateTime_unparsed", all_of(datetime_colnames), sep = " ", remove = FALSE) %>%
  mutate(DateTime = lubridate::parse_date_time(DateTime_unparsed, # use lubridate to parse DateTime
                                               orders = datetime_formats, # using the datetime_formats object we made earlier
                                               tz = trackingdatatimezone),
         Date = lubridate::as_date(DateTime),
         Year = lubridate::year(DateTime)) %>%
  select(-DateTime_unparsed)
## Check which DateTimes failed to parse (if any) — parse failures become NA
Fails <- df_slim %>% filter(is.na(DateTime))
head(Fails)
## Order finished raw dataframe by ID type (either individual ID or TagID) and DateTime
df_raw <- df_slim %>%
  arrange(across(all_of(c(ID_type, "DateTime")))) %>%
  drop_na(DateTime) # remove rows whose timestamp failed to parse
head(df_raw)
## Remove intermediate objects to free memory; keeps only the objects later steps need
rm(list=ls()[!ls() %in% c("df_raw","date_formats","datetime_formats","trackingdatatimezone", "ID_type", "species_code")])
#----------------------------#
##2. Merge with metadata ####
#----------------------------#
#--------------------#
## USER INPUT START ##
#--------------------#
## set file path to metadata (expected in the parent directory of the raw data files)
metadata_path <- paste0(species_code, "_Metadata.csv")
filepath_meta <- here("Data", metadata_path)
## define metadata date and time format(s) used (for passing to lubridate)
## "d"=day as decimal, "m"=month as decimal, "y"=year w/o century, "Y"=year w/ century
## Here, we've included common combinations, modify if your data include a different format
## Run the function `OlsonNames()` to get a full list of time zones
metadate_formats <- c("dmY", "Ymd") #specify date format used in metadata
metadatetime_formats <- c("dmY HMS", "Ymd HMS") #specify date & time format
metadatatimezone <- "Indian/Chagos" #specify timezone used for metadata (may differ from tracking data tz)
#------------------#
## USER INPUT END ##
#------------------#
## Read in metadata file (one row per deployment per individual)
df_metadata <- readr::read_csv(filepath_meta)
names(df_metadata)
#--------------------#
## USER INPUT START ##
#--------------------#
## Select necessary columns & coerce column names
## Compulsory columns: ID as defined in tracking data (individual "ID" or "TagID"), deployment date & deployment time
## Optional columns depending on study system: e.g., population, sex, age
## Add or delete columns below as appropriate
## If you have multiple ID columns, include them here (e.g., TagID/DeployID)
## For example, if one individual was tracked over multiple deployments/years
## Similarly, if one tag was re-deployed on multiple individuals
## For more information and helpful code, see the FAQ document and troubleshooting script
## Deployment and Retrieval dates:
## Archival tags need to be retrieved, and so we need to remove data following retrieval date
## Some tags are turned on before deployment, and so we need to remove data prior to deployment
## Tags with remote/satellite download capability don't need retrieval dates
## To filter by deployment/retrieval, we need to sort out these columns in the metadata
## If not relevant for this data, set to "NA"
## Central Place foragers:
## If you are working with a central place forager (e.g., animals returning to a breeding location)
## and you have breeding location information in your metadata,
## this is a good place to add this info to the tracking data
## e.g., breeding seabirds with known individual nest location
## e.g., seals returning to known haul out location
## we recommend adding these columns as:
## CPY = Central place Y coordinate
## CPX = Central place X coordinate
## Standardise metadata column names for the rest of the workflow
## IDs are coerced to character so they join cleanly against the (character) tracking-data IDs
df_metadataslim <- data.frame(ID = as.character(df_metadata$BirdID), # compulsory column
                              TagID = as.character(df_metadata$TagID),
                              DeployID = as.character(df_metadata$DeployID),
                              DeployDate_local = df_metadata$DeploymentDate, # compulsory column (set to NA if not relevant)
                              DeployTime_local = df_metadata$DeploymentTime, # compulsory column (set to NA if not relevant)
                              RetrieveDate_local = df_metadata$RetrievalDate, # compulsory column (set to NA if not relevant)
                              RetrieveTime_local = df_metadata$RetrievalTime, # compulsory column (set to NA if not relevant)
                              CPY = df_metadata$NestLat,  # Central place Y coordinate (nest latitude)
                              CPX = df_metadata$NestLong, # Central place X coordinate (nest longitude)
                              Species = species_code,
                              Population = df_metadata$Population,
                              Age = df_metadata$Age,
                              BreedStage = df_metadata$BreedingStage)
## ** Optional ** ##
# For the example dataset RFB_IMM (immature red-footed boobies), use the following:
# df_metadataslim <- data.frame(ID = as.character(df_metadata$bird_id), # compulsory column
# TagID = as.character(df_metadata$Tag_ID),
# DeployID = as.character(df_metadata$Deploy_ID),
# DeployDate_local = df_metadata$capture_date, # compulsory column (set to NA if not relevant)
# DeployTime_local = df_metadata$capture_time, # compulsory column (set to NA if not relevant)
# RetrieveDate_local = NA, # compulsory column (set to NA if not relevant)
# RetrieveTime_local = NA, # compulsory column (set to NA if not relevant)
# DeployY = df_metadata$lat,
# DeployX = df_metadata$long,
# Species = species_code,
# Age = df_metadata$age)
#------------------#
## USER INPUT END ##
#------------------#
# Parse deployment/retrieval dates and times
# NAs in deployment/retrieval datetimes will throw parsing warnings; these are safe to
# ignore if you know there are NAs in those columns (they are filled from the data below)
df_metadataslim <- df_metadataslim %>%
  mutate(Deploydatetime = parse_date_time(paste(DeployDate_local, DeployTime_local), # create deploy datetime
                                          orders = metadatetime_formats, # full arg name (was `order=`, which relied on partial matching)
                                          tz = metadatatimezone),
         Retrievedatetime = parse_date_time(paste(RetrieveDate_local, RetrieveTime_local), # create retrieve datetime
                                            orders = metadatetime_formats,
                                            tz = metadatatimezone)) %>%
  select(-any_of(c("DeployDate_local","DeployTime_local", "RetrieveDate_local", "RetrieveTime_local"))) %>%
  mutate(across(contains('datetime'), # express datetimes in the tracking-data time zone
                ~with_tz(., tzone = trackingdatatimezone)))
## create dataframe of temporal extents in data to use in absence of deploy/retrieve times
## also useful for e.g., data checks/writing up methods
df_temporalextents <- df_raw %>%
  group_by(across(all_of(ID_type))) %>%
  summarise(min_datetime = min(DateTime),
            max_datetime = max(DateTime))
## fill in NAs in deploy/retrieve times with the extent of the tracking data
## coalesce() keeps the metadata value where present and falls back to the data extent
df_metadataslim <- df_metadataslim %>%
  left_join(df_temporalextents, by = ID_type) %>%
  mutate(Deploydatetime = coalesce(Deploydatetime, min_datetime),
         Retrievedatetime = coalesce(Retrievedatetime, max_datetime)) %>%
  select(-c(min_datetime, max_datetime))
## Merge metadata with raw data using the ID column
df_metamerged <- df_raw %>%
  left_join(df_metadataslim, by = ID_type)
head(df_metamerged)
### Remove intermediate objects to free memory; keeps only the objects later steps need
rm(list=ls()[!ls() %in% c("df_metamerged", "species_code")]) #specify objects to keep
#-----------------#
##3. Cleaning ####
#-----------------#
## This data cleaning stage is to remove erroneous data values
## E.g., from poor location fixes, no data values, etc...
#--------------------#
## USER INPUT START ##
#--------------------#
## User-defined no-data/error sentinel values that can appear in the X and Y columns
## e.g. bad values specified by the tag manufacturer
No_data_vals <- c(0, -999)
## Columns that must not contain NA; any row with an NA in one of these is dropped entirely
na_cols <- c("X", "Y", "DateTime", "ID")
#------------------#
## USER INPUT END ##
#------------------#
## Cleaning pipeline:
##  1. drop rows with NA in any compulsory column
##  2. drop rows whose coordinates are sentinel no-data values
##  3. drop duplicate fixes (same individual + timestamp) — beware for sub-second
##     data such as ACC, where timestamps lack milliseconds
##  4. keep only fixes inside the deployment window; when Retrievedatetime is NA
##     (e.g. tags transmitting via satellite) only the deployment start is enforced
df_clean <- df_metamerged %>%
  drop_na(all_of(na_cols)) %>%
  filter(!(X %in% No_data_vals), !(Y %in% No_data_vals)) %>%
  distinct(DateTime, ID, .keep_all = TRUE) %>%
  filter(DateTime > Deploydatetime,
         is.na(Retrievedatetime) | DateTime < Retrievedatetime)
head(df_clean)
#--------------------#
## USER INPUT START ##
#--------------------#
## ** Option ** ##
## Argos fix quality can be used to filter the data set to remove locations with too much uncertainty
## If you know the error classes that you want to retain in this dataset, you can run this filter here
## If you want to do further exploration of location quality, keep all location classes here
## (e.g., from GPS PTT tags to compare locations with contemporaneous GPS locations)
## To un-hash code, highlight and press "Ctrl + Shft + C" on a PC or "Cmd + Shft + C" on a Mac
# ## Define vector of location classes to keep
# ## Typically, location classes 1, 2, and 3 are of sufficient certainty
# lc_keep <- c("1", "2", "3")
#
# ## Filter data to only retain location classes in lc_keep
# df_clean <- df_clean %>%
# filter(lc %in% lc_keep)
# head(df_clean)
#------------------#
## USER INPUT END ##
#------------------#
## Remove intermediate objects to free memory before the processing step
rm(list=ls()[!ls() %in% c("df_clean", "species_code")]) #specify objects to keep
#-------------------#
##4. Processing ####
#-------------------#
## Some useful temporal and spatial calculations on the data
#--------------------#
## USER INPUT START ##
#--------------------#
## Specify co-ordinate projection systems (CRS) for the tracking data and metadata
## Default here is that the X/Y coordinates are lon/lat for both tracking data & metadata, for which the EPSG code is 4326.
## Users should know which CRS their location data is recorded in
## Look online for alternative EPSG codes, e.g., https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/
tracking_crs <- 4326 # Only change if data are in a different coordinate system
meta_crs <- 4326 # Only change if data are in a different coordinate system
## Specify metric co-ordinate projection system to transform user data into for distance calculations with units in metres
## WE STRONGLY RECOMMEND CHANGING THIS CRS TO MATCH YOUR STUDY SYSTEM/LOCATION
## Here, we use the Spherical Mercator projection — aka 'WGS' (crs = 3857), used by Google maps
## this can be used worldwide, and so we chose it to ensure that this workflow will work on any data
## However, distance calculations can be over-estimated (distortion grows towards the poles)
## Consider the location and scale of your data (e.g., equatorial/polar/local scale/global scale) when choosing a projection system
## Other options include (but are not limited to) UTM, Lambert azimuthal equal-area (LAEA)
## For information about choosing a projection system, see the FAQ doc in documentation folder
transform_crs <- 3857 # Suggest changing if using own data
#------------------#
## USER INPUT END ##
#------------------#
## Transform coordinates of data and perform per-individual spatial/temporal calculations
df_diagnostic <- df_clean %>%
  mutate(geometry_GPS = st_transform( # transform X/Y coordinates to `transform_crs` (metric) for distance calculations
    st_as_sf(., coords=c("X","Y"), crs=tracking_crs), crs = transform_crs)$geometry) %>%
  group_by(ID) %>% # group by ID so lag() and first-location references stay within each individual
  mutate(dist = st_distance(geometry_GPS, lag(geometry_GPS), by_element = T), # distance travelled from previous fix, by_element = T means calculations are done by row
         difftime = difftime(DateTime, lag(DateTime), units="secs"), # time passed since previous fix
         netdisp = st_distance(geometry_GPS, geometry_GPS[1], by_element = F)[,1], # distance between first location and current location, by_element = F returns dense matrix with all pairwise distances
         speed = as.numeric(dist)/as.numeric(difftime), # speed in m/s (metres from dist, seconds from difftime)
         dX = as.numeric(X)-lag(as.numeric(X)), # difference in longitude, relative to previous location
         dY = as.numeric(Y)-lag(as.numeric(Y)), # difference in latitude, relative to previous location
         # bearing (degrees, 0-360) from previous to current location: atan2 gives (-180, 180],
         # and the (dX < 0)*360 term wraps negative angles into the 0-360 range
         turnangle = atan2(dX, dY)*180/pi + (dX < 0)*360) %>%
  ungroup() %>% select(-c(geometry_GPS, dX, dY)) # ungroup and drop working columns
## add latitude and longitude columns (EPSG:4326)
## this is useful for plotting and is the coordinate system used in the shiny app
df_diagnostic <- st_coordinates(st_transform(st_as_sf(df_diagnostic, coords=c("X","Y"), crs=tracking_crs), crs = 4326)) %>%
  as.data.frame() %>%
  rename("Lon" = "X", "Lat" = "Y") %>%
  cbind(df_diagnostic, .) # column-bind Lon/Lat back onto the data frame (row order unchanged)
#---------------------------#
##5. Save df_diagnostic ####
#---------------------------#
## Save df_diagnostic to use in shiny app
## The app is designed to explore how further filtering and processing steps affect the data
#--------------------#
## USER INPUT START ##
#--------------------#
## Define file path for saved file
filepath_dfout <- here("DataOutputs","WorkingDataFrames")
## Create folder if it doesn't exist
## recursive = TRUE also creates missing parent folders; showWarnings = FALSE keeps re-runs silent
dir.create(filepath_dfout, showWarnings = FALSE, recursive = TRUE)
## Define file name for saved file
## here, we use the species code and "_diagnostic"
## Change if you want to use a different naming system
filename_dfout <- paste0(species_code, "_diagnostic")
## ** Option ** ##
## If not added from the metadata, add a species column and any other columns here relevant to your data
# df_diagnostic$Species <- species_code
#------------------#
## USER INPUT END ##
#------------------#
## save dataframe as .csv
## file.path() is used because filepath_dfout is already an absolute path from here()
write_csv(df_diagnostic, file = file.path(filepath_dfout, paste0(filename_dfout, ".csv")))
## Remove intermediate objects to free memory before the filtering step
rm(list=ls()[!ls() %in% c("df_diagnostic", "species_code")]) #specify objects to keep
#-----------------#
##6. Filtering ####
#-----------------#
## This filtering stage is designed to remove outliers in the data
## You can use outputs from the shiny app to inform these choices
## Accessing Shiny App
## OPTION 1:
## Access the shiny app online at the following link: https://lukeozsanlav.shinyapps.io/exmove_explorer/
## OPTION 2:
## Alternatively run the app from your local R session with the following code
## This will require some additional packages to be installed on the start up of the app but this will happen automatically
## Install shiny if missing, then launch the ExMove explorer app from GitHub
## requireNamespace() is the recommended way to test for an installed package
## (require() attaches as a side effect and returns FALSE rather than erroring)
if (!requireNamespace("shiny", quietly = TRUE)) install.packages("shiny")
library(shiny)
## NOTE: this call blocks the R session while the app is running
runGitHub("ExMove", username = "ExMove",
          ref = "master", subdir = "app")
## APP USAGE:
## Upload your csv version of df_diagnostic to the app by clicking the `Upload data` button in the top left
## At the bottom of each app page are printed code chunks that can be copied into subsequent user input sections
## Those code chunks contain the user input values you manually select in the app
## If you want to manually define thresholds, skip the app and adjust the values in the code below
## If you don't need to filter for outliers, skip this step and keep using df_diagnostic
#--------------------#
## USER INPUT START ##
#--------------------#
## Define a period to filter after tag deployment
## All points before the cutoff will be removed
## For example, to remove potentially unnatural behaviour following the tagging event
## This has to be an integer value
## change the units within the function (e.g., "min"/"mins"/"hours"/"year"...)
filter_cutoff <- as.period(30, unit = "minutes")
## Define speed filter in m/s
## Any points with faster speeds will be removed
filter_speed <- 20
## Define net displacement filter and specify units
## Any points further away from the first tracking point will be removed
## If you want to retain points no matter the net displacement value then run `filter_netdisp_dist <- max(df_diagnostic$netdisp)` and `filter_netdist_units <- "m"`
filter_netdisp_dist <- 300
filter_netdist_units <- "km" # e.g., "m", "km"
#------------------#
## USER INPUT END ##
#------------------#
# create net displacement filter as a units object so it compares correctly against
# the units-carrying `netdisp` column produced by st_distance()
filter_netdisp <- units::as_units(filter_netdisp_dist, filter_netdist_units)
# filter df_diagnostic
df_filtered <- df_diagnostic %>%
  filter(Deploydatetime + filter_cutoff < DateTime, # keep times after cutoff
         speed < filter_speed, # keep speeds slower than speed filter
         netdisp <= filter_netdisp) # keep distances less than net displacement filter
head(df_filtered)
## Remove intermediate objects to free memory (df_diagnostic is kept for comparison plots)
rm(list=ls()[!ls() %in% c("df_diagnostic", "df_filtered", "species_code")]) #specify objects to keep
#--------------------------------------------------#
##7. Summarise cleaned & filtered tracking data ####
#--------------------------------------------------#
#--------------------#
## USER INPUT START ##
#--------------------#
## set the units to display sampling rate in the summary table
## any units accepted by difftime/as.numeric, e.g. "secs", "mins", "hours", "days"
sampleRateUnits <- "mins"
## Define levels of grouping factors to summarise over
## Firstly, down to species level
## here, we are working on data from one population & year, and so use 'Species' as the grouping factor
## add any other relevant grouping factors here, e.g., Country / Year / Season / Age
## (all names must be columns present in df_filtered)
grouping_factors_poplevel <- c("Species")
## Secondly, down to individual level
## add e.g., DeployID if relevant
grouping_factors_indlevel <- c("ID")
#------------------#
## USER INPUT END ##
#------------------#
## function to calculate the standard error of the mean: sd(x) / sqrt(n),
## where n counts only non-NA values (NAs are excluded from sd and n alike)
## BUG FIX: the original computed sqrt(sd(x)/n), which is not the standard
## error (standard error = sqrt(var/n) = sd/sqrt(n))
se <- function(x) sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
## Individual-level summary statistics for the filtered tracking data:
## one row per individual (within the population-level grouping factors)
df_summary_ind <- df_filtered %>%
  group_by(across(c(all_of(grouping_factors_poplevel), all_of(grouping_factors_indlevel)))) %>%
  summarise(
    NoPoints      = n(),                              # number of fixes
    NoUniqueDates = n_distinct(Date),                 # number of distinct tracking dates
    FirstDate     = as.Date(min(Date)),               # first tracking date
    LastDate      = as.Date(max(Date)),               # last tracking date
    SampleRate    = mean(as.numeric(difftime, units = sampleRateUnits), na.rm = TRUE), # mean sampling interval
    SampleRate_se = se(as.numeric(difftime, units = sampleRateUnits))                  # standard error of sampling interval
  )
df_summary_ind
## Population-level summary statistics, aggregated from the individual-level table
df_summary_pop <- df_summary_ind %>%
  group_by(across(all_of(c(grouping_factors_poplevel)))) %>%
  summarise(
    NoInds           = n_distinct(ID),           # number of unique individuals
    NoPoints_total   = sum(NoPoints),            # total number of tracking locations
    FirstDate        = as.Date(min(FirstDate)),  # earliest tracking date across birds
    LastDate         = as.Date(max(LastDate)),   # latest tracking date across birds
    PointsPerBird    = mean(NoPoints),           # locations per individual: mean
    PointsPerBird_se = se(NoPoints),             # locations per individual: standard error
    DatesPerBird     = mean(NoUniqueDates),      # tracking days per bird: mean
    DatesPerBird_se  = se(NoUniqueDates),        # tracking days per bird: standard error
    SampleRate_mean  = mean(SampleRate),         # sample rate mean
    SampleRate_se    = se(SampleRate)            # sample rate standard error
  )
df_summary_pop
## Remove intermediate files/objects if necessary to speed up processing
rm(list=ls()[!ls() %in% c("df_diagnostic", "df_filtered", "df_summary_ind", "df_summary_pop", "species_code")]) #specify objects to keep
#-----------------------------------------#
##8. Save df_filtered and summary data ####
#-----------------------------------------#
#--------------------#
## USER INPUT START ##
#--------------------#
## Define file path for df_filtered
filepath_filtered_out <- here("DataOutputs","WorkingDataFrames")
## Define file path for summary file
filepath_summary_out <- here("DataOutputs","SummaryDataFrames")
## Create output folders if they don't exist
## showWarnings = FALSE suppresses the spurious warning when the folder already
## exists; recursive = TRUE also creates any missing parent directories
dir.create(filepath_filtered_out, showWarnings = FALSE, recursive = TRUE)
dir.create(filepath_summary_out, showWarnings = FALSE, recursive = TRUE)
## Define file names for saved files
## here, we use the species code and "_summary_" followed by ind (individual level) or pop (population level)
## Change if you want to use a different naming system
filename_filtered_out <- paste0(species_code, "_filtered")
filename_summary_ind_out <- paste0(species_code, "_summary_ind")
filename_summary_pop_out <- paste0(species_code, "_summary_pop")
#------------------#
## USER INPUT END ##
#------------------#
## save dataframes as .csv files
write_csv(df_filtered, file = here(filepath_filtered_out, paste0(filename_filtered_out,".csv")))
write_csv(df_summary_ind, file = here(filepath_summary_out, paste0(filename_summary_ind_out,".csv")))
write_csv(df_summary_pop, file = here(filepath_summary_out, paste0(filename_summary_pop_out,".csv")))
## Remove intermediate files/objects if necessary to speed up processing
rm(list=ls()[!ls() %in% c("df_diagnostic", "df_filtered", "df_summary_ind", "df_summary_pop", "species_code")]) #specify objects to keep
#----------------------#
##9. Visualisation ####
#----------------------#
## Users can visualise their data now to check the data cleaning has removed erroneous data
## If the data needs further filtering then return to section 6 and update the data filters
#--------------------#
## USER INPUT START ##
#--------------------#
## Define parameters for reading out plots
## Define device to read plots out as e.g. tiff/jpeg
device <- "tiff"
## define units for plot size - usually mm
units <- "mm"
## define plot resolution in dpi - 300 usually minimum
dpi <- 300
## define filepath to read out plots
out_path <- here("DataOutputs","Figures")
## Create the figures folder if it doesn't exist
## (showWarnings = FALSE avoids a warning when it already exists;
##  recursive = TRUE creates missing parent directories)
dir.create(out_path, showWarnings = FALSE, recursive = TRUE)
## We plot maps over a topography base-layer
## topography data include terrestrial (elevation) and marine (bathymetry/water depth) data
## set legend label for topography data, relevant to your data:
topo_label <- "Depth (m)"
#------------------#
## USER INPUT END ##
#------------------#
## load additional libraries for spatial visualisation
message("If you see masking warning these are fine.
Watch out for packages that aren't installed yet")
library(rnaturalearth) # country polygons for basemaps (ne_countries)
library(marmap)        # NOAA topography/bathymetry download (getNOAA.bathy)
library(plotly)        # optional interactive versions of ggplots (ggplotly)
## Create version of data for plotting
## transform required columns to numeric and create time elapsed columns
## NOTE(review): the result stays grouped by ID (no ungroup()); the ggplot
## calls below are unaffected, but be aware if you reuse df_plotting elsewhere
df_plotting <- df_filtered %>%
  group_by(ID) %>%
  mutate(diffsecs = as.numeric(difftime), # interval since previous fix -- assumes the difftime column is in seconds; TODO confirm upstream
         secs_elapsed = cumsum(replace_na(diffsecs, 0)), # cumulative time since first fix (first interval is NA, treated as 0)
         time_elapsed = as.duration(secs_elapsed), # lubridate duration since first fix
         days_elapsed = as.numeric(time_elapsed, "days")) %>% # elapsed days, used as x-axis in time-series plots
  mutate(across(c(dist,speed, Lat, Lon), as.numeric)) # strip units/other classes so ggplot treats these as plain numerics
## create a map of all points with the code below
## set the plot limits as the max and min lat/longs as the tracking data
## first set up a basemap to plot over
## use rnaturalearth low res country outlines as a basemap and return as an sf object
countries <- ne_countries(scale = "medium", returnclass = "sf")
## define min and max co-ordinates based on extent of tracking data
## (these four names are reused by the map blocks below -- do not rename)
minlon <- min(df_plotting$Lon)
maxlon <- max(df_plotting$Lon)
minlat <- min(df_plotting$Lat)
maxlat <- max(df_plotting$Lat)
## load in topography basemap
## set limits slightly beyond tracking data to make a buffer so no gaps around plot margins
## NOTE(review): getNOAA.bathy() downloads ETOPO data from NOAA, so this step
## needs an internet connection; resolution is in arc-minutes
base_topography_map <- getNOAA.bathy(lon1 = minlon - 0.1, lon2 = maxlon + 0.1,
lat1 = minlat - 0.1, lat2 = maxlat + 0.1, resolution = 1)
## fortify the topography basemap for plotting
## (converts the bathy matrix into a long x/y/z data frame ggplot can raster)
base_topography_fort = fortify(base_topography_map)
## create base map with correct extent, topography, country outlines, etc.,
## FIX: complete themes such as theme_light() replace ALL theme elements, so
## they must be applied BEFORE theme() tweaks -- the original called
## theme_light() last, silently discarding the custom axis-text settings
map_base <- ggplot() +
  geom_raster(data = base_topography_fort, aes(x=x, y=y, fill=z), alpha = 0.9) +
  # add colour scheme for the fill
  scale_fill_viridis_c(option="mako", name = topo_label) +
  # add map of countries over the top
  geom_sf(data = countries, aes(geometry = geometry), fill = NA) +
  # set plot limits
  coord_sf(xlim = c(minlon-0.1, maxlon+0.1),
           ylim = c(minlat-0.1, maxlat+0.1), crs = 4326, expand = F) +
  # add labels
  labs(x = "Longitude", y = "Latitude") +
  # set the complete theme first ...
  theme_light() +
  # ... then layer the custom axis-text adjustments on top
  theme(axis.text=element_text(colour="black"),
        axis.title.x = element_text(size = 15),
        axis.text.x = element_text(hjust=0.7),
        axis.title.y = element_text(angle=90, vjust = 0.4, size = 15),
        axis.text.y = element_text(hjust=0.7, angle=90, vjust=0.3))
## map all tracking locations
## (single static map: every fix from every individual over the basemap)
map_alllocs <- map_base +
  # add GPS points
  geom_point(data = df_plotting, aes(x = Lon, y = Lat), alpha = 0.8, size = 0.5, col = "violetred3")
map_alllocs
## map individual locations
## colour points by speed and facet by ID
## (points are drawn first, then paths on top -- layer order matters visually)
map_individuals <- map_base +
  # add GPS points and paths between them
  geom_point(data = df_plotting, aes(x = Lon, y = Lat, col = speed),
             alpha = 0.8, size = 0.5 ) +
  geom_path(data = df_plotting, aes(x = Lon, y = Lat, col = speed),
            alpha = 0.8, linewidth = 0.5 ) +
  # colour birds using scale_colour_gradient2
  scale_colour_gradient2(name = "Speed", low = "blue", mid = "white", high = "red",
                         midpoint = (max(df_plotting$speed,na.rm=TRUE)/2)) + # `midpoint` argument ensures an even transition of color across speed value
  # facet for individual, in a roughly square grid
  facet_wrap(~ ID, ncol = round(sqrt(n_distinct(df_plotting$ID))))
map_individuals
## ** Option ** ##
## Throughout these visualisations, we'll split the population into individual facets.
## This works fine on the example code, where we only have a few individuals
## If you have more individuals and the facets are too small, you can split the plot onto multiple pages
## Use the below code to use facet_wrap_paginate from the ggforce package:
# install.packages("ggforce")
# library(ggforce)
# ## save plot as object to later extract number of pages
# ## e.g., with 2 per page:
# map_individuals <- map_base +
# # add GPS points and paths between them
# geom_point(data = df_plotting, aes(x = Lon, y = Lat, col = speed), alpha = 0.8, size = 0.5 ) +
# geom_path(data = df_plotting, aes(x = Lon, y = Lat, col = speed), alpha = 0.8, linewidth = 0.5 ) +
# # colour birds using scale_colour_gradient2
# scale_colour_gradient2(name = "Speed", low = "blue", mid = "white", high = "red", midpoint = (max(df_plotting$speed,na.rm=TRUE)/2)) +
# ##facet for individual
# facet_wrap_paginate(~ID, ncol = 2, nrow= 1, page = 1)
# ## How many pages of plots?
# n_pages(map_individuals)
# ## display plot
# map_individuals
# ## run through different values of page to show each page in turn
# save maps for further use
# use ggsave function
# (trailing commas removed from the original calls: an empty trailing argument
#  in an R function call is fragile and can raise "argument is empty" errors)
ggsave(plot = map_alllocs,
       filename = paste0(species_code, "_map_all_locs.", device),
       device = device,
       path = out_path,
       units = units, width = 200, height = 175, dpi = dpi)
ggsave(plot = map_individuals,
       filename = paste0(species_code, "_map_individuals.", device),
       device = device,
       path = out_path,
       units = units, width = 200, height = 175, dpi = dpi)
## ** Option ** ##
## generate an interactive version of the maps to explore in plot window
## this can be slow, so we don't run by default
# ggplotly(map_alllocs)
# create a time series plot of speed
# faceted for each individual
speed_time_plot <- df_plotting %>% #speed over time
  ggplot(data=., aes(x=days_elapsed, y=speed, group=ID)) +
  # add line of speed over time
  geom_line() +
  # add axis labels
  xlab("time elapsed (days)") + ylab("speed (m/s)") +
  # facet by individual
  facet_wrap(~ID, nrow= round(sqrt(n_distinct(df_plotting$ID)))) +
  # FIX: apply the complete theme first, then the tweak -- the original added
  # theme_light() after theme(), which replaced all theme elements and
  # silently discarded the axis-text colour setting
  theme_light() +
  theme(axis.text=element_text(colour="black"))
message("Warnings about 'non-finite' values for speed/step length plots are expected
and should refer to the first location for each individual (i.e. number of
non-finite values should be equal to number of individuals)")
# write the speed time-series figure to disk with the user-defined
# device/size/resolution settings from the section-9 user input
ggsave(filename = paste0(species_code, "_speed_timeseries_plot.", device),
       plot = speed_time_plot,
       device = device, path = out_path,
       width = 200, height = 175, units = units, dpi = dpi)
# create a histogram of point to point speeds
# can adjust binwidth and x limits manually
speed_hist <- df_plotting %>%
  ggplot(data=., aes(speed)) +
  geom_histogram(binwidth=0.1, alpha=0.7) + # can adjust binwidth to suit your needs
  geom_density(aes(y =0.1*after_stat(count))) + # density overlay scaled by binwidth to match histogram counts
  # add plot labels
  xlab("speed (m/s)") + ylab("count") +
  # facet by individual
  facet_wrap(~ID, nrow= round(sqrt(n_distinct(df_plotting$ID)))) +
  # FIX: complete theme first, then the tweak -- the original applied
  # theme_light() last, which discarded the axis-text colour setting
  theme_light() +
  theme(axis.text=element_text(colour="black"))
# save plot for further use
# (trailing comma removed: an empty trailing argument is fragile in R calls)
ggsave(plot = speed_hist,
       filename = paste0(species_code, "_speed_histogram.", device),
       device = device,
       path = out_path,
       units = units, width = 200, height = 175, dpi = dpi)
# create a time series plot of net displacement (distance from the first fix)
# -- the original comment said "step lengths", but the y aesthetic is netdisp
# faceted for each individual
step_time_plot <- df_plotting %>% # net displacement over time
  ggplot(data=., aes(x=days_elapsed, y=as.numeric(netdisp), group=ID)) +
  geom_line() +
  # add plot labels
  xlab("time elapsed (days)") + ylab("Distance from first fix (m)") +
  # facet by individual
  facet_wrap(~ID, nrow= round(sqrt(n_distinct(df_plotting$ID)))) +
  # FIX: complete theme first, then the tweak -- the original applied
  # theme_light() last, which discarded the axis-text colour setting
  theme_light() +
  theme(axis.text=element_text(colour="black"))
# save plot for further use
# (trailing comma removed: an empty trailing argument is fragile in R calls)
ggsave(plot = step_time_plot,
       filename = paste0(species_code, "_step_time_plot.", device),
       device = device,
       path = out_path,
       units = units, width = 200, height = 175, dpi = dpi)
# create a histogram of step lengths (point-to-point distances, dist column)
# can adjust binwidth and x limits manually
step_hist <- df_plotting %>% #step histogram
  ggplot(data=., aes(as.numeric(dist))) +
  geom_histogram(binwidth=5000, alpha=0.7) + # can adjust binwidth to suit your needs
  # add plot labels
  xlab("step length (m)") + ylab("count") +
  # facet by individual
  facet_wrap(~ID, nrow= round(sqrt(n_distinct(df_plotting$ID))))+
  # set plotting theme
  theme_light()
# save plot for further use
# (trailing comma removed: an empty trailing argument is fragile in R calls)
ggsave(plot = step_hist,
       filename = paste0(species_code, "_step_hist.", device),
       device = device,
       path = out_path,
       units = units, width = 200, height = 175, dpi = dpi)
## Remove intermediate files/objects if necessary to speed up processing
rm(list=ls()[!ls() %in% c("df_diagnostic", "species_code")]) #specify objects to keep
#----------------------------------------#
##10. Post processing: Optional steps ####