# Work from the data folder and create the "har" sub-folder on first run.
setwd("D:/~/Data/")
if (!file.exists("har")) {
  dir.create("har")
}
Import & Export
Directory
set directory
Let’s start at the beginning:
- In most cases when we are starting out a project we start by going to the directory, or
- start by creating a new directory for that specific project
- here we are going to create a new directory to import the data we need
get directory
If the files have already been imported and we need to work in a specific directory, we call getwd() to confirm that we are in the correct working directory. If we are not, we use setwd() to switch to the correct one.
# Print the current working directory to confirm where we are.
getwd()
Import
download.file
- download.file() will download any file, regardless of whether it’s a csv, xls, or another format
- we’ve already created the directory we’ll use
- let’s say we have to download a .zip file from a site
- set a time marker dateDownloaded so you can always tell which version of the data you are working on in the event the data gets updated
# Source archive (UCI HAR dataset); the URL must stay on a single line.
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
download.file(fileUrl, destfile = "zipped.zip", method = "curl")
# Record when the download happened so the data version can be identified later.
dateDownloaded <- date()
dateDownloaded # you can always print out date() without saving it
Unzip
- In the event that you want to unzip an entire folder
- without seeing the list of files
- or if you already have seen it as described in the section below
# Extract every file in the archive into har/unzipped.
unzip(zipfile = "zipped.zip", exdir = "D:/~/Data/har/unzipped")
Load
RDS
- You can read these files directly but I tend to break the code down into two parts
# Step 1: create a connection to each .rds file.
con1 <- file("D:/Education/R/Data/EPA/summarySCC_PM25.rds")
con2 <- file("D:/Education/R/Data/EPA/Source_Classification_Code.rds")
# Step 2: read each object through its connection.
NEI <- readRDS(con1)
SCC <- readRDS(con2)
Zipped
zipped .bz2
- A zipped .bz2 file can be read directly with read.csv
# read.csv() decompresses the .bz2 file on the fly; the path must be one unbroken string.
storm_data <- read.csv(
  "D:/Education/R/Data/JH_C5_week2/repdata_data_StormData.csv.bz2",
  header = TRUE
)
- Continuing with the example above “zipped.zip”, at times the zipped folder contains many files
- you can list the files within the zipped folder prior to unzipping it
- reason being: if you only need 1 or 2 files and not an entire large dataset you can read those files specifically
List files
zipped
- You can list all the files in the zipped folder using the same command used to unzip them (unzip), but with list=TRUE
# list = TRUE returns the archive's table of contents without extracting anything.
all_files <- unzip("zipped.zip", list = TRUE)
directory
- If you want to read a long list of files from a directory
- assign the list to all_files
# full.names = TRUE keeps the "har/" prefix so each entry can be passed
# straight to a reader without changing the working directory first.
all_files <- list.files("har", full.names = TRUE)
File List
lapply
- If you have a list of wanted files that you chose from above, or possibly all_files in a directory
- you can use lapply to scan through the list and read them
- lapply will give the output in a list, so it will output all the files in a list of dfs one for each file in the list
# Read every file in all_files; the result is a list with one data frame per file.
dataIn <- lapply(all_files, read.csv)
read.table
- refer to Basics - In & Out
- as handy as read.table is it has some drawbacks
- one major one is that it reads the data into RAM, so large sets might cause issues
- you can always substitute read.csv, or read_csv from the readr package
# Read the activity labels file into a data frame.
labelfile <- read.table("D:/~/har/activity_labels.txt")
read.csv
# Read the csv file into a data frame.
pm0 <- read.csv("D:/yourdataiq/dataiq/datasets/pm0.csv")
readLines
- used for .txt files instead of read.table
# Read the text file line by line into a character vector.
cnames <- readLines("D:/yourdataiq/dataiq/datasets/cnames.txt")
Function
- What if you want the user to input the directory, file name, and extension
- create a function that does just that
- sometimes it’s just easier to write the code directly, but coding is to make our life easier so here is such a function
- quarto doesn’t work with a function to read the files, as it cannot establish a connection, but in an R script it works (see how_to_merge)
# Read one file from the HAR data folder into a data frame.
#
# directory : sub-folder under base_dir (e.g. "test")
# name      : file name without its extension (e.g. "subject_test")
# extension : file extension including the dot (e.g. ".txt")
# base_dir  : root data folder; defaults to the folder used in these notes
#
# Returns the data frame produced by read.table().
loadfile_to_table <- function(directory, name, extension,
                              base_dir = "D:/~/Data/har") {
  # Build the path from base_dir directly. setwd() returns the PREVIOUS
  # working directory, so using its result as the root points at the wrong
  # folder and also changes the caller's working directory as a side effect.
  wantedfile <- file.path(base_dir, directory, paste0(name, extension),
                          fsep = "/")
  read.table(wantedfile)
}
- then you just call it using
subject_test <- loadfile_to_table("test", "subject_test", ".txt")
Save
File Output
.txt & .csv
- I’ll save both files in .txt and .csv formats
- Verify the files were saved in the correct directory
- Confirm operation with a timestamp
library(readr)
# Write each output only if it does not already exist, so earlier results
# are never overwritten.
if (!file.exists("har/meanPerSubject.csv")) {
  write_csv(persubfile, "har/meanPerSubject.csv")
}

#______Save in txt format as well using both write.table & write_csv
if (!file.exists("har/meanPerSubject.txt")) {
  write.table(persubfile, "har/meanPerSubject.txt")
}
if (!file.exists("har/meanPerActivity.txt")) {
  write_csv(peractivityfile, "har/meanPerActivity.txt")
}

# Verify the files landed in the right folder, then timestamp the operation.
list.files("har")
dateUploaded <- date()
png Output
save png
- We can save a plot as a png with exact dimensions given
- Here we first process the data
- Set the png() function and parameters
- Plot the graph, which will automatically save it into a png
- The .png file will not be available to display until we turn the graphics device off with dev.off()
# Attach the total emissions of each year to every row of that year.
emm_year <- NEI |>
  group_by(year) |>
  mutate(Emm_per_year = sum(Emissions))

# Open the png device with exact pixel dimensions; everything plotted from
# here until dev.off() is drawn into the file rather than on screen.
png(filename = "D:/yourdataiq/dataiq/images/plot1.png",
    width = 480, height = 480, units = "px")
with(emm_year,
     plot(year, Emm_per_year, type = "l", col = "green",
          lwd = 2, ylab = "totalPM2.5 emmission"))
# Close the device so the file is flushed to disk.
dev.off()