Reputation: 401
I have so many(about 1,000) xlsx like the picture above. And I want to read every xlsx and get the data of every candatate's name, number and age. But I don't know how to read this unregular xlsx?
Upvotes: 1
Views: 60
Reputation: 20362
You can run the VBA code in Excel, and convert any number of XLSX files into CSV files. Then loop through all CSV files to merge all into a Data Frame in R.
Sub Convert_Excel_To_CSV()
Dim MyPath As String, FilesInPath As String
Dim MyFiles() As String, Fnum As Long
Dim mybook As Workbook
Dim CalcMode As Long
Dim sh As Worksheet
Dim ErrorYes As Boolean
Dim LPosition As Integer
'Fill in the path\folder where the Excel files are
MyPath = "C:\Users\Ryan\Desktop\Excel_Files\"
FilesInPath = Dir(MyPath & "*.xlsx*")
If FilesInPath = "" Then
MsgBox "No files found"
Exit Sub
End If
Fnum = 0
Do While FilesInPath <> ""
Fnum = Fnum + 1
ReDim Preserve MyFiles(1 To Fnum)
MyFiles(Fnum) = FilesInPath
FilesInPath = Dir()
Loop
With Application
CalcMode = .Calculation
.Calculation = xlCalculationManual
.ScreenUpdating = False
.EnableEvents = False
End With
If Fnum > 0 Then
For Fnum = LBound(MyFiles) To UBound(MyFiles)
Set mybook = Nothing
On Error Resume Next
Set mybook = Workbooks.Open(MyPath & MyFiles(Fnum))
On Error GoTo 0
If Not mybook Is Nothing Then
LPosition = InStr(1, mybook.Name, ".") - 1
mybookname = Left(mybook.Name, LPosition)
mybook.Activate
'All XLSX Files get saved in the directory below:
ActiveWorkbook.SaveAs Filename:="C:\your_path_here\" & mybookname & ".csv" _
, FileFormat:=xlCSVMSDOS, _
CreateBackup:=False
End If
mybook.Close SaveChanges:=False
Next Fnum
End If
If ErrorYes = True Then
MsgBox "There are problems in one or more files, possible problem:" _
& vbNewLine & "protected workbook/sheet or a sheet/range that not exist"
End If
With Application
.ScreenUpdating = True
.EnableEvents = True
.Calculation = CalcMode
End With
End Sub
setwd("C:/your_path")
fnames <- list.files()
csv <- lapply(fnames, read.csv)
result <- do.call(rbind, csv)
******** ******** ******** ******** ******** ******** ******** ********
filedir <- setwd("C:/your_path")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind,lapply(file_names,read.csv))
******** ******** ******** ******** ******** ******** ******** ********
filedir <- setwd("C:/your_path")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind, lapply(file_names, read.csv, skip = 1, header = FALSE))
******** ******** ******** ******** ******** ******** ******** ********
filedir <- setwd("C:/your_path")
file_names <- dir(filedir)
your_data_frame <- do.call(rbind, lapply(file_names, read.csv, header = FALSE))
******** ******** ******** ******** ******** ******** ******** ********
#
temp <- setwd("C:/your_path")
temp = list.files(pattern="*.csv")
myfiles = lapply(temp, read.delim)
Upvotes: 1
Reputation: 522752
I don't know if any R Excel API is smart enough to handle your column formatting, but there is an easy workaround. You can just save the above worksheet in CSV format. Doing this for the data you showed above left me with the following three CSV lines:
Title,,,,,
name,mike,number,123214,age,28
,score,,ddd,aaa,bbb
You can try the following code:
df <- read.csv(file="path/to/your/file.csv", header=FALSE)
df <- df[2:nrow(df), ] # drop first row
To get the name, number, and age for Mike:
name <- df[1, 2]
number <- df[1, 4]
age <- df[1, 6]
Upvotes: 4