MEXT, Ministry of Education,Culture,Sports,Science & Technology in Japan has published PDF files called ‘Standard Tables of Food Composition in Japan 2010’. I’m interest in those files, so I have converted them to text file.
Standard Tables of Food Composition in Japan 2010
The procedure is as follows:
- Download PDF files
- Open a file, Copy all text
- Paste to EXCEL
- Process with VBA
1. Download PDF files
Download the PDF files on the above link. Top of the list is 1299012_1. Last of the list is 1299012_18.
2. Open a file, Copy all text
Open a file with Adobe Reader. In order to select all text in document, you have to select ‘View’ -> ‘Page view’ -> ‘NOT Single page’. Select All, copy.
3. Paste to EXCEL
‘Paste option’ -> ‘Use text file wizard’. Select option ‘The data field separated by delimiters such as comma or tab’. Change option data type of the first column to ‘String’, click ‘Finish’.
4. Process with VBA
Press ‘Alt’ key and ‘F11’ key to launch VBE. ‘Insert’ -> ‘Standard Module’. Paste the code below.
Option Explicit
Sub Procedure1()
Dim mySht As Worksheet
Dim myAr1 As Variant
Dim myAr2 As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim myAr(1878, 53) As String
Dim RegEx1 As Object
Dim Match1 As Object
Dim Matches1 As Object
Dim strPtn1 As String
Dim tempStr As String
Dim RegEx2 As Object
Dim Match2 As Object
Dim Matches2 As Object
Dim strPtn2 As String
Set mySht = ActiveSheet
Set myAr1 = Application.Intersect(mySht.UsedRange, mySht.Range("A:B"))
Set myAr2 = mySht.UsedRange
Set RegEx1 = CreateObject("VBScript.RegExp")
Set RegEx2 = CreateObject("VBScript.RegExp")
strPtn2 = "^[0-9]+(\.[0-9]*)?"
i = 0
For k = 1 To myAr1.Rows.Count
If Len(myAr1.Cells(k, 1).Value) = 5 And _
myAr1.Cells(k, 1).Value >= "01001" And _
myAr1.Cells(k, 1).Value <= "18016" And _
myAr1.Cells(k, 2) <> "(欠番)" Then
myAr(i, 0) = myAr2.Cells(k, 1)
strPtn1 = "^[0-9]+$"
RegEx1.Pattern = strPtn1
RegEx1.IgnoreCase = True
RegEx1.Global = True
Set Matches1 = RegEx1.Execute(myAr2.Cells(k, 2))
If Matches1.Count > 0 Then
tempStr = myAr2.Cells(k, 2) & myAr2.Cells(k, 3)
strPtn1 = "[^0-9][0-9]+$"
RegEx1.Pattern = strPtn1
RegEx1.IgnoreCase = True
RegEx1.Global = True
Set Matches1 = RegEx1.Execute(tempStr)
If Matches1.Count > 0 Then
Set Match1 = Matches1.Item(Matches1.Count - 1)
myAr(i, 1) = Left(tempStr, Match1.firstindex + 1)
myAr(i, 2) = Mid(tempStr, Match1.firstindex + 2)
For j = 3 To 52
Select Case True
Case myAr2.Cells(k, j + 1).Value = "-"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "Tr"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(Tr)"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(0)"
myAr(i, j) = 0
Case Else
myAr(i, j) = myAr2.Cells(k, j + 1).Value
End Select
Next j
RegEx2.Pattern = strPtn2
RegEx2.IgnoreCase = True
RegEx2.Global = True
Set Matches2 = RegEx2.Execute(myAr2.Cells(k, 54))
On Error Resume Next
Set Match2 = Matches2.Item(0)
On Error GoTo 0
myAr(i, 53) = Match2.Value
Else
For j = 1 To 52
Select Case True
Case myAr2.Cells(k, j + 1).Value = "-"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "Tr"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(Tr)"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(0)"
myAr(i, j) = 0
Case Else
myAr(i, j) = myAr2.Cells(k, j + 1).Value
End Select
Next j
RegEx2.Pattern = strPtn2
RegEx2.IgnoreCase = True
RegEx2.Global = True
Set Matches2 = RegEx2.Execute(myAr2.Cells(k, 54))
On Error Resume Next
Set Match2 = Matches2.Item(0)
On Error GoTo 0
myAr(i, 53) = Match2.Value
End If
Else
strPtn1 = "[^0-9][0-9]+$"
RegEx1.Pattern = strPtn1
RegEx1.IgnoreCase = True
RegEx1.Global = True
Set Matches1 = RegEx1.Execute(myAr2.Cells(k, 2))
If Matches1.Count > 0 Then
Set Match1 = Matches1.Item(Matches1.Count - 1)
myAr(i, 1) = Left(myAr2.Cells(k, 2), Match1.firstindex + 1)
myAr(i, 2) = Mid(myAr2.Cells(k, 2), Match1.firstindex + 2)
For j = 3 To 52
Select Case True
Case myAr2.Cells(k, j).Value = "-"
myAr(i, j) = 0
Case myAr2.Cells(k, j).Value = "Tr"
myAr(i, j) = 0
Case myAr2.Cells(k, j).Value = "(Tr)"
myAr(i, j) = 0
Case myAr2.Cells(k, j).Value = "(0)"
myAr(i, j) = 0
Case Else
myAr(i, j) = myAr2.Cells(k, j).Value
End Select
Next j
RegEx2.Pattern = strPtn2
RegEx2.IgnoreCase = True
RegEx2.Global = True
Set Matches2 = RegEx2.Execute(myAr2.Cells(k, 53))
On Error Resume Next
Set Match2 = Matches2.Item(0)
On Error GoTo 0
myAr(i, 53) = Match2.Value
Else
For j = 1 To 52
Select Case True
Case myAr2.Cells(k, j + 1).Value = "-"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "Tr"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(Tr)"
myAr(i, j) = 0
Case myAr2.Cells(k, j + 1).Value = "(0)"
myAr(i, j) = 0
Case Else
myAr(i, j) = myAr2.Cells(k, j + 1).Value
End Select
Next j
RegEx2.Pattern = strPtn2
RegEx2.IgnoreCase = True
RegEx2.Global = True
Set Matches2 = RegEx2.Execute(myAr2.Cells(k, 54))
On Error Resume Next
Set Match2 = Matches2.Item(0)
On Error GoTo 0
myAr(i, 53) = Match2.Value
End If
End If
Else
i = i - 1
End If
i = i + 1
Next k
Set mySht = Worksheets.Add
With mySht
.Name = myAr2.Cells(1, 1).Value & myAr2.Cells(1, 2).Value
.Range("A1:BB1878").Value = myAr
End With
Set mySht = Nothing
Set myAr1 = Nothing
Set myAr2 = Nothing
Set RegEx1 = Nothing
Set Match1 = Nothing
Set Matches1 = Nothing
Set RegEx2 = Nothing
Set Match2 = Nothing
Set Matches2 = Nothing
End Sub
Please delete worksheets except for worksheets which above code added. Run following code:
Sub AllSheets_to_TextFile()
Dim myBook As Workbook
Dim mySht As Worksheet
Dim tmpSht As Worksheet
Dim myRng As Range
Dim myAr(1877, 53) As String
Dim tempAr As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim GOF As Variant
Dim RegExp As Object
Dim Matches As Object
Dim Match As Object
Dim strPtn As String
Set RegExp = CreateObject("VBScript.RegExp")
strPtn = "\\"
k = 0
For Each tmpSht In Worksheets
Set myRng = tmpSht.Range("A1").CurrentRegion
tempAr = myRng
For i = LBound(tempAr) To UBound(tempAr)
For j = LBound(tempAr, 2) To UBound(tempAr, 2)
myAr(k, j - 1) = tempAr(i, j)
Next j
k = k + 1
Next i
Next tmpSht
GOF = Application.GetOpenFilename(FileFilter:="PDF file,*.pdf", _
Title:="Select PDF file", _
MultiSelect:=False)
If TypeName(GOF) = "Boolean" Then Exit Sub
GOF = Left(GOF, Len(GOF) - 4) & ".txt"
With RegExp
.Pattern = strPtn
.IgnoreCase = True
.Global = True
End With
Set Matches = RegExp.Execute(GOF)
GOF = Left(GOF, Matches.Item(Matches.Count - 1).FirstIndex) & "\M_FOODS.txt"
Set mySht = Worksheets.Add
With mySht
.Name = "M_FOODS"
.Range("A1:BB1878") = myAr
.Move
End With
ActiveWorkbook.SaveAs Filename:=GOF, _
FileFormat:=xlText, _
CreateBackup:=False
Set myBook = ActiveWorkbook
Application.DisplayAlerts = False
myBook.Close
Application.DisplayAlerts = True
Set Match = Nothing
Set Matches = Nothing
Set RegExp = Nothing
Set myRng = Nothing
Set mySht = Nothing
Set tmpSht = Nothing
Set myBook = Nothing
End Sub
I can not upload the text file because of copyright. Please contact MEXT about legal issues.