Talk:2020年中国围棋甲级联赛
将新浪棋牌比赛结果转换为维基格式的R程序(作者Robin Lu):
# Load libraries ----
# rvest supplies the HTML helpers used below (read_html, html_nodes,
# html_text); readr supplies write_file.
library(rvest)
library(readr)
# ASSUMPTIONS:
# - each table has 5 rows (1 header row and 4 match rows)
# - main text of the website is in the div whose id is artibody
# - title of the article is of class main-title
# - date is followed by \UFF0C (fullwidth comma)
# - 台次 (board label) is surrounded by \UFF08 (fullwidth left parenthesis) and \UFF09 (fullwidth right parenthesis)
# - each row with match info is preceded by 3 characters: "#台:"
# Input ----
# Presumably a deliberate guard: the script aborts here until the two inputs
# below have been edited for the round being converted. Remove or comment
# out this line once they are set.
stop("Change inputs")

# URL of the article that holds the match results.
theurl <- "https://sports.sina.com.cn/go/2020-12-13/doc-iiznctke6296498.shtml"

# Path of the text file that receives the generated wiki markup.
resultFile <- "test10.txt"
# Harvest website ----
# Download and parse the article page.
website <- read_html(theurl)

# Text of every <p> that follows a <div> sibling inside the element with id
# "artibody", with leading/trailing horizontal and vertical whitespace
# stripped.
par <- trimws(
  html_text(html_nodes(website, "#artibody div ~ p")),
  which = "both",
  whitespace = "[\\h\\v]"
)

# Text of the element(s) carrying the class "main-title".
artTitle <- html_text(html_nodes(website, ".main-title"))
# Table paragraphs: everything except the first paragraph (used for the date
# below) and the last one.
tableInfo <- par[-c(1, length(par))]

# Convert the source punctuation into wiki cell separators:
#   fullwidth "(" -> "||", fullwidth ")" -> removed, space -> "||".
# All three are literal (fixed) replacements, not regular expressions.
tableInfo <- gsub("\UFF08", "||", tableInfo, fixed = TRUE)
tableInfo <- gsub("\UFF09", "", tableInfo, fixed = TRUE)
tableInfo <- gsub(" ", "||", tableInfo, fixed = TRUE)
# The date is the part of the first paragraph before the first fullwidth
# comma.
artDate <- unlist(strsplit(par[1], "\UFF0C"))[1]

# Header line: the date followed by a <ref> citation linking to the source
# article. NOTE(review): the page rendering had turned the <ref>...</ref>
# element into "[1]" and displaced its body (the `[", theurl, " ", artTitle,
# "]` fragment) to a footnote at the end of the page; this restores the
# string as it must have been written. Confirm against the author's copy.
header <- paste0(artDate, "<ref>[", theurl, " ", artTitle, "]</ref>\n")

# Accumulator for all output pieces; the table rows are appended later.
result <- c(header)
# Wiki markup fragments used to lay the result tables out in columns.
#
# NOTE(review): the page rendering swallowed the markup inside colBegin and
# colEnd and the entire colMidBeg definition (the loop below uses colMidBeg,
# and "{|class=\"wikitable\"" has to be emitted once per table, not once per
# row of tables). The column templates below are a reconstruction inferred
# from the variable names and the three-tables-per-row layout -- presumably
# {{col-begin}} / {{col-3}} / {{col-end}}; confirm against the author's
# original script before relying on the exact markup.

# Opens one row of side-by-side tables.
colBegin <- "{{col-begin}}\n"

# Opens one column and starts the wikitable inside it.
colMidBeg <- "{{col-3}}\n{|class=\"wikitable\"\n"

# Closes one wikitable.
colMidEnd <- "|}\n"

# Closes the row of side-by-side tables.
colEnd <- "{{col-end}}\n"

# Wiki table row separator.
rowSep <- "|-\n"
# Layout of tableInfo: each table occupies `tableStride` consecutive entries
# (entry 1 = header line, entries 2-5 = the four match rows; the sixth entry
# is never read -- presumably a separator paragraph, confirm against the
# page). Tables are emitted `tablesPerRow` side by side.
tablesPerRow <- 3
tableStride <- 6
matchRows <- 4
# Each match row starts with a 3-character "#台:" prefix that is dropped.
prefixLen <- 3

nObj <- length(tableInfo)
# Number of rows of tables. Derived from nObj (not length(par), which is two
# larger) so that no empty trailing row wrapper is emitted.
nColObjs <- ceiling(nObj / tableStride / tablesPerRow)

# One output string per row of tables; filled in below.
rowBlocks <- character(nColObjs)

# loop for each row of 3 tables
for (i in seq_len(nColObjs)) {
  currCol <- colBegin

  # loop for each table in row
  for (j in seq_len(tablesPerRow)) {
    start <- ((i - 1) * tablesPerRow + (j - 1)) * tableStride + 1
    if (start > nObj) {
      next
    }

    # Header row: cell separators become header separators, plus the
    # trailing board-number column heading.
    headerRow <- paste0(
      "!", gsub("||", "!!", tableInfo[start], fixed = TRUE), "!!台次\n"
    )

    # Match rows that actually exist; indexing past the end of tableInfo
    # would otherwise yield NA and print literal "|NA" rows.
    rowIdx <- start + seq_len(matchRows)
    rowIdx <- rowIdx[rowIdx <= nObj]
    matchText <- substring(tableInfo[rowIdx], prefixLen + 1)
    matchLines <- if (length(matchText) > 0) {
      paste0(rowSep, "|", matchText, "\n", collapse = "")
    } else {
      # paste0() would recycle a zero-length vector to "", producing a
      # spurious empty row, so handle the no-rows case explicitly.
      ""
    }

    currCol <- paste0(currCol, colMidBeg, headerRow, matchLines, colMidEnd)
  }

  rowBlocks[i] <- paste0(currCol, colEnd)
}

result <- c(result, rowBlocks)
# Join all output pieces into a single string and write it to resultFile.
toprint <- paste(result, collapse = "")
write_file(toprint, resultFile)
—以上未簽名的留言由Kelu(對話|貢獻)於2021年1月1日 (五) 14:01 (UTC)加入。
- ^ [", theurl, " ", artTitle, "]