Professional Documents
Culture Documents
数据科学导论 数据科学导论HW4
数据科学导论 数据科学导论HW4
xian
2022-10-05
library(utils)
library(xml2)
library(rvest)
library(stringr)
library(dplyr)
##
## 载入程辑包: 'dplyr'
# Unpack the PubMed archive. No `exdir` is given, so untar() extracts into the
# current working directory. `A` keeps untar()'s return value; it is not used
# by the statements that follow.
# NOTE(review): the archive path is machine-specific -- confirm before reuse.
A <- untar("C:/Users/HP/Desktop/数科导 2022/skd-HW4/pubmed.tar.gz")

# Names of the XML files in the working directory. The pattern is escaped and
# anchored so only names that END in ".xml" are kept; the earlier look-ahead
# `.*?(?=.xml)` treated the dot as a wildcard and also matched names such as
# "x.xml.md5".
xmlfile <- list.files(pattern = "\\.xml$")

# Accumulator for MeSH heading strings. It starts empty (not "") so that no
# spurious empty-string heading is counted when the vector is tabulated.
MeshHeading <- character(0)
1
2.2
# Read at most the first 20 XML files and append the text of every
# <MeshHeading> node to the running `MeshHeading` vector.
# head() copes with fewer than 20 files (a fixed 1:20 index would pass NA to
# read_xml()), and the per-file results are combined once instead of growing
# the vector on every iteration.
files_to_scan <- head(xmlfile, 20)
heading_text <- lapply(files_to_scan, function(path) {
  doc <- read_xml(path)
  html_text(xml_find_all(doc, xpath = "//MeshHeading"))
})
MeshHeading <- c(MeshHeading, unlist(heading_text, use.names = FALSE))
# Frequency table of the collected headings, most frequent first.
# order() is applied to the full table and head() then keeps at most 3000
# rows, so a vocabulary smaller than 3000 is returned as-is instead of being
# padded with all-NA rows (which `[1:3000, ]` would produce).
mesh_counts <- as.data.frame(table(MeshHeading))
mesh_counts <- mesh_counts[order(mesh_counts$Freq, decreasing = TRUE), ]
data1 <- head(mesh_counts, 3000)

# Show the 30 most frequent headings.
head(data1$MeshHeading, 30)
# Parse one PubMed file and pull the text of each <MeshHeading> node.
pubmed <- read_xml("pubmed20n1220.xml")
heading_nodes <- xml_find_all(
  pubmed,
  xpath = ".//MeshHeading",
  ns = xml_ns(pubmed)
)
a <- html_text(heading_nodes)

# Split every heading string on "," and keep the original string as the name
# of its list element, giving a named list of character vectors.
# NOTE(review): a heading that itself contains a comma is broken into pieces
# here -- confirm that this is intended.
a <- setNames(str_split(a, ","), a)
2.3
# Vocabulary: the headings kept in `data1`, as plain character strings.
vocab <- as.character(data1$MeshHeading)

# Zero-filled template with one named slot per vocabulary term. Its length is
# derived from the vocabulary rather than repeating the literal 3000.
lsa <- rep(0, length(vocab))
names(lsa) <- vocab

# Term-by-document indicator matrix: one column per element of `a`, with a 1
# in the row of every vocabulary term found in that element. vapply() pins
# the column type and length, and passing `lsa` as FUN.VALUE keeps the term
# names as row names.
tdm <- vapply(a, function(terms) {
  indicator <- lsa
  indicator[terms[terms %in% vocab]] <- 1
  indicator
}, FUN.VALUE = lsa)

# Drop columns that contain no vocabulary term; drop = FALSE keeps a matrix
# even when only one column survives.
tdm <- tdm[, colSums(tdm) > 0, drop = FALSE]

# Embedding from the leading left singular vectors (up to 10 of them):
# rows are latent dimensions, columns are vocabulary terms.
decomp <- svd(tdm)
n_dims <- min(10, ncol(decomp$u))
embed <- t(decomp$u[, seq_len(n_dims), drop = FALSE])
colnames(embed) <- vocab

# Preview the top-left corner (at most 10 x 10).
embed[seq_len(min(10, nrow(embed))), seq_len(min(10, ncol(embed)))]
## Humans Female Male Animals Adult Middle Aged Aged Young Adult Adolescent
## [1,] 1 0 0 0 0 0 0 0 0
## [2,] 0 1 0 0 0 0 0 0 0
## [3,] 0 0 1 0 0 0 0 0 0
## [4,] 0 0 0 1 0 0 0 0 0
## [5,] 0 0 0 0 0 0 1 0 0
## [6,] 0 0 0 0 0 1 0 0 0
## [7,] 0 0 0 0 1 0 0 0 0
## [8,] 0 0 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 0
## [10,] 0 0 0 0 0 0 0 1 0
## Mice
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 1
## [9,] 0
## [10,] 0
2.4
#' Cosine similarity between a vector and every column of a matrix.
#'
#' @param a Numeric vector of length d.
#' @param B Numeric matrix with d rows (a plain vector is treated as a single
#'   column).
#' @return A 1 x ncol(B) matrix whose j-th entry is the cosine of the angle
#'   between `a` and column j of `B`. A zero-length `a` or zero column gives
#'   NaN, because the corresponding norm is 0.
cosv <- function(a, B) {
  B <- as.matrix(B)
  unit_a <- a / sqrt(sum(a * a))
  # Column norms via colSums(); this avoids forming the full
  # ncol(B) x ncol(B) product t(B) %*% B only to read its diagonal.
  col_norms <- sqrt(colSums(B * B))
  (t(unit_a) %*% B) / col_norms
}
# Embedding matrix without dimnames (latent dimensions x terms).
B <- unname(embed)

# Embedding column of the query term, selected by exact column name.
# Going through data.frame(embed)$Coronavirus would pass the names through
# make.names() and allow `$` partial matching, so a missing term could
# silently yield NULL or another column.
if (!"Coronavirus" %in% colnames(embed)) {
  stop("'Coronavirus' is not among the embedded headings", call. = FALSE)
}
a <- unname(embed[, "Coronavirus"])

# Cosine similarity of the query term to every term, labelled by heading.
# NOTE(review): `cor` shadows the name of stats::cor as a variable; calls to
# cor() still resolve to the function.
cor <- cosv(a, B)
colnames(cor) <- data1$MeshHeading

# Ten headings most similar to the query (fewer if the vocabulary is smaller).
data1$MeshHeading[head(order(cor[1, ], decreasing = TRUE), 10)]