Download as pdf or txt
Download as pdf or txt
You are on page 1 of 4

HW4-skd

xian

2022-10-05

library(utils)
library(xml2)
library(rvest)
library(stringr)
library(dplyr)

##
## 载入程辑包: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

# Unpack the PubMed archive. untar() extracts into the current working
# directory by default, which is also where the XML files are listed below.
A <- untar("C:/Users/HP/Desktop/数科导 2022/skd-HW4/pubmed.tar.gz")
# untar() returns a status code (0 on success); surface failures instead of
# silently continuing with whatever files happen to be present.
if (!identical(as.integer(A), 0L)) {
  warning("untar() returned a non-zero status: ", A, call. = FALSE)
}

# Keep only files whose name really ends in ".xml". The pattern escapes the
# dot and anchors at the end, so companions such as "x.xml.gz" or
# "x.xml.md5" no longer produce phantom or duplicated "x.xml" entries.
xmlfile <- list.files(pattern = "\\.xml$")

# Accumulator for MeSH heading strings. It starts empty so that no
# artificial "" term ends up in the frequency table built from it.
MeshHeading <- character(0)

1
2 2

# Collect the text of every <MeshHeading> node from at most the first 20
# XML files. Capping at length(xmlfile) avoids read_xml(NA) when fewer than
# 20 files are available.
n_files <- min(20L, length(xmlfile))
heading_text <- lapply(xmlfile[seq_len(n_files)], function(path) {
  # xml_text() is what rvest::html_text() delegates to for these nodes.
  xml_text(xml_find_all(read_xml(path), xpath = "//MeshHeading"))
})
# Append everything in one step instead of growing the vector per file.
MeshHeading <- c(MeshHeading, unlist(heading_text, use.names = FALSE))

# Frequency table of heading strings. The variable name is kept so that the
# resulting data frame column is called "MeshHeading".
data1 <- as.data.frame(table(MeshHeading))
# Most frequent first; head() keeps at most 3000 rows and, unlike
# `[1:3000, ]`, does not pad with NA rows when fewer terms exist.
data1 <- data1[order(data1$Freq, decreasing = TRUE), ]
data1 <- head(data1, 3000)
head(data1$MeshHeading, 30)

## [1] Humans Female


## [3] Male Animals
## [5] Adult Middle Aged
## [7] Aged Young Adult
## [9] Adolescent Mice
## [11] Child Retrospective Studies
## [13] Treatment Outcome Aged, 80 and over
## [15] Risk Factors Child, Preschool
## [17] Infant Prospective Studies
## [19] Cross-Sectional Studies Pregnancy
## [21] Prognosis Rats
## [23] Time Factors Surveys and Questionnaires
## [25] Mice, Inbred C57BL Phylogeny
## [27] Pandemics Cohort Studies
## [29] Cell Line, Tumor Infant, Newborn
## 126839 Levels: ...

pubmed <- read_xml("pubmed20n1220.xml")
# Build one "document" per <MeshHeadingList>, i.e. the set of MeSH headings
# attached to one citation (assumes one list per article -- TODO confirm
# against the PubMed DTD). Treating every single <MeshHeading> as its own
# document makes the term-document matrix essentially one-hot, so the SVD
# embedding carries no co-occurrence signal.
heading_lists <- xml_find_all(pubmed, xpath = ".//MeshHeadingList")
# `a` is a list with one character vector of heading strings per document.
# Headings are NOT split on "," because terms such as "Aged, 80 and over"
# contain commas and must stay intact to match the vocabulary.
a <- lapply(heading_lists, function(node) {
  xml_text(xml_find_all(node, xpath = "./MeshHeading"))
})
2 3

# Vocabulary: the selected MeSH headings, as plain strings (the column in
# data1 is a factor).
terms <- as.character(data1$MeshHeading)

# Template column of the term-document matrix: one named zero per term.
# Its length follows the vocabulary instead of a hard-coded 3000.
lsa <- rep(0, length(terms))
names(lsa) <- terms

# Binary term-document matrix (terms x documents): entry is 1 when the
# document contains the term. vapply() guarantees a numeric matrix even for
# unusual inputs, where sapply() could return a list.
tdm <- vapply(a, function(x) {
  y <- lsa
  y[x[x %in% terms]] <- 1
  y
}, FUN.VALUE = lsa)

# Drop documents that contain none of the vocabulary terms; drop = FALSE
# keeps a matrix even if a single column survives.
tdm <- tdm[, colSums(tdm) > 0, drop = FALSE]
if (ncol(tdm) == 0L) {
  stop("No document contains any of the selected terms.", call. = FALSE)
}

decomp <- svd(tdm)
# Use the first 10 left singular vectors, or fewer if the matrix does not
# have that many.
n_dim <- min(10L, ncol(decomp$u))
# embed: one row per latent dimension, one column per term.
embed <- t(decomp$u[, seq_len(n_dim), drop = FALSE])
colnames(embed) <- terms
embed[seq_len(n_dim), seq_len(min(10L, ncol(embed)))]

## Humans Female Male Animals Adult Middle Aged Aged Young Adult Adolescent
## [1,] 1 0 0 0 0 0 0 0 0
## [2,] 0 1 0 0 0 0 0 0 0
## [3,] 0 0 1 0 0 0 0 0 0
## [4,] 0 0 0 1 0 0 0 0 0
## [5,] 0 0 0 0 0 0 1 0 0
## [6,] 0 0 0 0 0 1 0 0 0
## [7,] 0 0 0 0 1 0 0 0 0
## [8,] 0 0 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 0
## [10,] 0 0 0 0 0 0 0 1 0
## Mice
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 1
## [9,] 0
## [10,] 0
2 4

# Cosine similarity between a vector and every column of a matrix.
#
# a: numeric vector (or single-column matrix) of length nrow(B).
# B: numeric matrix whose columns are the vectors to compare against; a
#    plain vector is treated as a single column.
# Returns a 1 x ncol(B) matrix of cosine similarities. A zero-norm `a` or
# zero-norm column of B yields NaN for the affected entries.
cosv <- function(a, B) {
  B <- as.matrix(B)
  # Scale the query to unit length (as a 1 x k row vector).
  a_unit <- t(a) / sqrt(sum(a * a))
  # Column norms computed directly; diag(t(B) %*% B) would build the full
  # ncol(B) x ncol(B) Gram matrix just to read its diagonal.
  col_norms <- sqrt(colSums(B * B))
  (a_unit %*% B) / col_norms
}
# Plain numeric copy of the embedding (dimnames stripped), one column per
# term; the row count follows embed rather than a hard-coded 10.
B <- matrix(embed, nrow = nrow(embed))

# Look the query term up by its exact column name. Going through
# data.frame(embed)$Coronavirus would rewrite names with make.names() and
# allow partial matching by `$`, which can silently pick another column.
target <- "Coronavirus"
if (!target %in% colnames(embed)) {
  stop("Term '", target, "' is not among the embedded terms.", call. = FALSE)
}
a <- embed[, target]

# Cosine similarity of the query term against every term (1 x n matrix).
cor <- cosv(a, B)
colnames(cor) <- as.character(data1$MeshHeading)
# The 10 terms most similar to the query; the query itself is included.
data1$MeshHeading[head(order(cor[1, ], decreasing = TRUE), 10)]

## [1] Humans Female Male Animals Adult Middle Aged


## [7] Aged Young Adult Adolescent Mice
## 126839 Levels: ...

You might also like